org.apache.avro.generic.GenericRecord Scala Examples

The following examples show how to use org.apache.avro.generic.GenericRecord. Each example is taken from an open-source Scala project; the source file, originating project, and license are noted above the code.
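
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the "User" schema and field names are purely illustrative) of the core GenericRecord workflow the examples build on: define a schema, populate a GenericData.Record, and round-trip it through Avro binary encoding.

import java.io.ByteArrayOutputStream

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object GenericRecordRoundTrip extends App {
  // Build a record schema programmatically (equivalent to parsing the JSON schema definition).
  val schema = SchemaBuilder
    .record("User").namespace("example")
    .fields()
    .requiredString("name")
    .requiredInt("age")
    .endRecord()

  // GenericData.Record is the standard mutable GenericRecord implementation.
  val user: GenericRecord = new GenericData.Record(schema)
  user.put("name", "jane")
  user.put("age", 42)

  // Serialize to Avro binary with a GenericDatumWriter.
  val out = new ByteArrayOutputStream()
  val writer = new GenericDatumWriter[GenericRecord](schema)
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  writer.write(user, encoder)
  encoder.flush()

  // Deserialize back into a GenericRecord with a GenericDatumReader and read fields by name.
  val reader = new GenericDatumReader[GenericRecord](schema)
  val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
  val decoded = reader.read(null, decoder)
  println(decoded.get("name")) // Avro strings decode as org.apache.avro.util.Utf8
  println(decoded.get("age"))  // 42
}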
Example 1
Source File: AvroParquetSourceTest.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
} 
Example 2
Source File: FieldMapperEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{Encoder, SchemaFor, SnakeCase}
import org.apache.avro.generic.GenericRecord
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class FieldMapperEncoderTest extends AnyFunSuite with Matchers {

  test("adding an in scope FieldMapper should overide the fields in an encoder") {
    implicit val fieldMapper = SnakeCase
    val schema: SchemaFor[NamingTest] = SchemaFor[NamingTest]
    val encoder = Encoder[NamingTest]
    val record = encoder.encode(NamingTest("Foo")).asInstanceOf[GenericRecord]
    record.get("camel_case")
  }

}

case class NamingTest(camelCase: String) 
Example 3
Source File: Codecs.scala    From embedded-kafka-schema-registry   with MIT License
package net.manub.embeddedkafka.schemaregistry.avro

import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

@deprecated(
  "Avro-related classes will be removed soon",
  since = "5.5.0"
)
object Codecs {
  implicit def stringKeyAvroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, V) =
    cr => (cr.key, cr.value)
  implicit def avroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => V =
    _.value
  implicit def stringKeyAvroValueTopicCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, String, V) =
    cr => (cr.topic, cr.key, cr.value)

  implicit def stringKeyGenericValueCrDecoder
      : ConsumerRecord[String, GenericRecord] => (String, GenericRecord) =
    cr => (cr.key, cr.value)

  implicit def genericKeyGenericValueCrDecoder
      : ConsumerRecord[GenericRecord, GenericRecord] => (
          GenericRecord,
          GenericRecord
      ) =
    cr => (cr.key, cr.value)
} 
Example 4
Source File: AvroSerdes.scala    From embedded-kafka-schema-registry   with MIT License
package net.manub.embeddedkafka.schemaregistry.avro

import io.confluent.kafka.serializers.{
  AbstractKafkaSchemaSerDeConfig,
  KafkaAvroDeserializerConfig,
  KafkaAvroDeserializer => ConfluentKafkaAvroDeserializer,
  KafkaAvroSerializer => ConfluentKafkaAvroSerializer
}
import net.manub.embeddedkafka.schemaregistry.EmbeddedKafkaConfig
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecord
import org.apache.kafka.common.serialization.{Serde, Serdes}

import scala.jdk.CollectionConverters._

@deprecated(
  "Avro-related classes will be removed soon",
  since = "5.5.0"
)
object AvroSerdes {

  protected def configForSchemaRegistry(
      implicit config: EmbeddedKafkaConfig
  ): Map[String, Object] =
    Map(
      AbstractKafkaSchemaSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> s"http://localhost:${config.schemaRegistryPort}"
    )

  protected def specificAvroReaderConfigForSchemaRegistry(
      implicit config: EmbeddedKafkaConfig
  ): Map[String, Object] =
    configForSchemaRegistry ++ Map(
      KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG -> true.toString
    )

  def specific[T <: SpecificRecord](
      isKey: Boolean = false,
      extraConfig: Map[String, Object] = Map.empty
  )(
      implicit config: EmbeddedKafkaConfig
  ): Serde[T] =
    serdeFrom[T](
      configForSchemaRegistry ++ extraConfig,
      specificAvroReaderConfigForSchemaRegistry ++ extraConfig, //need this to support SpecificRecord
      isKey
    )

  def generic(
      isKey: Boolean = false,
      extraConfig: Map[String, Object] = Map.empty
  )(
      implicit config: EmbeddedKafkaConfig
  ): Serde[GenericRecord] =
    serdeFrom[GenericRecord](
      configForSchemaRegistry ++ extraConfig,
      configForSchemaRegistry ++ extraConfig,
      isKey
    )

  private def serdeFrom[T](
      serConfig: Map[String, Object],
      deserConfig: Map[String, Object],
      isKey: Boolean
  ): Serde[T] = {
    val ser = new ConfluentKafkaAvroSerializer
    ser.configure(serConfig.asJava, isKey)
    val deser = new ConfluentKafkaAvroDeserializer
    deser.configure(deserConfig.asJava, isKey)

    Serdes.serdeFrom(ser, deser).asInstanceOf[Serde[T]]
  }
} 
Example 5
Source File: AvroCodecsSpecification.scala    From kafka-scala-api   with Apache License 2.0
package com.example.avro

import org.scalatest._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

class GenericAvroCodecsSpecification extends WordSpec with Matchers {
  val testSchema = new Schema.Parser().parse("""{
                                                   "type":"record",
                                                   "name":"FiscalRecord",
                                                   "namespace":"avro",
                                                   "fields":[
                                                      {
                                                         "name":"calendarDate",
                                                         "type":"string"
                                                      },
                                                      {
                                                         "name":"fiscalWeek",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      },
                                                      {
                                                         "name":"fiscalYear",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      }
                                                   ]
                                                }""")

  "Generic Avro codec" should {

    "Round trip generic record using Generic Injection" in {
      implicit val genericInjection = GenericAvroCodecs[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Binary Injection" in {
      implicit val genericBinaryInjection = GenericAvroCodecs.toBinary[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Json Injection" in {
      implicit val genericJsonInjection = GenericAvroCodecs.toJson[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val jsonString = Injection[GenericRecord, String](testRecord)
      val attempt = Injection.invert[GenericRecord, String](jsonString)
      assert(attempt.get == testRecord)
    }
  }

  def buildGenericAvroRecord(i: (String, Int, Int)): GenericRecord = {

    val fiscalRecord = new GenericData.Record(testSchema)
    fiscalRecord.put("calendarDate", i._1)
    fiscalRecord.put("fiscalWeek", i._2)
    fiscalRecord.put("fiscalYear", i._3)
    fiscalRecord
  }
} 
Example 6
Source File: AvroSerializer.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.bloomberg.avro

import java.io.ByteArrayOutputStream

import com.datamountaineer.streamreactor.connect.bloomberg.BloombergData
import com.datamountaineer.streamreactor.connect.bloomberg.avro.AvroSchemaGenerator._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory

import scala.collection.JavaConverters._

object AvroSerializer {

  
    // Recursively copies `value` into `record` under `fieldName`, using the field's schema to
    // handle nested lists and maps (maps become nested GenericData.Record instances). Other
    // members of the original object are elided in this excerpt.
    private def recursive(record: GenericData.Record, schema: Schema, fieldName: String, value: Any): Unit = {
      value match {
        case _: Boolean => record.put(fieldName, value)
        case _: Int => record.put(fieldName, value)
        case _: Long => record.put(fieldName, value)
        case _: Double => record.put(fieldName, value)
        case _: Char => record.put(fieldName, value)
        case _: Float => record.put(fieldName, value)
        case _: String =>
          record.put(fieldName, value)
        case list: java.util.List[_] =>
          val tmpSchema = schema.getField(fieldName).schema()
          val itemSchema = if (tmpSchema.getType == Schema.Type.UNION) tmpSchema.getTypes.get(1) else tmpSchema
          require(itemSchema.getType == Schema.Type.ARRAY)
          //we might have a record not a primitive
          if (itemSchema.getElementType.getType == Schema.Type.RECORD) {
            val items = new GenericData.Array[GenericData.Record](list.size(), itemSchema)
            list.asScala.foreach { i =>
              //only map is allowed
              val m = i.asInstanceOf[java.util.Map[String, Any]]
              items.add(m.toAvroRecord(itemSchema.getElementType))
            }
            record.put(fieldName, items)
          } else {
            val items = new GenericData.Array[Any](list.size(), itemSchema)
            items.addAll(list)
            record.put(fieldName, items)
          }

        case map: java.util.LinkedHashMap[String @unchecked, _] =>
          //record schema
          val fieldSchema = schema.getField(fieldName).schema()
          val nestedSchema = if (fieldSchema.getType == Schema.Type.UNION) fieldSchema.getTypes.get(1) else fieldSchema
          val nestedRecord = new Record(nestedSchema)
          map.entrySet().asScala.foreach(e =>
            recursive(nestedRecord, nestedSchema, e.getKey, e.getValue))
          record.put(fieldName, nestedRecord)
      }
    }
}
Example 7
Source File: AvroRecordRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar {
  val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema)

  "AvroRecordRowKeyBuilder" should {
    "extract the values from the avro record and create the key" in {
      val keys = Seq("firstName", "lastName", "age")
      val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys)

      val sinkRecord = mock[SinkRecord]
      val firstName = "Jack"
      val lastName = "Smith"
      val age = 29

      val record = new GenericRecord {

        val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age))

        override def get(key: String): AnyRef = values(key)

        override def put(key: String, v: scala.Any): Unit = sys.error("not supported")

        override def get(i: Int): AnyRef = sys.error("not supported")


        override def put(i: Int, v: scala.Any): Unit = sys.error("not supported")


        override def getSchema: Schema = sys.error("not supported")
      }

      val expectedValue = Bytes.add(
        Array(
          firstName.fromString(),
          rowKeyBuilder.delimBytes,
          lastName.fromString(),
          rowKeyBuilder.delimBytes,
          age.fromInt()))
      rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue
    }
  }
} 
Example 8
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 9
Source File: Avro4sConsumerImpl.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.consumer

import cats.implicits._
import java.util.regex.Pattern

import scala.concurrent.duration._
import org.apache.kafka.common._
import org.apache.kafka.clients.consumer._
import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.FromRecord
import cats.Functor
import com.banno.kafka._

//this is a Bifunctor[ConsumerApi]

case class Avro4sConsumerImpl[F[_]: Functor, K: FromRecord, V: FromRecord](
    c: ConsumerApi[F, GenericRecord, GenericRecord]
) extends ConsumerApi[F, K, V] {
  def assign(partitions: Iterable[TopicPartition]): F[Unit] = c.assign(partitions)
  def assignment: F[Set[TopicPartition]] = c.assignment
  def beginningOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] =
    c.beginningOffsets(partitions)
  def beginningOffsets(
      partitions: Iterable[TopicPartition],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, Long]] =
    c.beginningOffsets(partitions, timeout)
  def close: F[Unit] = c.close
  def close(timeout: FiniteDuration): F[Unit] = c.close(timeout)
  def commitAsync: F[Unit] = c.commitAsync
  def commitAsync(
      offsets: Map[TopicPartition, OffsetAndMetadata],
      callback: OffsetCommitCallback
  ): F[Unit] = c.commitAsync(offsets, callback)
  def commitAsync(callback: OffsetCommitCallback): F[Unit] = c.commitAsync(callback)
  def commitSync: F[Unit] = c.commitSync
  def commitSync(offsets: Map[TopicPartition, OffsetAndMetadata]): F[Unit] = c.commitSync(offsets)
  def committed(partition: Set[TopicPartition]): F[Map[TopicPartition, OffsetAndMetadata]] =
    c.committed(partition)
  def endOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] =
    c.endOffsets(partitions)
  def endOffsets(
      partitions: Iterable[TopicPartition],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, Long]] = c.endOffsets(partitions, timeout)
  def listTopics: F[Map[String, Seq[PartitionInfo]]] = c.listTopics
  def listTopics(timeout: FiniteDuration): F[Map[String, Seq[PartitionInfo]]] =
    c.listTopics(timeout)
  def metrics: F[Map[MetricName, Metric]] = c.metrics
  def offsetsForTimes(
      timestampsToSearch: Map[TopicPartition, Long]
  ): F[Map[TopicPartition, OffsetAndTimestamp]] =
    c.offsetsForTimes(timestampsToSearch)
  def offsetsForTimes(
      timestampsToSearch: Map[TopicPartition, Long],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, OffsetAndTimestamp]] =
    c.offsetsForTimes(timestampsToSearch, timeout)
  def partitionsFor(topic: String): F[Seq[PartitionInfo]] = c.partitionsFor(topic)
  def partitionsFor(topic: String, timeout: FiniteDuration): F[Seq[PartitionInfo]] =
    c.partitionsFor(topic, timeout)
  def pause(partitions: Iterable[TopicPartition]): F[Unit] = c.pause(partitions)
  def paused: F[Set[TopicPartition]] = c.paused
  def poll(timeout: FiniteDuration): F[ConsumerRecords[K, V]] =
    c.poll(timeout).map(_.fromGenericRecords[K, V])
  def position(partition: TopicPartition): F[Long] = c.position(partition)
  def resume(partitions: Iterable[TopicPartition]): F[Unit] = c.resume(partitions)
  def seek(partition: TopicPartition, offset: Long): F[Unit] = c.seek(partition, offset)
  def seekToBeginning(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToBeginning(partitions)
  def seekToEnd(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToEnd(partitions)
  def subscribe(topics: Iterable[String]): F[Unit] = c.subscribe(topics)
  def subscribe(topics: Iterable[String], callback: ConsumerRebalanceListener): F[Unit] =
    c.subscribe(topics, callback)
  def subscribe(pattern: Pattern): F[Unit] = c.subscribe(pattern)
  def subscribe(pattern: Pattern, callback: ConsumerRebalanceListener): F[Unit] =
    c.subscribe(pattern, callback)
  def subscription: F[Set[String]] = c.subscription
  def unsubscribe: F[Unit] = c.unsubscribe
  def wakeup: F[Unit] = c.wakeup
} 
Example 10
Source File: ProducerOps.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.producer

import cats.{Applicative, Foldable, MonadError, Traverse}
import cats.implicits._
import fs2._
import org.apache.kafka.common._
import org.apache.kafka.common.errors._
import org.apache.kafka.clients.consumer.OffsetAndMetadata
import org.apache.kafka.clients.producer._

case class ProducerOps[F[_], K, V](producer: ProducerApi[F, K, V]) {

  def sendAndForgetBatch[G[_]: Foldable](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[Unit] =
    records.traverse_(producer.sendAndForget)

  def sendSyncBatch[G[_]: Traverse](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[G[RecordMetadata]] =
    records.traverse(producer.sendSync)

  def sendAsyncBatch[G[_]: Traverse](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[G[RecordMetadata]] =
    records.traverse(producer.sendAsync)

  def pipeSync: Pipe[F, ProducerRecord[K, V], RecordMetadata] =
    _.evalMap(producer.sendSync)

  def pipeAsync: Pipe[F, ProducerRecord[K, V], RecordMetadata] =
    _.evalMap(producer.sendAsync)

  def sink: Pipe[F, ProducerRecord[K, V], Unit] =
    _.evalMap(producer.sendAndForget)

  def sinkSync: Pipe[F, ProducerRecord[K, V], Unit] =
    pipeSync.apply(_).void

  def sinkAsync: Pipe[F, ProducerRecord[K, V], Unit] =
    pipeAsync.apply(_).void

  def transaction[G[_]: Foldable](
      records: G[ProducerRecord[K, V]],
      offsets: Map[TopicPartition, OffsetAndMetadata],
      consumerGroupId: String
  )(implicit F: MonadError[F, Throwable]): F[Unit] =
    (for {
      _ <- producer.beginTransaction
      _ <- sendAndForgetBatch(records) //should be no need to wait for RecordMetadatas or errors, since commitTransaction flushes and throws
      _ <- producer.sendOffsetsToTransaction(offsets, consumerGroupId)
      _ <- producer.commitTransaction
    } yield ()).handleErrorWith {
      // Exception-handling described in https://kafka.apache.org/10/javadoc/org/apache/kafka/clients/producer/KafkaProducer.html#send-org.apache.kafka.clients.producer.ProducerRecord-org.apache.kafka.clients.producer.Callback-
      case e: ProducerFencedException => F.raiseError(e)
      case e: OutOfOrderSequenceException => F.raiseError(e)
      case e: UnsupportedVersionException => F.raiseError(e)
      case e: AuthorizationException => F.raiseError(e)
      case _ => producer.abortTransaction
    }
}

import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.ToRecord

case class GenericProducerOps[F[_]](producer: ProducerApi[F, GenericRecord, GenericRecord]) {

  def toAvro4s[K: ToRecord, V: ToRecord]: ProducerApi[F, K, V] =
    Avro4sProducerImpl[F, K, V](producer)

} 
Example 11
Source File: Avro4sProducerImpl.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.producer

import java.util.concurrent.{Future => JFuture}
import scala.concurrent.duration._
import org.apache.kafka.common._
import org.apache.kafka.clients.consumer.OffsetAndMetadata
import org.apache.kafka.clients.producer._
import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.ToRecord
import com.banno.kafka._

//this is like Bifunctor[ProducerApi] but is contravariant in both arguments; cats does not seem to have anything like ContravariantBifunctor...

case class Avro4sProducerImpl[F[_], K: ToRecord, V: ToRecord](
    p: ProducerApi[F, GenericRecord, GenericRecord]
) extends ProducerApi[F, K, V] {
  def abortTransaction: F[Unit] = p.abortTransaction
  def beginTransaction: F[Unit] = p.beginTransaction
  def close: F[Unit] = p.close
  def close(timeout: FiniteDuration): F[Unit] = p.close(timeout)
  def commitTransaction: F[Unit] = p.commitTransaction
  def flush: F[Unit] = p.flush
  def initTransactions: F[Unit] = p.initTransactions
  def metrics: F[Map[MetricName, Metric]] = p.metrics
  def partitionsFor(topic: String): F[Seq[PartitionInfo]] = p.partitionsFor(topic)
  def sendOffsetsToTransaction(
      offsets: Map[TopicPartition, OffsetAndMetadata],
      consumerGroupId: String
  ): F[Unit] = p.sendOffsetsToTransaction(offsets, consumerGroupId)

  private[producer] def sendRaw(record: ProducerRecord[K, V]): JFuture[RecordMetadata] =
    p.sendRaw(record.toGenericRecord)
  private[producer] def sendRaw(
      record: ProducerRecord[K, V],
      callback: Callback
  ): JFuture[RecordMetadata] = p.sendRaw(record.toGenericRecord, callback)
  private[producer] def sendRaw(
      record: ProducerRecord[K, V],
      callback: Either[Exception, RecordMetadata] => Unit
  ): Unit =
    p.sendRaw(record.toGenericRecord, callback)

  def sendAndForget(record: ProducerRecord[K, V]): F[Unit] = p.sendAndForget(record.toGenericRecord)
  def sendSync(record: ProducerRecord[K, V]): F[RecordMetadata] = p.sendSync(record.toGenericRecord)
  def sendAsync(record: ProducerRecord[K, V]): F[RecordMetadata] =
    p.sendAsync(record.toGenericRecord)
} 
Example 12
Source File: Decoding.scala    From avro4s   with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import java.util.Collections

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.util.ByteBufferInputStream
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Decoding extends BenchmarkHelpers {
  @State(Scope.Thread)
  class Setup {
    val avroBytes = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t)).toByteBuffer
    }

    val avro4sBytes = encode(RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t)))

    val (handrolledDecoder, handrolledReader) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: Codec[AttributeValue[Int]] = AttributeValueCodec[Int]
      implicit val schemaFor: SchemaFor[AttributeValue[Int]] = SchemaFor[AttributeValue[Int]](codec.schema)
      val recordSchemaFor = SchemaFor[RecordWithUnionAndTypeField]
      val decoder = Decoder[RecordWithUnionAndTypeField].withSchema(recordSchemaFor)
      val reader = new GenericDatumReader[GenericRecord](recordSchemaFor.schema)
      (decoder, reader)
    }

    val (avro4sDecoder, avro4sReader) = {
      val decoder = Decoder[RecordWithUnionAndTypeField]
      val reader = new GenericDatumReader[GenericRecord](decoder.schema)
      (decoder, reader)
    }
  }

  def encode[T: Encoder: SchemaFor](value: T): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val encoder = Encoder[T]
    val schema = AvroSchema[T]
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }
}

class Decoding extends CommonParams with BenchmarkHelpers {

  import Decoding._

  def decode[T](bytes: ByteBuffer, decoder: Decoder[T], reader: GenericDatumReader[GenericRecord]): T = {
    val dec =
      DecoderFactory.get().binaryDecoder(new ByteBufferInputStream(Collections.singletonList(bytes.duplicate)), null)
    val record = reader.read(null, dec)
    decoder.decode(record)
  }


  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = {
    import benchmarks.record.generated._
    blackhole.consume(RecordWithUnionAndTypeField.fromByteBuffer(setup.avroBytes.duplicate))
  }

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.handrolledDecoder, setup.handrolledReader))

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.avro4sDecoder, setup.avro4sReader))
} 
Example 13
Source File: Encoding.scala    From avro4s   with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Encoding extends BenchmarkHelpers {

  @State(Scope.Thread)
  class Setup {
    val record = RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t))

    val specificRecord = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t))
    }

    val (avro4sEncoder, avro4sWriter) = {
      val schema = AvroSchema[RecordWithUnionAndTypeField]
      val encoder = Encoder[RecordWithUnionAndTypeField]
      val writer = new GenericDatumWriter[GenericRecord](schema)
      (encoder, writer)
    }

    val (handrolledEncoder, handrolledWriter) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: AttributeValueCodec[Int] = AttributeValueCodec[Int]
      implicit val schemaForValid = codec.schemaForValid
      val schema = AvroSchema[RecordWithUnionAndTypeField]
      val encoder = Encoder[RecordWithUnionAndTypeField]
      val writer = new GenericDatumWriter[GenericRecord](schema)
      (encoder, writer)
    }

  }
}

class Encoding extends CommonParams with BenchmarkHelpers {

  import Encoding._

  def encode[T](value: T, encoder: Encoder[T], writer: GenericDatumWriter[GenericRecord]): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }


  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(setup.specificRecord.toByteBuffer)

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(encode(setup.record, setup.avro4sEncoder, setup.avro4sWriter))

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(encode(setup.record, setup.handrolledEncoder, setup.handrolledWriter))
} 
Example 14
Source File: AvroSchema.scala    From aloha   with MIT License
package com.eharmony.aloha.semantics.compiled.plugin.avro

import com.eharmony.aloha.reflect.RefInfo
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema.Schema.FieldRetrievalError
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema._
import org.apache.avro
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.GenericRecord

import scala.collection.JavaConversions.asScalaBuffer


  // Note: the enclosing class declaration and its other members (including the `extract` method
  // used below) are elided in this excerpt.
  protected[avro] def unionField(name: String, index: Int, fieldSchema: avro.Schema, reqField: Boolean): Result = {
    val union = fieldSchema.getTypes

    // If there's only one item in the union, treat the union as if it didn't exist.
    if (1 == union.size)
      extract(name, index, union.head, reqField)
    else {
      val nonNull = union.filter(t => t.getType != NULL)
      if (1 == nonNull.size)
        extract(name, index, nonNull.head, nullable = true)
      else
        Left(FieldRetrievalError("Only UNION fields of one type or two types where one is NULL are allowed."))
    }
  }
}

object AvroSchema {
  def apply(rootSchema: avro.Schema): AvroSchema = new AvroSchema(rootSchema, rootSchema)
} 
Example 15
Source File: BasicEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BasicEncoderTest extends AnyWordSpec with Matchers {

  "Encoder" should {
    "encode strings as UTF8" in {
      case class Foo(s: String)
      val schema = AvroSchema[Foo]
      val record = Encoder[Foo].encode(Foo("hello"))
      record shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
    "encode strings as GenericFixed and pad bytes when schema is fixed" in {
      case class Foo(s: String)

      val fixedSchema = SchemaFor[String](Schema.createFixed("FixedString", null, null, 7))
      implicit val fixedStringEncoder: Encoder[String] = Encoder.StringEncoder.withSchema(fixedSchema)

      val record = Encoder[Foo].encode(Foo("hello")).asInstanceOf[GenericRecord]
      record.get("s").asInstanceOf[GenericFixed].bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
      // the fixed should have the right size
      record.get("s").asInstanceOf[GenericFixed].bytes().length shouldBe 7
    }
    "encode longs" in {
      case class Foo(l: Long)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123456L)) shouldBe ImmutableRecord(schema, Vector(java.lang.Long.valueOf(123456L)))
    }
    "encode doubles" in {
      case class Foo(d: Double)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435)) shouldBe ImmutableRecord(schema, Vector(java.lang.Double.valueOf(123.435D)))
    }
    "encode booleans" in {
      case class Foo(d: Boolean)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(true)) shouldBe ImmutableRecord(schema, Vector(java.lang.Boolean.valueOf(true)))
    }
    "encode floats" in {
      case class Foo(d: Float)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435F)) shouldBe ImmutableRecord(schema, Vector(java.lang.Float.valueOf(123.435F)))
    }
    "encode ints" in {
      case class Foo(i: Int)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123)) shouldBe ImmutableRecord(schema, Vector(java.lang.Integer.valueOf(123)))
    }
    "support uppercase packages" in {
      val schema = AvroSchema[ClassInUppercasePackage]
      val t = com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage("hello")
      schema.getFullName shouldBe "com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage"
      Encoder[ClassInUppercasePackage].encode(t) shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
  }
} 
Example 16
Source File: FixedEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{AvroFixed, Encoder, SchemaFor}
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

@AvroFixed(8)
case class QuarterSHA256(bytes: Array[Byte]) extends AnyVal

case class FixedString(@AvroFixed(7) mystring: String)

case class AvroMessage(q: QuarterSHA256, payload: Array[Byte])

@AvroFixed(8)
case class FixedValueType(z: String) extends AnyVal
case class OptionFixedWrapper(opt: Option[FixedValueType])

class FixedEncoderTest extends AnyFunSuite with Matchers {

  val m = AvroMessage(
    QuarterSHA256(Array[Byte](0, 1, 2, 3, 4, 5, 6)),
    Array[Byte](0, 1, 2, 3)
  )

  test("encode fixed when used on a value type") {
    val schema = SchemaFor[AvroMessage]
    val record = Encoder[AvroMessage].encode(m).asInstanceOf[GenericRecord]
    record.get("q").asInstanceOf[GenericFixed].bytes().toVector shouldBe Vector(0, 1, 2, 3, 4, 5, 6, 0)
  }

  test("encode fixed when used on a field in a case class") {
    val schema = SchemaFor[FixedString]
    val record = Encoder[FixedString].encode(FixedString("sam")).asInstanceOf[GenericRecord]
    record.get("mystring").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0)
  }

  test("support options of fixed") {
    val schema = SchemaFor[OptionFixedWrapper]
    val record = Encoder[OptionFixedWrapper].encode(OptionFixedWrapper(Some(FixedValueType("sam")))).asInstanceOf[GenericRecord]
    record.get("opt").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0, 0)
  }
} 
Example 17
Source File: ByteArrayEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import java.nio.ByteBuffer

import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class ByteArrayEncoderTest extends AnyFunSuite with Matchers {

  test("encode byte arrays as BYTES type") {
    case class Test(z: Array[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Array[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte vectors as BYTES type") {
    case class Test(z: Vector[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Vector[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte seq as BYTES type") {
    case class Test(z: Seq[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Seq[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte list as BYTES type") {
    case class Test(z: List[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(List[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode top level byte arrays") {
    val schema = AvroSchema[Array[Byte]]
    Encoder[Array[Byte]].encode(Array[Byte](1, 4, 9))
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode ByteBuffers as BYTES type") {
    case class Test(z: ByteBuffer)
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(ByteBuffer.wrap(Array[Byte](1, 4, 9))))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode top level ByteBuffers") {
    val schema = AvroSchema[ByteBuffer]
    Encoder[ByteBuffer].encode(ByteBuffer.wrap(Array[Byte](1, 4, 9)))
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("support FIXED") {
    val schema = SchemaBuilder.fixed("foo").size(7)
    val fixed = Encoder.ByteArrayEncoder.withSchema(SchemaFor(schema)).encode("hello".getBytes).asInstanceOf[GenericFixed]
    fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
    fixed.bytes().length shouldBe 7
  }
} 
Example 18
Source File: TupleEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{AvroSchema, Encoder}
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class TupleEncoderTest extends AnyFunSuite with Matchers {

  test("encode tuple2") {
    case class Test(z: (String, Option[Int]))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55))).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
  }

  test("encode tuple3") {
    case class Test(z: (String, Option[Int], Long))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55), 9999999L)).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe 9999999L
  }

  test("encode tuple4") {
    case class Test(z: (String, Option[Int], Boolean, Double))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55), true, 0.24)).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe true
    z.get("_4") shouldBe 0.24
  }

  test("encode tuple5") {
    case class Test(z: (String, Option[Int], String, Boolean, String))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("a", Some(55), "b", true, "c")).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("a")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe new Utf8("b")
    z.get("_4") shouldBe true
    z.get("_5") shouldBe new Utf8("c")
  }
} 
Example 19
Source File: OptionOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8

class OptionOutputStreamTest extends OutputStreamTest {

  test("options of booleans") {
    case class Test(z: Option[Boolean])
    writeRead(Test(Some(true))) { record =>
      record.get("z") shouldBe true
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of ints") {
    case class Test(z: Option[Int])
    writeRead(Test(Some(43242))) { record =>
      record.get("z") shouldBe 43242
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of longs") {
    case class Test(z: Option[Long])
    writeRead(Test(Some(43242L))) { record =>
      record.get("z") shouldBe 43242L
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of doubles") {
    case class Test(z: Option[Double])
    writeRead(Test(Some(123.34))) { record =>
      record.get("z") shouldBe java.lang.Double.valueOf(123.34)
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of strings") {
    case class Test(z: Option[String])
    writeRead(Test(Some("hello"))) { record =>
      record.get("z") shouldBe new Utf8("hello")
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of classes") {
    case class Foo(s: String)
    case class Test(z: Option[Foo])
    writeRead(Test(Some(Foo("hello")))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("s") shouldBe new Utf8("hello")
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }
} 
Example 20
Source File: OutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s._
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

trait OutputStreamTest extends AnyFunSuite with Matchers {

  def readData[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readData(out.toByteArray)
  def readData[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes), datumReader)
    dataFileReader.next
  }

  def writeData[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.data[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readBinary[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readBinary(out.toByteArray)
  def readBinary[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val decoder = DecoderFactory.get().binaryDecoder(new SeekableByteArrayInput(bytes), null)
    datumReader.read(null, decoder)
  }

  def writeBinary[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.binary[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readJson[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readJson(out.toByteArray)
  def readJson[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val schema = AvroSchema[T]
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().jsonDecoder(schema, new SeekableByteArrayInput(bytes))
    datumReader.read(null, decoder)
  }

  def writeJson[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.json[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def writeRead[T: Encoder : SchemaFor](t: T)(fn: GenericRecord => Any): Unit = {
    {
      val out = writeData(t)
      val record = readData(out)
      fn(record)
    }
    {
      val out = writeBinary(t)
      val record = readBinary(out)
      fn(record)
    }
    {
      val out = writeJson(t)
      val record = readJson(out)
      fn(record)
    }
  }
} 
Example 21
Source File: EitherOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.util

import com.sksamuel.avro4s.schema.Wine
import org.apache.avro.AvroRuntimeException
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8

class EitherOutputStreamTest extends OutputStreamTest {

  import scala.collection.JavaConverters._

  test("write out either of primitives") {
    case class Test(z: Either[String, Int])
    writeRead(Test(Left("hello"))) { record =>
      record.get("z") shouldBe new Utf8("hello")
    }
    writeRead(Test(Right(45))) { record =>
      record.get("z") shouldBe 45
    }
  }

  test("write out either of Array") {
    case class Test(z: Either[Array[Int], String])
    writeRead(Test(Left(Array(1, 3, 4)))) { record =>
      record.get("z").asInstanceOf[GenericData.Array[Int]].asScala shouldBe List(1, 3, 4)
    }
  }

  test("write out either of Seq") {
    case class Test(z: Either[String, Seq[String]])
    writeRead(Test(Right(Seq("c", "d")))) { record =>
      record.get("z").asInstanceOf[GenericData.Array[String]].asScala shouldBe List(new Utf8("c"), new Utf8("d"))
    }
  }

  test("write out either of enum") {
    case class Test(z: Either[Wine, Seq[String]])
    writeRead(Test(Left(Wine.Malbec))) { record =>
      record.get("z").asInstanceOf[GenericData.EnumSymbol].toString shouldBe "Malbec"
    }
  }

  test("write out either of Maps") {
    case class Test(z: Either[Array[Int], Map[String, Boolean]])
    writeRead(Test(Right(Map("a" -> true, "b" -> false)))) { record =>
      record.get("z").asInstanceOf[util.HashMap[String, Boolean]].asScala shouldBe Map(new Utf8("a") -> true, new Utf8("b") -> false)
    }
  }

  test("write out case classes") {
    case class Foo(a: String)
    case class Bar(b: Boolean)
    case class Test(z: Either[Foo, Bar])
    writeRead(Test(Left(Foo("hello")))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("a") shouldBe new Utf8("hello")
    }
    writeRead(Test(Right(Bar(true)))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("b") shouldBe true
    }
  }

  test("throw an exception if trying to use two collection types in an either") {
    intercept[AvroRuntimeException] {
      case class Test(z: Either[Seq[String], List[Int]])
      writeRead(Test(Left(Seq("hello")))) { record =>
      }
    }
  }
} 
Example 22
Source File: BasicOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import com.sksamuel.avro4s.{Encoder, SchemaFor}
import org.apache.avro.Schema.Parser
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}
import org.apache.avro.util.Utf8

class BasicOutputStreamTest extends OutputStreamTest {

  test("write out booleans") {
    case class Test(z: Boolean)
    writeRead(Test(true)) { record =>
      record.get("z") shouldBe true
    }
  }

  test("write out strings") {
    case class Test(z: String)
    writeRead(Test("Hello world")) { record =>
      record.get("z") shouldBe new Utf8("Hello world")
    }
  }

  test("write out longs") {
    case class Test(z: Long)
    writeRead(Test(65653L)) { record =>
      record.get("z") shouldBe 65653L
    }
  }

  test("write out ints") {
    case class Test(z: Int)
    writeRead(Test(44)) { record =>
      record.get("z") shouldBe 44
    }
  }

  test("write out doubles") {
    case class Test(z: Double)
    writeRead(Test(3.235)) { record =>
      record.get("z") shouldBe 3.235
    }
  }

  test("write out floats") {
    case class Test(z: Float)
    writeRead(Test(3.4F)) { record =>
      record.get("z") shouldBe 3.4F
    }
  }

  test("write out generic record") {
    val recordSchema = new Parser().parse(
      """{"type":"record","name":"Test","fields":[{"name":"field","type":"string"}]}"""
    )
    implicit val recordSchemaFor: SchemaFor[GenericRecord] = SchemaFor(recordSchema)

    implicit val encoder: Encoder[GenericRecord] = new Encoder[GenericRecord] {
      def schemaFor = recordSchemaFor

      def encode(value: GenericRecord): AnyRef = value
    }


    val record: GenericRecord = new GenericRecordBuilder(recordSchema).set("field", "value").build()

    writeRead(record) { rec =>
      rec.get("field") shouldBe new Utf8("value")
    }
  }
} 
Example 23
Source File: GithubIssue235.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{Decoder, Encoder, RecordFormat, SchemaFor}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

case class Label(value: String) extends AnyVal
case class Value[A](label: Label, value: A)

sealed trait OneOrTwo[A]
case class One[A](value: Value[A]) extends OneOrTwo[A]
case class Two[A](first: Value[A], second: Value[A]) extends OneOrTwo[A]
case class OneOrTwoWrapper[A](t: OneOrTwo[A])

object Bug {
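  // apply round-trips `a` through Avro binary encoding, via its GenericRecord representation,
  // and requires that decoding yields a value equal to the input.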

  def apply[T <: Product](a: T)(
    implicit schemaFor: SchemaFor[T],
    encoder: Encoder[T],
    decoder: Decoder[T]
  ): Unit = {

    val format = RecordFormat[T]
    val schema = schemaFor.schema
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val datumWriter = new GenericDatumWriter[GenericRecord](schema)

    val stream = new ByteArrayOutputStream()
    val bEncoder = EncoderFactory.get().binaryEncoder(stream, null)

    datumWriter.write(format.to(a), bEncoder)
    bEncoder.flush()

    val bytes = stream.toByteArray
    val bDecoder = DecoderFactory.get().binaryDecoder(bytes, null)
    val record = datumReader.read(null, bDecoder)
    require(format.from(record) == a)
  }

}

class GithubIssue235 extends AnyFunSuite with Matchers {
  test("Broken typeclass derivation upgrading from 1.9.0 to 2.0.1 #235") {
    val o = OneOrTwoWrapper(One(Value(Label("lbl"), "foo")))
    Bug(o)
  }
} 
Example 24
Source File: GithubIssue191.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema}
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

final case class SN(value: String) extends AnyVal
final case class SimpleUser(name: String, sn: Option[SN])

class GithubIssue191 extends AnyFunSuite with Matchers {

  test("writing out AnyVal in an option") {
    implicit val schema = AvroSchema[SimpleUser]
    val bytes = new ByteArrayOutputStream
    val out = AvroOutputStream.data[SimpleUser].to(bytes).build()
    out.write(SimpleUser("Tom", Some(SN("123"))))
    out.close()

    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes.toByteArray), datumReader)
    val record = new Iterator[GenericRecord] {
      override def hasNext: Boolean = dataFileReader.hasNext
      override def next(): GenericRecord = dataFileReader.next
    }.toList.head
    record.getSchema shouldBe schema
    record.get("name") shouldBe new Utf8("Tom")
    record.get("sn") shouldBe new Utf8("123")
  }
} 
Example 25
Source File: AvroSerializer.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders.instances.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import com.twitter.chill.KSerializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import org.apache.beam.sdk.coders.AvroCoder

import scala.collection.mutable.{Map => MMap}
import scala.util.Try

private[coders] class GenericAvroSerializer extends KSerializer[GenericRecord] {
  private lazy val cache: MMap[String, AvroCoder[GenericRecord]] = MMap()

  private def getCoder(schemaStr: String): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(new Schema.Parser().parse(schemaStr)))
  private def getCoder(schemaStr: String, schema: Schema): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(schema))

  override def write(kryo: Kryo, out: Output, obj: GenericRecord): Unit = {
    val schemaStr = obj.getSchema.toString
    val coder = this.getCoder(schemaStr, obj.getSchema)
    // write schema before every record in case it's not in reader serializer's cache
    out.writeString(schemaStr)
    coder.encode(obj, out)
  }

  override def read(kryo: Kryo, in: Input, cls: Class[GenericRecord]): GenericRecord = {
    val coder = this.getCoder(in.readString())
    coder.decode(in)
  }
}

private[coders] class SpecificAvroSerializer[T <: SpecificRecordBase] extends KSerializer[T] {
  private lazy val cache: MMap[Class[T], AvroCoder[T]] = MMap()

  private def getCoder(cls: Class[T]): AvroCoder[T] =
    cache.getOrElseUpdate(
      cls,
      Try(cls.getConstructor().newInstance().getSchema)
        .map(AvroCoder.of(cls, _))
        .getOrElse(AvroCoder.of(cls))
    )

  override def write(kser: Kryo, out: Output, obj: T): Unit =
    this.getCoder(obj.getClass.asInstanceOf[Class[T]]).encode(obj, out)

  override def read(kser: Kryo, in: Input, cls: Class[T]): T =
    this.getCoder(cls).decode(in)
} 
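Because write() prefixes every record with its schema JSON, the reading side can decode even when its own schema cache is cold, at the cost of a larger payload per element. A rough sketch of the registration pattern these serializers target (they are package-private, so the real wiring lives inside com.spotify.scio.coders; Kryo's addDefaultSerializer is used here purely for illustration):

import com.esotericsoftware.kryo.Kryo
import org.apache.avro.generic.GenericRecord

val kryo = new Kryo()
// route every GenericRecord implementation through the schema-prefixing serializer
kryo.addDefaultSerializer(classOf[GenericRecord], new GenericAvroSerializer)
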
Example 26
Source File: TestUtilsBase.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._



    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = Collections.singletonMap(offsetColumn, offsetValue)
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)

    taskContext
  }
} 
Example 27
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.source

import java.io.File
import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.avro.{Schema => AvroSchema}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty

  override def convert(kafkaTopic: String,
                       sourceTopic: String,
                       messageId: String,
                       bytes: Array[Byte],
                       keys: Seq[String] = Seq.empty,
                       keyDelimiter: String = "."): SourceRecord = {
    Option(bytes) match {
      case None =>
        new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic),
          null,
          kafkaTopic,
          avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)),
          null)
      case Some(_) =>
        val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic"))
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record)
        val value = schemaAndValue.value()
        value match {
          case s: Struct if keys.nonEmpty =>
            val keysValue = keys.flatMap { key =>
              Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString)
            }.mkString(keyDelimiter)
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              Schema.STRING_SCHEMA,
              keysValue,
              schemaAndValue.schema(),
              schemaAndValue.value())
          case _ =>
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              MsgKey.schema,
              MsgKey.getStruct(sourceTopic, messageId),
              schemaAndValue.schema(),
              schemaAndValue.value())
        }

    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sourceToSchemaMap = AvroConverter.getSchemas(config)
    avroReadersMap = sourceToSchemaMap.map { case (key, schema) =>
      key -> new GenericDatumReader[GenericRecord](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.source.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(source, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = source.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE")
      }.toMap
  }
} 
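The SCHEMA_CONFIG value is parsed as semicolon-separated source=avro-file pairs; source names are lower-cased and each schema file must exist on disk. A minimal initialization sketch with hypothetical topic names and paths:

val converter = new AvroConverter()
converter.initialize(Map(
  AvroConverter.SCHEMA_CONFIG -> "/mqtt/sensor_a=/etc/schemas/sensor_a.avsc;/mqtt/sensor_b=/etc/schemas/sensor_b.avsc"
))
// afterwards, convert("kafka-sensors", "/mqtt/sensor_a", messageId, payloadBytes) decodes
// payloadBytes with sensor_a.avsc and emits a SourceRecord
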
Example 28
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.sink

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import java.io.ByteArrayOutputStream
import java.io.File
import org.apache.avro.{Schema => AvroSchema}
import org.apache.avro.generic.GenericRecord
import org.apache.avro.io.EncoderFactory
import org.apache.avro.reflect.ReflectDatumWriter
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sinkToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroWritersMap: Map[String, ReflectDatumWriter[Object]] = Map.empty

  override def convert(sinkTopic: String,
                       data: SinkRecord): SinkRecord = {
    Option(data) match {
      case None =>
        new SinkRecord(
          sinkTopic,
          0,
          null,
          null,
          avroData.toConnectSchema(sinkToSchemaMap(sinkTopic)),
          null,
          0
        )
      case Some(_) =>
        val kafkaTopic = data.topic()
        val writer = avroWritersMap.getOrElse(kafkaTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $kafkaTopic"))

        val output = new ByteArrayOutputStream()
        val encoder = EncoderFactory.get().binaryEncoder(output, null)

        // AvroData yields a GenericRecord for struct values; ReflectDatumWriter serializes it directly
        val avro = avroData.fromConnectData(data.valueSchema(), data.value())
        writer.write(avro, encoder)
        encoder.flush()
        val arr = output.toByteArray

        new SinkRecord(
          kafkaTopic,
          data.kafkaPartition(),
          MsgKey.schema,
          MsgKey.getStruct(sinkTopic, data.key().toString()),
          data.valueSchema(),
          arr,
          0
        )


    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sinkToSchemaMap = AvroConverter.getSchemas(config)
    avroWritersMap = sinkToSchemaMap.map { case (key, schema) =>
      key -> new ReflectDatumWriter[Object](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(sink, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = sink.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Sink->AVRO_FILE")
      }.toMap
  }
} 
Example 29
Source File: AvroSerializer.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.serialization

import java.io.{ByteArrayOutputStream, InputStream, OutputStream}

import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroSerializer {
  def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit = write(apply(t), schemaFor())

  def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = {
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val encoder = EncoderFactory.get().binaryEncoder(os, null)

    writer.write(record, encoder)
    encoder.flush()
    os.flush()
  }

  def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] = getBytes(recordFormat.to(t), schemaFor())

  def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = {
    implicit val output = new ByteArrayOutputStream()
    write(record, schema)
    output.toByteArray
  }

  def read(is: InputStream, schema: Schema): GenericRecord = {
    val reader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().binaryDecoder(is, null)
    reader.read(null, decoder)
  }

  def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T = recordFormat.from(read(is, schemaFor()))

  def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t)
} 
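A round-trip sketch for these helpers, assuming the avro4s 1.x-style RecordFormat/SchemaFor implicits this object was written against and a hypothetical case class:

import java.io.ByteArrayInputStream
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}

case class Visit(userId: String, count: Int) // hypothetical record type

implicit val format: RecordFormat[Visit] = RecordFormat[Visit]
implicit val schemaFor: SchemaFor[Visit] = SchemaFor[Visit]

val bytes = AvroSerializer.getBytes(Visit("u1", 3))
val back = AvroSerializer.read[Visit](new ByteArrayInputStream(bytes)) // back == Visit("u1", 3)
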
Example 30
Source File: FixAvroIO.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package fix
package v0_7_0

import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import com.spotify.scio.testing.{AvroIO, BigQueryIO, PipelineSpec, TextIO}

case class InputClass(s: String, i: Int) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

case class OutputClass(result: String) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

object TestJob

class ValidationJobTest extends PipelineSpec {
  val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) }
  val inputs2 = (1 to 10).zip(inputs).toMap
  val inputs3 = inputs2.values
  val expected = List(OutputClass("result"))

  "TestJob" should "run" in {
    JobTest[TestJob.type]
      .input(AvroIO("current"), inputs)
      .input(AvroIO("reference"), inputs2.values)
      .input(AvroIO("reference2"), inputs3)
      .input(AvroIO[InputClass]("donttouch"), inputs)
      .output[OutputClass](AvroIO("foo")){ coll =>
coll should containInAnyOrder(expected)
()
}
      .run()
  }
} 
Example 31
Source File: FixAvroIO.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package fix
package v0_7_0

import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import com.spotify.scio.testing.PipelineSpec
import com.spotify.scio.avro._
import com.spotify.scio.bigquery._
import com.spotify.scio.io._

case class InputClass(s: String, i: Int) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

case class OutputClass(result: String) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

object TestJob

class ValidationJobTest extends PipelineSpec {
  val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) }
  val inputs2 = (1 to 10).zip(inputs).toMap
  val inputs3 = inputs2.values
  val expected = List(OutputClass("result"))

  "TestJob" should "run" in {
    JobTest[TestJob.type]
      .input(AvroIO[InputClass]("current"), inputs)
      .input(AvroIO[GenericRecord]("reference"), inputs2.values)
      .input(AvroIO[InputClass]("reference2"), inputs3)
      .input(AvroIO[InputClass]("donttouch"), inputs)
      .output[OutputClass](AvroIO("foo")){ coll =>
coll should containInAnyOrder(expected)
()
}
      .run()
  }
} 
Example 32
Source File: CoderTestUtils.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders

import com.spotify.scio.avro.TestRecord
import org.apache.beam.sdk.coders.{Coder => BCoder}
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.util.CoderUtils

object CoderTestUtils {
  case class Pair(name: String, size: Int)
  case class CaseClassWithGenericRecord(name: String, size: Int, record: GenericRecord)
  case class CaseClassWithSpecificRecord(name: String, size: Int, record: TestRecord)

  def testRoundTrip[T](coder: BCoder[T], value: T): Boolean =
    testRoundTrip(coder, coder, value)

  def testRoundTrip[T](writer: BCoder[T], reader: BCoder[T], value: T): Boolean = {
    val bytes = CoderUtils.encodeToByteArray(writer, value)
    val result = CoderUtils.decodeFromByteArray(reader, bytes)
    result == value
  }
} 
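For example, a quick check against a built-in Beam coder (any BCoder[T] works the same way):

import org.apache.beam.sdk.coders.StringUtf8Coder

CoderTestUtils.testRoundTrip(StringUtf8Coder.of(), "hello") // true when encode/decode is lossless
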
Example 33
Source File: ProtobufUtilTest.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.util

import java.io.File
import java.nio.channels.Channels
import java.nio.file.Files

import com.spotify.scio.ScioContext
import com.spotify.scio.avro._
import com.spotify.scio.coders.Coder
import com.spotify.scio.proto.Track.TrackPB
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.beam.sdk.io.{FileSystems, LocalResources}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.jdk.CollectionConverters._

class ProtobufUtilTest extends AnyFlatSpec with Matchers {

  "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in {
    val sc = ScioContext()

    val dir = Files.createTempDirectory("protobuf-util-")
    val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2"))
    path1.deleteOnExit()
    path2.deleteOnExit()
    dir.toFile.deleteOnExit()

    implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder

    val messages = sc
      .parallelize(1 to 10)
      .map(i => TrackPB.newBuilder().setTrackId(i.toString).build())

    messages
      .map(ProtobufUtil.toAvro[TrackPB])
      .saveAsAvroFile(
        path1.getPath,
        suffix = ".protobuf",
        metadata = ProtobufUtil.schemaMetadataOf[TrackPB],
        schema = ProtobufUtil.AvroMessageSchema,
        numShards = 1
      )

    val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1)

    val result = sc.run().waitUntilDone()

    val (tapFromAvroWrite, tapFromProtoWrite) = (
      ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)),
      protoWriteTap.get(result)
    )

    tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList
    getMetadata(path1) should contain theSameElementsAs getMetadata(path2)
  }

  private def getMetadata(dir: File): Map[String, AnyRef] = {
    val files = dir.listFiles()
    if (files.length != 1) {
      fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}")
    }

    val dfs = new DataFileStream[GenericRecord](
      Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))),
      new GenericDatumReader[GenericRecord]
    )

    dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap
  }
} 
Example 34
Source File: Pretty.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.testing

import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import scala.jdk.CollectionConverters._
import com.spotify.scio.{registerSysProps, SysProp}
import scala.util.Try

@registerSysProps
object PrettySysProps {
  val PrettyPrint =
    SysProp("tests.prettyprint.colors", "Should pretty printed values be rendered with colors")
}

object Pretty {
  import pprint.Tree
  import fansi.{Color, Str}

  private def renderFieldName(n: String) =
    Tree.Lazy(ctx => List(Color.LightBlue(n).toString).iterator)

  private def renderGenericRecord: PartialFunction[GenericRecord, Tree] = {
    case g =>
      val renderer =
        new pprint.Renderer(
          printer.defaultWidth,
          printer.colorApplyPrefix,
          printer.colorLiteral,
          printer.defaultIndent
        )
      def render(tree: Tree): Str =
        Str.join(renderer.rec(tree, 0, 0).iter.toSeq: _*)
      Tree.Lazy { ctx =>
        val fields =
          for {
            f <- g.getSchema().getFields().asScala
          } yield Str.join(
            render(renderFieldName(f.name)),
            ": ",
            render(treeifyAvro(g.get(f.name())))
          )
        List(
          Color.LightGray("{ ").toString +
            fields.reduce((a, b) => Str.join(a, ", ", b)) +
            Color.LightGray(" }")
        ).iterator
      }
  }

  private def renderSpecificRecord: PartialFunction[SpecificRecordBase, Tree] = {
    case x =>
      val fs =
        for {
          f <- x.getSchema().getFields().asScala
        } yield Tree.Infix(renderFieldName(f.name), "=", treeifyAvro(x.get(f.name())))
      Tree.Apply(x.getClass().getSimpleName(), fs.iterator)
  }

  private def treeifyAvro: PartialFunction[Any, Tree] = {
    case x: SpecificRecordBase =>
      renderSpecificRecord(x)
    case g: GenericRecord =>
      renderGenericRecord(g)
    case x =>
      printer.treeify(x)
  }

  private val handlers: PartialFunction[Any, Tree] = {
    case x: GenericRecord => treeifyAvro(x)
  }

  private val useColors =
    PrettySysProps.PrettyPrint.valueOption
      .flatMap(x => Try(x.toBoolean).toOption)
      .getOrElse {
        // Crude test to check if the terminal seems to support colors
        (System.console() != null) && (System.getenv().get("TERM") != null)
      }

  val printer =
    if (useColors) {
      pprint.PPrinter(
        additionalHandlers = handlers
      )
    } else {
      pprint.PPrinter(
        additionalHandlers = handlers,
        colorLiteral = fansi.Attrs.Empty,
        colorApplyPrefix = fansi.Attrs.Empty
      )
    }
} 
Example 35
Source File: AvroInstances.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.schemas.instances

import com.spotify.scio.schemas.{RawRecord, Schema}
import org.apache.avro.specific.SpecificRecord
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.beam.sdk.schemas.utils.AvroUtils
import org.apache.beam.sdk.schemas.{AvroRecordSchema, Schema => BSchema}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{Row, TypeDescriptor}

import scala.jdk.CollectionConverters._
import scala.reflect.{classTag, ClassTag}

trait AvroInstances {
  implicit def avroSchema[T <: SpecificRecord: ClassTag]: Schema[T] = {
    // TODO: broken because of a bug upstream https://issues.apache.org/jira/browse/BEAM-6742
    // RawRecord[T](new AvroRecordSchema())
    import org.apache.avro.reflect.ReflectData
    val rc = classTag[T].runtimeClass.asInstanceOf[Class[T]]
    val provider = new AvroRecordSchema()
    val td = TypeDescriptor.of(rc)
    val schema = provider.schemaFor(td)
    val avroSchema =
      new AvroInstances.SerializableSchema(ReflectData.get().getSchema(td.getRawType))

    def fromRow = provider.fromRowFunction(td)

    val toRow: SerializableFunction[T, Row] =
      new SerializableFunction[T, Row] {
        def apply(t: T): Row =
          AvroInstances.recordtoRow(schema, avroSchema, t)
      }
    RawRecord[T](schema, fromRow, toRow)
  }

  def fromAvroSchema(schema: org.apache.avro.Schema): Schema[GenericRecord] = {
    val beamSchema = AvroUtils.toBeamSchema(schema)
    val avroSchema = new AvroInstances.SerializableSchema(schema)
    val toRow = new SerializableFunction[GenericRecord, Row] {
      def apply(t: GenericRecord): Row =
        AvroInstances.recordtoRow[GenericRecord](beamSchema, avroSchema, t)
    }

    val fromRow = new SerializableFunction[Row, GenericRecord] {
      def apply(t: Row): GenericRecord =
        AvroUtils.toGenericRecord(t, avroSchema.get)
    }

    RawRecord[GenericRecord](beamSchema, fromRow, toRow)
  }
}

object AvroInstances {
  private class SerializableSchema(@transient private val schema: org.apache.avro.Schema)
      extends Serializable {
    private[this] val stringSchema = schema.toString
    def get: org.apache.avro.Schema = new org.apache.avro.Schema.Parser().parse(stringSchema)
  }

  // Workaround BEAM-6742
  private def recordtoRow[T <: IndexedRecord](
    schema: BSchema,
    avroSchema: SerializableSchema,
    t: T
  ): Row = {
    val row = Row.withSchema(schema)
    schema.getFields.asScala.zip(avroSchema.get.getFields.asScala).zipWithIndex.foreach {
      case ((f, a), i) =>
        val value = t.get(i)
        val v = AvroUtils.convertAvroFieldStrict(value, a.schema, f.getType)
        row.addValue(v)
    }
    row.build()
  }
} 
Example 36
Source File: AvroCoders.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders.instances

import java.io.{InputStream, OutputStream}

import com.spotify.scio.coders.{AvroCoderMacros, Coder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.{SpecificData, SpecificFixed}
import org.apache.beam.sdk.coders.Coder.NonDeterministicException
import org.apache.beam.sdk.coders.{AtomicCoder, AvroCoder, StringUtf8Coder}
import org.apache.beam.sdk.util.common.ElementByteSizeObserver

import scala.reflect.{classTag, ClassTag}

final private class SlowGenericRecordCoder extends AtomicCoder[GenericRecord] {
  // TODO: can we find something more efficient than String ?
  private[this] val sc = StringUtf8Coder.of()

  override def encode(value: GenericRecord, os: OutputStream): Unit = {
    val schema = value.getSchema
    val coder = AvroCoder.of(schema)
    sc.encode(schema.toString, os)
    coder.encode(value, os)
  }

  override def decode(is: InputStream): GenericRecord = {
    val schemaStr = sc.decode(is)
    val schema = new Schema.Parser().parse(schemaStr)
    val coder = AvroCoder.of(schema)
    coder.decode(is)
  }

  // delegate methods for determinism and equality checks
  override def verifyDeterministic(): Unit =
    throw new NonDeterministicException(
      this,
      "Coder[GenericRecord] without schema is non-deterministic"
    )
  override def consistentWithEquals(): Boolean = false
  override def structuralValue(value: GenericRecord): AnyRef =
    AvroCoder.of(value.getSchema).structuralValue(value)

  // delegate methods for byte size estimation
  override def isRegisterByteSizeObserverCheap(value: GenericRecord): Boolean =
    AvroCoder.of(value.getSchema).isRegisterByteSizeObserverCheap(value)
  override def registerByteSizeObserver(
    value: GenericRecord,
    observer: ElementByteSizeObserver
  ): Unit =
    AvroCoder.of(value.getSchema).registerByteSizeObserver(value, observer)
}


  // TODO: Use a coder that does not serialize the schema
  def avroGenericRecordCoder(schema: Schema): Coder[GenericRecord] =
    Coder.beam(AvroCoder.of(schema))

  // XXX: similar to GenericAvroSerializer
  def avroGenericRecordCoder: Coder[GenericRecord] =
    Coder.beam(new SlowGenericRecordCoder)

  import org.apache.avro.specific.SpecificRecordBase
  implicit def genAvro[T <: SpecificRecordBase]: Coder[T] =
    macro AvroCoderMacros.staticInvokeCoder[T]

  implicit def avroSpecificFixedCoder[T <: SpecificFixed: ClassTag]: Coder[T] =
    SpecificFixedCoder[T]
} 
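When the schema is known up front, the schema-aware coder is the better choice: SlowGenericRecordCoder re-serializes the schema string with every element and declares itself non-deterministic. A small sketch, assuming schemaJson holds the Avro schema as a string:

import com.spotify.scio.coders.Coder
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

val schema: Schema = new Schema.Parser().parse(schemaJson) // schemaJson assumed available
implicit val grCoder: Coder[GenericRecord] = Coder.avroGenericRecordCoder(schema)
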
Example 37
Source File: StdAvroModelFactory.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.factory.avro

import java.io.File

import org.apache.commons.{vfs => vfs1, vfs2}
import com.eharmony.aloha.io.vfs.{Vfs1, Vfs2}
import com.eharmony.aloha.audit.impl.avro.Score
import com.eharmony.aloha.factory.ModelFactory
import org.apache.avro.generic.GenericRecord

import scala.util.Try




  @deprecated(message = "Prefer StdAvroModelFactory.fromConfig(conf: FactoryConfig)", since = "4.0.1")
  def apply(modelDomainSchemaVfsUrl: String,
            modelCodomainRefInfoStr: String,
            imports: Seq[String] = Nil,
            classCacheDir: Option[File] = None,
            dereferenceAsOptional: Boolean = true,
            useVfs2: Boolean = true): Try[ModelFactory[GenericRecord, Score]] = {

    val vfs = url(modelDomainSchemaVfsUrl, useVfs2)

    vfs.flatMap { u =>
      UrlConfig(
        u,
        modelCodomainRefInfoStr,
        imports,
        classCacheDir,
        dereferenceAsOptional
      )()
    }
  }

  private[this] def url(modelDomainSchemaVfsUrl: String, useVfs2: Boolean) = {
    val u =
      if (useVfs2)
        Try { Vfs2(vfs2.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) }
      else Try { Vfs1(vfs1.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) }
    FactoryConfig.wrapException(u)
  }
} 
Example 38
Source File: AvroBytesUtil.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders

import java.nio.ByteBuffer

import org.apache.avro.{Schema => ASchema}
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.beam.sdk.coders.{Coder => BCoder}
import org.apache.beam.sdk.util.CoderUtils

import scala.jdk.CollectionConverters._

private[scio] object AvroBytesUtil {
  val schema: ASchema = {
    val s = ASchema.createRecord("AvroBytesRecord", null, null, false)
    s.setFields(
      List(
        new ASchema.Field(
          "bytes",
          ASchema.create(ASchema.Type.BYTES),
          null,
          null.asInstanceOf[Object]
        )
      ).asJava
    )
    s
  }

  def encode[T](coder: BCoder[T], obj: T): GenericRecord = {
    val bytes = CoderUtils.encodeToByteArray(coder, obj)
    val record = new GenericData.Record(schema)
    record.put("bytes", ByteBuffer.wrap(bytes))
    record
  }

  def decode[T](coder: BCoder[T], record: GenericRecord): T = {
    val bb = record.get("bytes").asInstanceOf[ByteBuffer]
    val bytes =
      java.util.Arrays.copyOfRange(bb.array(), bb.position(), bb.limit())
    CoderUtils.decodeFromByteArray(coder, bytes)
  }
} 
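A minimal round-trip sketch that wraps an arbitrary Beam coder's output in the single-field record above:

import org.apache.beam.sdk.coders.StringUtf8Coder

val coder = StringUtf8Coder.of()
val record = AvroBytesUtil.encode(coder, "hello") // GenericRecord with one "bytes" field
val decoded = AvroBytesUtil.decode(coder, record) // returns "hello"
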
Example 39
Source File: GroupByBenchmark.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.jmh

import com.spotify.scio.{ScioContext, ScioExecutionContext}
import com.spotify.scio.avro._
import com.spotify.scio.coders._
import org.apache.beam.sdk.coders.{KvCoder, Coder => BCoder}
import org.apache.beam.sdk.values.KV
import org.apache.beam.sdk.transforms.GroupByKey
import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory}
import java.util.concurrent.TimeUnit

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.openjdk.jmh.annotations._

import scala.jdk.CollectionConverters._
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
class GroupByBenchmark {
  val schema =
    """
      {
        "type": "record",
        "name": "Event",
        "namespace": "smbjoin",
        "fields": [
          {
            "name": "id",
            "type": "string"
          },
          {
            "name": "value",
            "type": "double"
          }
        ]
      }
    """

  val avroSchema =
    new Schema.Parser().parse(schema)

  private def runWithContext[T](fn: ScioContext => T): ScioExecutionContext = {
    val opts = PipelineOptionsFactory.as(classOf[PipelineOptions])
    val sc = ScioContext(opts)
    fn(sc)
    sc.run()
  }

  val source = "src/test/resources/events-10000-0.avro"
  implicit val coderGenericRecord: Coder[GenericRecord] =
    Coder.avroGenericRecordCoder(avroSchema)

  val charCoder = CoderMaterializer.beamWithDefault(Coder[Char])
  val doubleCoder = CoderMaterializer.beamWithDefault(Coder[Double])
  val kvCoder: BCoder[KV[Char, Double]] = KvCoder.of(charCoder, doubleCoder)

  @Benchmark
  def testScioGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.avroFile(source, schema = avroSchema)
        .map(rec => (rec.get("id").toString.head, rec.get("value").asInstanceOf[Double]))
        .groupByKey
    }

  @Benchmark
  def testBeamGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.wrap {
        sc.avroFile(source, schema = avroSchema)
          .map { rec =>
            KV.of(rec.get("id").toString.head, rec.get("value").asInstanceOf[Double])
          }
          .internal
          .setCoder(kvCoder)
          .apply(GroupByKey.create[Char, Double])
      }.map(kv => (kv.getKey, kv.getValue.asScala))
    }
} 
Example 40
Source File: BigQueryIT.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery

import java.{util => ju}

import com.google.protobuf.ByteString
import com.spotify.scio.avro.types.AvroType
import com.spotify.scio.bigquery.client.BigQuery
import com.spotify.scio.bigquery.Table
import com.spotify.scio.bigquery.TableRow
import com.spotify.scio.coders._
import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryAvroUtilsWrapper
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

object BigQueryIT {
  @AvroType.fromSchema("""{
      | "type":"record",
      | "name":"Account",
      | "namespace":"com.spotify.scio.avro",
      | "doc":"Record for an account",
      | "fields":[
      |   {"name":"id","type":"long"},
      |   {"name":"type","type":"string"},
      |   {"name":"name","type":"string"},
      |   {"name":"amount","type":"double"},
      |   {"name":"secret","type":"bytes"}]}
    """.stripMargin)
  class Account

  implicit def genericCoder = Coder.avroGenericRecordCoder(Account.schema)

}

final class BigQueryIT extends AnyFlatSpec with Matchers {
  import BigQueryIT._

  it should "save avro to BigQuery" in {
    val args = Array(
      "--project=data-integration-test",
      "--tempLocation=gs://data-integration-test-eu/temp"
    )
    val (sc, _) = ContextAndArgs(args)
    val prefix = ju.UUID.randomUUID().toString.replaceAll("-", "")
    val table = Table.Spec(s"data-integration-test:bigquery_avro_it.${prefix}_accounts")

    val data: Seq[GenericRecord] = (1 to 100).map { i =>
      Account.toGenericRecord(
        Account(i, "checking", s"account$i", i.toDouble, ByteString.copyFromUtf8("%20cフーバー"))
      )
    }

    val tap = sc
      .parallelize(data)
      .saveAvroAsBigQuery(
        table.ref,
        Account.schema,
        writeDisposition = WriteDisposition.WRITE_EMPTY,
        createDisposition = CreateDisposition.CREATE_IF_NEEDED
      )

    val result = sc.run().waitUntilDone()

    val ts = BigQuery.defaultInstance().tables.schema(table.ref)
    val expected: Seq[TableRow] = data.map { gr =>
      BigQueryAvroUtilsWrapper.convertGenericRecordToTableRow(gr, ts)
    }

    result.tap(tap).value.toSet shouldEqual expected.toSet
  }

} 
Example 41
Source File: AvroUtils.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.avro

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

import scala.jdk.CollectionConverters._

object AvroUtils {
  private def f(name: String, tpe: Schema.Type) =
    new Schema.Field(
      name,
      Schema.createUnion(List(Schema.create(Schema.Type.NULL), Schema.create(tpe)).asJava),
      null: String,
      null: AnyRef
    )

  private def fArr(name: String, tpe: Schema.Type) =
    new Schema.Field(name, Schema.createArray(Schema.create(tpe)), null: String, null: AnyRef)

  val schema = Schema.createRecord("GenericTestRecord", null, null, false)
  schema.setFields(
    List(
      f("int_field", Schema.Type.INT),
      f("long_field", Schema.Type.LONG),
      f("float_field", Schema.Type.FLOAT),
      f("double_field", Schema.Type.DOUBLE),
      f("boolean_field", Schema.Type.BOOLEAN),
      f("string_field", Schema.Type.STRING),
      fArr("array_field", Schema.Type.STRING)
    ).asJava
  )

  def newGenericRecord(i: Int): GenericRecord = {
    val r = new GenericData.Record(schema)
    r.put("int_field", 1 * i)
    r.put("long_field", 1L * i)
    r.put("float_field", 1f * i)
    r.put("double_field", 1.0 * i)
    r.put("boolean_field", true)
    r.put("string_field", "hello")
    r.put("array_field", List[CharSequence]("a", "b", "c").asJava)
    r
  }

  def newSpecificRecord(i: Int): TestRecord =
    new TestRecord(
      i,
      i.toLong,
      i.toFloat,
      i.toDouble,
      true,
      "hello",
      List[CharSequence]("a", "b", "c").asJava
    )
} 
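A quick usage sketch of the generic helper (each scalar field is declared as a union with NULL, so unset fields simply stay null):

val r = AvroUtils.newGenericRecord(2)
r.get("int_field")    // 2
r.get("string_field") // "hello"
r.getSchema.getName   // "GenericTestRecord"
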
Example 42
Source File: MagnolifyAvroExampleTest.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.examples.extra

import com.spotify.scio.avro.AvroIO
import com.spotify.scio.io._
import com.spotify.scio.testing._
import org.apache.avro.generic.{GenericData, GenericRecord}

class MagnolifyAvroExampleTest extends PipelineSpec {
  import MagnolifyAvroExample._

  val textIn = Seq("a b c d e", "a b a b")
  val wordCount = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
  val records: Seq[GenericRecord] = wordCount.map { kv =>
    val r = new GenericData.Record(wordCountType.schema)
    r.put("word", kv._1)
    r.put("count", kv._2)
    r
  }
  val textOut = wordCount.map(kv => kv._1 + ": " + kv._2)

  "MagnolifyAvroWriteExample" should "work" in {
    JobTest[com.spotify.scio.examples.extra.MagnolifyAvroWriteExample.type]
      .args("--input=in.txt", "--output=wc.avro")
      .input(TextIO("in.txt"), textIn)
      .output(AvroIO[GenericRecord]("wc.avro"))(coll => coll should containInAnyOrder(records))
      .run()
  }

  "MagnolifyAvroReadExample" should "work" in {
    JobTest[com.spotify.scio.examples.extra.MagnolifyAvroReadExample.type]
      .args("--input=wc.avro", "--output=out.txt")
      .input(AvroIO[GenericRecord]("wc.avro"), records)
      .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(textOut))
      .run()
  }
} 
Example 43
Source File: Utils.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.util
import java.util.Comparator

import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.SparkSqlSerializer
import org.apache.spark.sql.types._

import scala.collection.mutable.ArrayBuffer
import scala.math.Ordering

object Utils {

  def setRowCol(
      row: MutableRow,
      field: (Field, Int),
      src: HBaseType,
      offset: Int,
      length: Int): Unit = {
    val index = field._2
    val f = field._1
    if (f.sedes.isDefined) {
      // If we already have sedes defined, use it.
      val m = f.sedes.get.deserialize(src, offset, length)
      row.update(index, m)
    } else if (f.exeSchema.isDefined) {
      // println("avro schema is defined to do deserialization")
      // If we have avro schema defined, use it to get record, and then covert them to catalyst data type
      val m = AvroSedes.deserialize(src, f.exeSchema.get)
      // println(m)
      val n = f.avroToCatalyst.map(_(m))
      row.update(index, n.get)
    } else  {
      // Fall back to atomic type
      f.dt match {
        case BooleanType => row.setBoolean(index, toBoolean(src, offset))
        case ByteType => row.setByte(index, src(offset))
        case DoubleType => row.setDouble(index, Bytes.toDouble(src, offset))
        case FloatType => row.setFloat(index, Bytes.toFloat(src, offset))
        case IntegerType => row.setInt(index, Bytes.toInt(src, offset))
        case LongType => row.setLong(index, Bytes.toLong(src, offset))
        case ShortType => row.setShort(index, Bytes.toShort(src, offset))
        case StringType => row.update(index, toUTF8String(src, offset, length))
        case BinaryType =>
          val newArray = new Array[Byte](length)
          System.arraycopy(src, offset, newArray, 0, length)
          row.update(index, newArray)
        case _ => row.update(index, SparkSqlSerializer.deserialize[Any](src)) //TODO
      }
    }
  }

  // convert input to data type
  def toBytes(input: Any, field: Field): Array[Byte] = {
    if (field.sedes.isDefined) {
      field.sedes.get.serialize(input)
    } else if (field.schema.isDefined) {
      // Here we assume the top level type is structType
      val record = field.catalystToAvro(input)
      AvroSedes.serialize(record, field.schema.get)
    } else {
      input match {
        case data: Boolean => Bytes.toBytes(data)
        case data: Byte => Array(data)
        case data: Array[Byte] => data
        case data: Double => Bytes.toBytes(data)
        case data: Float => Bytes.toBytes(data)
        case data: Int => Bytes.toBytes(data)
        case data: Long => Bytes.toBytes(data)
        case data: Short => Bytes.toBytes(data)
        case data: UTF8String => data.getBytes
        case data: String => Bytes.toBytes(data)
          //Bytes.toBytes(input.asInstanceOf[String])//input.asInstanceOf[UTF8String].getBytes
        case _ => throw new Exception(s"unsupported data type ${field.dt}") //TODO
      }
    }
  }

  def toBoolean(input: HBaseType, offset: Int): Boolean = {
    input(offset) != 0
  }

  def toUTF8String(input: HBaseType, offset: Int, length: Int): UTF8String = {
    UTF8String(input.slice(offset, offset + length))
  }
} 
Example 44
Source File: Sedes.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.io.ByteArrayInputStream

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io._
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._

trait Sedes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

class DoubleSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toLong(bytes, start)
  }
} 
Example 45
Source File: StdAvroModelFactoryTest.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.factory.avro

import com.eharmony.aloha.audit.impl.avro.Score
import com.eharmony.aloha.factory.ModelFactory
import com.eharmony.aloha.io.vfs.Vfs1
import com.eharmony.aloha.models.Model
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.commons.io.IOUtils
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.BlockJUnit4ClassRunner

import scala.util.Try


  private[this] def record = {
    val r = new GenericData.Record(TheSchema)
    r.put("req_str_1", "smart handsome stubborn")
    r
  }
}

object StdAvroModelFactoryTest {
  private lazy val TheSchema = {
    val is = getClass.getClassLoader.getResourceAsStream(SchemaUrlResource)
    try new Schema.Parser().parse(is) finally IOUtils.closeQuietly(is)
  }

  private val ExpectedResult = 7d

  private val SchemaUrlResource = "avro/class7.avpr"

  private val SchemaUrl = s"res:$SchemaUrlResource"

  private val SchemaFile = new java.io.File(getClass.getClassLoader.getResource(SchemaUrlResource).getFile)

  private val SchemaVfs1FileObject = org.apache.commons.vfs.VFS.getManager.resolveFile(SchemaUrl)

  private val SchemaVfs2FileObject = org.apache.commons.vfs2.VFS.getManager.resolveFile(SchemaUrl)

  private val Imports = Seq("com.eharmony.aloha.feature.BasicFunctions._", "scala.math._")

  private val ReturnType = "Double"

  private val ModelJson =
    """
      |{
      |  "modelType": "Regression",
      |  "modelId": { "id": 0, "name": "" },
      |  "features" : {
      |    "my_attributes": "${req_str_1}.split(\"\\\\W+\").map(v => (s\"=$v\", 1.0))"
      |  },
      |  "weights": {
      |    "my_attributes=handsome": 1,
      |    "my_attributes=smart": 2,
      |    "my_attributes=stubborn": 4
      |  }
      |}
    """.stripMargin
} 
Example 46
Source File: ImplicitsTest.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.audit.impl.avro

import com.google.common.collect.Lists
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.BlockJUnit4ClassRunner

import scala.collection.JavaConverters.seqAsJavaListConverter
import com.eharmony.aloha.audit.impl.avro.Implicits.{RichFlatScore, RichScore}
import java.{lang => jl, util => ju}

import org.apache.avro.generic.GenericRecord


  @Test def testAllFieldsAppear(): Unit = {
    val s = filledInScore
    assertEquals(s, s.toFlatScore.toScore)
  }

  @Test def testSameFieldsInGenericRecord(): Unit = {
    val s = filledInScore
    val s1 = s.asInstanceOf[GenericRecord]
    val s2 = s.toFlatScore.asInstanceOf[GenericRecord]

    testStuff(s1, s2, Map(
      "model" -> modelId,
      "value" -> value,
      "errorMsgs" -> errors,
      "missingVarNames" -> missing,
      "prob" -> prob
    ))
  }

  private[this] def testStuff(r1: GenericRecord, r2: GenericRecord, data: Map[String, Any]): Unit = {
    data.foreach { case (k, v) =>
      val v1 = r1.get(k)
      val v2 = r2.get(k)
      assertEquals(s"for r1('$k') = $v1.  Expected $v", v, r1.get(k))
      assertEquals(s"for r2('$k') = $v2.  Expected $v", v, r2.get(k))
    }
  }
}


object ImplicitsTest {
  private def filledInScore = new Score(modelId, value, subvalues, errors, missing, prob)
  private def modelId = new ModelId(5L, "five")
  private def value: jl.Double = 13d
  private def subvalues = Lists.newArrayList(scr(12L, 8))
  private def errors: ju.List[CharSequence] = Lists.newArrayList("one error", "two errors")
  private def missing: ju.List[CharSequence] =
    Lists.newArrayList("some feature", "another feature", "yet another feature")
  private def prob: jl.Float = 1f

  private lazy val score: Score =
    scr(1, 1,
      scr(2L, 2,
        scr(4f, 4),
        scr(5,  5)
      ),
      scr(3d, 3,
        scr(6d, 6),
        scr(7L, 7)
      )
    )

  private lazy val irregularTree: Score =
    scr(1, 1,
      scr(2L, 2),
      scr(3d, 3,
        scr(5d, 5),
        scr(6L, 6)
      ),
      scr(4d, 4,
        scr(7L, 7)
      )
    )

  private[this] def scr(value: Any, id: Long, children: Score*): Score = {
    new Score(
      new ModelId(id, ""),
      value,
      Lists.newArrayList(children.asJava),
      java.util.Collections.emptyList(),
      java.util.Collections.emptyList(),
      null
    )
  }
} 
Example 47
Source File: AvroDataInputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.InputStream

import org.apache.avro.Schema
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.io.DatumReader

import scala.util.Try

class AvroDataInputStream[T](in: InputStream,
                             writerSchema: Option[Schema])
                            (implicit decoder: Decoder[T]) extends AvroInputStream[T] {

  val resolved = decoder.resolveDecoder()

  // if no reader or writer schema is specified, then we create a reader that uses what's present in the files
  private val datumReader = writerSchema match {
    case Some(writer) => GenericData.get.createDatumReader(writer, resolved.schema)
    case None => GenericData.get.createDatumReader(null, resolved.schema)
  }

  private val dataFileReader = new DataFileStream[GenericRecord](in, datumReader.asInstanceOf[DatumReader[GenericRecord]])

  override def iterator: Iterator[T] = new Iterator[T] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): T = {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def tryIterator: Iterator[Try[T]] = new Iterator[Try[T]] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): Try[T] = Try {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def close(): Unit = in.close()
} 
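A minimal read sketch, assuming bytes were produced by avro4s's data-file writer (as in the GithubIssue191 test above) and that a Decoder for the case class can be derived:

import java.io.ByteArrayInputStream

case class User(name: String, age: Int) // hypothetical type mirroring the data that was written

val in = new AvroDataInputStream[User](new ByteArrayInputStream(bytes), writerSchema = None)
val users: List[User] = in.iterator.toList
in.close()
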
Example 48
Source File: AvroRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.commons.lang3.StringUtils


case class AvroRecord(
    destination: String,
    schema: Schema,
    key: String,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[String, GenericRecord]

object AvroRecord {

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      json: String,
      ackStrategy: AckStrategy,
      useStrictValidation: Boolean = false
  ): AvroRecord = {

    val payload: GenericRecord = {
      val converter: JsonConverter[GenericRecord] =
        new JsonConverter[GenericRecord](schema, useStrictValidation)
      converter.convert(json)
    }

    AvroRecord(destination, schema, key.orNull, payload, ackStrategy)
  }

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      record: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroRecord = {
    AvroRecord(destination, schema, key.orNull, record, ackStrategy)
  }
} 
Example 49
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 50
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]()
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 51
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 52
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 53
Source File: SpecificDefaultValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === "SPADES"
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Array(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 54
Source File: BytesWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._

// Note: the original class declaration and its prepare/execute/declareOutputFields methods are
// truncated in this listing; a plausible header, assumed from the imports and the sibling bolts
// below, is restored so the helper methods have an enclosing class.
class BytesWithSchemaToObject extends BaseRichBolt {
  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
Example 55
Source File: NiFiPacketWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.nifi.storm.NiFiDataPacket
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._


class NiFiPacketWithSchemaToObject extends BaseRichBolt {

  private lazy val log = Logger(this.getClass)
  private var outputCollector: OutputCollector = _

  // Declare schema-related fields to be initialized when this component's prepare() method is called
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var deserializer: AvroSnapshotDeserializer = _
  private var truckDataSchemaMetadata: SchemaMetadata = _
  private var trafficDataSchemaMetadata: SchemaMetadata = _

  override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = {

    outputCollector = collector

    val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString
    val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl)

    schemaRegistryClient = new SchemaRegistryClient(clientConfig)
    truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata
    trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata
    deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer]
    deserializer.init(clientConfig)
  }

  override def execute(tuple: Tuple): Unit = {
    val dp = tuple.getValueByField("nifiDataPacket").asInstanceOf[NiFiDataPacket]

    // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData)
    val (dataType, data) = dp.getAttributes.get("dataType") match {
      case typ @ "EnrichedTruckData" => (typ, recordToEnrichedTruckData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record]))
      case typ @ "TrafficData" => (typ, recordToTrafficData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record]))
    }

    outputCollector.emit(new Values(data, dataType))
    outputCollector.ack(tuple)
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType"))

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
Example 56
Source File: SerializedWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._


class SerializedWithSchemaToObject extends BaseRichBolt {

  private lazy val log = Logger(this.getClass)
  private var outputCollector: OutputCollector = _

  // Declare schema-related fields to be initialized when this component's prepare() method is called
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var deserializer: AvroSnapshotDeserializer = _
  private var truckDataSchemaMetadata: SchemaMetadata = _
  private var trafficDataSchemaMetadata: SchemaMetadata = _

  override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = {

    outputCollector = collector

    val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString
    val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl)

    schemaRegistryClient = new SchemaRegistryClient(clientConfig)
    truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata
    trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata
    deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer]
    deserializer.init(clientConfig)
  }

  override def execute(tuple: Tuple): Unit = {

    // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData)
    val str = tuple.getStringByField("data").getBytes(StandardCharsets.UTF_8)
    log.info(s"str2: ${tuple.getStringByField("data")}")
    val bytes = new ByteArrayInputStream(str)
    log.info(s"bytes: $bytes")
    val (dataType, data) = tuple.getStringByField("dataType") match {
      case typ @ "EnrichedTruckData" =>
        // Deserialize once: the ByteArrayInputStream cannot be read a second time.
        val record = deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record]
        log.info(s"des: $record")
        (typ, recordToEnrichedTruckData(record))
      case typ @ "TrafficData" =>
        val record = deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record]
        log.info(s"des: $record")
        (typ, recordToTrafficData(record))
    }

    outputCollector.emit(new Values(data, dataType))
    outputCollector.ack(tuple)
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType"))

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
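For orientation, here is a minimal, hypothetical sketch of wiring this bolt into a Storm topology. The spout id, bolt id, registry URL, and parallelism are illustrative assumptions and not part of the original project; a real topology would also register the upstream spout before submission.

import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.orendainx.trucking.storm.bolts.SerializedWithSchemaToObject
import org.apache.storm.Config
import org.apache.storm.topology.TopologyBuilder

object SerializedWithSchemaTopologySketch extends App {
  val builder = new TopologyBuilder()

  // "serializedDataSpout" is a hypothetical upstream component emitting "data"/"dataType" string fields.
  builder.setBolt("serializedWithSchemaToObject", new SerializedWithSchemaToObject(), 1)
    .shuffleGrouping("serializedDataSpout")

  // The bolt looks up the Schema Registry URL from the topology config in prepare().
  val stormConf = new Config()
  stormConf.put(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name(), "http://localhost:7788/api/v1")

  // Submit with StormSubmitter or LocalCluster once the spout is registered.
}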
Example 57
Source File: MetadataAlgebraSpec.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.algebras

import java.time.Instant

import cats.data.NonEmptyList
import cats.effect.{Concurrent, ContextShift, IO, Sync, Timer}
import cats.implicits._
import hydra.avro.registry.SchemaRegistry
import hydra.core.marshallers.History
import hydra.kafka.algebras.MetadataAlgebra.TopicMetadataContainer
import hydra.kafka.model.ContactMethod.Slack
import hydra.kafka.model.TopicMetadataV2Request.Subject
import hydra.kafka.model.{Public, StreamTypeV2, TopicMetadataV2, TopicMetadataV2Key, TopicMetadataV2Request, TopicMetadataV2Value}
import io.chrisdavenport.log4cats.SelfAwareStructuredLogger
import io.chrisdavenport.log4cats.slf4j.Slf4jLogger
import org.apache.avro.generic.GenericRecord
import org.scalatest.Assertion
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpecLike
import retry.RetryPolicies._
import retry.syntax.all._
import retry.{RetryPolicy, _}

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._

class MetadataAlgebraSpec extends AnyWordSpecLike with Matchers {

  implicit private val contextShift: ContextShift[IO] = IO.contextShift(ExecutionContext.global)
  private implicit val concurrentEffect: Concurrent[IO] = IO.ioConcurrentEffect

  private implicit val policy: RetryPolicy[IO] = limitRetries[IO](5) |+| exponentialBackoff[IO](500.milliseconds)
  private implicit val timer: Timer[IO] = IO.timer(ExecutionContext.global)
  private implicit def noop[A]: (A, RetryDetails) => IO[Unit] = retry.noop[IO, A]

  implicit private def unsafeLogger[F[_]: Sync]: SelfAwareStructuredLogger[F] =
    Slf4jLogger.getLogger[F]

  private implicit class RetryAndAssert[A](boolIO: IO[A]) {
    def retryIfFalse(check: A => Boolean): IO[Assertion] =
      boolIO.map(check).retryingM(identity, policy, noop).map(assert(_))
  }


  private val metadataTopicName = "_internal.metadataTopic"
  private val consumerGroup = "Consumer Group"

  (for {
    kafkaClient <- KafkaClientAlgebra.test[IO]
    schemaRegistry <- SchemaRegistry.test[IO]
    metadata <- MetadataAlgebra.make(metadataTopicName, consumerGroup, kafkaClient, schemaRegistry, consumeMetadataEnabled = true)
  } yield {
    runTests(metadata, kafkaClient)
  }).unsafeRunSync()

  private def runTests(metadataAlgebra: MetadataAlgebra[IO], kafkaClientAlgebra: KafkaClientAlgebra[IO]): Unit = {
    "MetadataAlgebraSpec" should {

      "retrieve none for non-existant topic" in {
        val subject = Subject.createValidated("Non-existantTopic").get
        metadataAlgebra.getMetadataFor(subject).unsafeRunSync() shouldBe None
      }

      "retrieve metadata" in {
        val subject = Subject.createValidated("subject1").get
        val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject)

        (for {
          record <- genericRecordsIO
          _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName)
          _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined)
          metadata <- metadataAlgebra.getMetadataFor(subject)
        } yield metadata shouldBe Some(TopicMetadataContainer(key, value, None, None))).unsafeRunSync()
      }

      "retrieve all metadata" in {
        val subject = Subject.createValidated("subject2").get
        val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject)
        (for {
          record <- genericRecordsIO
          _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName)
          _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined)
          allMetadata <- metadataAlgebra.getAllMetadata
        } yield allMetadata should have length 2).unsafeRunSync()
      }
    }
  }

  private def getMetadataGenericRecords(subject: Subject): (IO[(GenericRecord, Option[GenericRecord])], TopicMetadataV2Key, TopicMetadataV2Value) = {
    val key = TopicMetadataV2Key(subject)
    val value = TopicMetadataV2Value(
        StreamTypeV2.Entity,
        deprecated = false,
        Public,
        NonEmptyList.one(Slack.create("#channel").get),
        Instant.now,
        List(),
        None)
    (TopicMetadataV2.encode[IO](key, Some(value)), key, value)
  }
} 
Example 58
Source File: AvroKeyRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

final case class AvroKeyRecord(
    destination: String,
    keySchema: Schema,
    valueSchema: Schema,
    key: GenericRecord,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[GenericRecord, GenericRecord]

object AvroKeyRecord {

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      keyJson: String,
      valueJson: String,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {

    val (key, value): (GenericRecord, GenericRecord) = {
      val keyConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](keySchema).convert
      val valueConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](valueSchema).convert
      (keyConverter(keyJson), valueConverter(valueJson))
    }

    AvroKeyRecord(destination, keySchema, valueSchema, key, value, ackStrategy)
  }

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      key: GenericRecord,
      value: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {
    new AvroKeyRecord(
      destination,
      keySchema,
      valueSchema,
      key,
      value,
      ackStrategy
    )
  }
} 
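A minimal usage sketch of the JSON-based factory method above. The topic name, schemas, and JSON payloads are illustrative assumptions; AckStrategy.Replicated is one of the strategies used elsewhere in this project, and the payloads are assumed to match the schemas so that JsonConverter succeeds.

import hydra.core.transport.AckStrategy
import hydra.kafka.producer.AvroKeyRecord
import org.apache.avro.SchemaBuilder

object AvroKeyRecordSketch extends App {
  // Hypothetical key/value schemas, only for illustration.
  val keySchema = SchemaBuilder.record("ExampleKey").fields().requiredString("id").endRecord()
  val valueSchema = SchemaBuilder.record("ExampleValue").fields().requiredString("name").endRecord()

  val record = AvroKeyRecord(
    destination = "example-topic",
    keySchema = keySchema,
    valueSchema = valueSchema,
    keyJson = """{"id": "123"}""",
    valueJson = """{"name": "jane"}""",
    ackStrategy = AckStrategy.Replicated
  )

  // Both key and payload are now GenericRecords produced by JsonConverter.
  println(record.key.get("id"))
  println(record.payload.get("name"))
}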
Example 59
Source File: AvroRecordFactory.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import akka.actor.ActorRef
import akka.pattern.ask
import akka.util
import com.pluralsight.hydra.avro.JsonConverter
import hydra.avro.registry.ConfluentSchemaRegistry
import hydra.avro.resource.SchemaResource
import hydra.avro.util.AvroUtils
import hydra.common.config.ConfigSupport
import hydra.common.logging.LoggingAdapter
import hydra.core.akka.SchemaRegistryActor.{FetchSchemaRequest, FetchSchemaResponse}
import hydra.core.ingest.HydraRequest
import hydra.core.transport.ValidationStrategy.Strict
import org.apache.avro.generic.GenericRecord

import scala.concurrent.duration._
import scala.concurrent.{ExecutionContext, Future}


class AvroRecordFactory(schemaResourceLoader: ActorRef)
    extends KafkaRecordFactory[String, GenericRecord]
    with ConfigSupport with LoggingAdapter {

  private implicit val timeout = util.Timeout(3.seconds)

  override def build(
      request: HydraRequest
  )(implicit ec: ExecutionContext): Future[AvroRecord] = {
    for {
      (topic, subject) <- Future.fromTry(getTopicAndSchemaSubject(request))
      schemaResource <- (schemaResourceLoader ? FetchSchemaRequest(subject))
        .mapTo[FetchSchemaResponse]
        .map(_.schemaResource)
      record <- convert(schemaResource, request)
    } yield AvroRecord(
      topic,
      schemaResource.schema,
      getKey(request, record),
      record,
      request.ackStrategy
    )
  }

  private def convert(schemaResource: SchemaResource, request: HydraRequest)(
      implicit ec: ExecutionContext
  ): Future[GenericRecord] = {
    val converter = new JsonConverter[GenericRecord](
      schemaResource.schema,
      request.validationStrategy == Strict
    )
    Future(converter.convert(request.payload)).recover {
      case ex => throw AvroUtils.improveException(ex, schemaResource,
        ConfluentSchemaRegistry.registryUrl(applicationConfig))
    }
  }
} 
Example 60
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 61
Source File: KafkaRecordFactory.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.fasterxml.jackson.databind.JsonNode
import hydra.avro.util.SchemaWrapper
import hydra.core.ingest.RequestParams._
import hydra.core.ingest.{HydraRequest, RequestParams}
import hydra.core.protocol.MissingMetadataException
import hydra.core.transport.RecordFactory
import hydra.kafka.producer.KafkaRecordFactory.RecordKeyExtractor
import org.apache.avro.generic.GenericRecord

import scala.util.{Failure, Success, Try}

// Note: the enclosing trait declaration and its other members (e.g. the key-extraction helper
// that uses RecordKeyExtractor) are truncated in this listing; a plausible header, assumed from
// the imports, is restored so the method below has an enclosing scope.
trait KafkaRecordFactory[K, V] extends RecordFactory[K, V] {
  def getTopicAndSchemaSubject(request: HydraRequest): Try[(String, String)] = {
    val subject = request.metadataValue(RequestParams.HYDRA_SCHEMA_PARAM)
    request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match {
      case Some(topic) => Success(topic -> subject.getOrElse(topic))
      case None =>
        Failure(
          MissingMetadataException(
            HYDRA_KAFKA_TOPIC_PARAM,
            "No kafka topic present in the request."
          )
        )
    }
  }
}

object KafkaRecordFactory {

  trait RecordKeyExtractor[K, V] {

    def extractKeyValue(request: HydraRequest, record: V): Option[K]
  }

  object RecordKeyExtractor {

    implicit object StringRecordKeyExtractor
        extends RecordKeyExtractor[String, String] {

      override def extractKeyValue(
          request: HydraRequest,
          record: String
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map(key => JsonPathKeys.getKey(key, record))
      }
    }

    implicit object JsonRecordKeyExtractor
        extends RecordKeyExtractor[String, JsonNode] {

      override def extractKeyValue(
          request: HydraRequest,
          record: JsonNode
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map(key => JsonPathKeys.getKey(key, record.toString))
      }
    }

    implicit object SchemaKeyExtractor
        extends RecordKeyExtractor[String, GenericRecord] {

      override def extractKeyValue(
          request: HydraRequest,
          payload: GenericRecord
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map { key => JsonPathKeys.getKey(key, request.payload) }
          .orElse {
            val schema = payload.getSchema
            val wrapper = SchemaWrapper.from(schema)
            wrapper
              .validate()
              .get //we're throwing the exception here so that the request ends with a 400
            wrapper.primaryKeys.map(payload.get) match {
              case Nil  => None
              case keys => Some(keys.mkString("|"))
            }
          }
      }
    }

  }

} 
Example 62
Source File: IngestionFlowV2.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.transport.ValidationStrategy
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.algebras.KafkaClientAlgebra.PublishResponse
import hydra.kafka.model.TopicMetadataV2Request.Subject
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Try}

final class IngestionFlowV2[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String) {

  import IngestionFlowV2._
  import hydra.avro.convert.StringToGenericRecord._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getSchema(subject: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(subject)
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(subject)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, subject))
      }
  }

  private def getSchemaWrapper(subject: Subject, isKey: Boolean): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    val suffix = if (isKey) "-key" else "-value"
    getSchema(subject.value + suffix).map { sch =>
      SchemaWrapper.from(sch)
    }
  }

  private def recover[A](subject: Subject, isKey: Boolean): PartialFunction[Throwable, Try[A]] = {
    val suffix = if (isKey) "-key" else "-value"
    val location = s"$schemaRegistryBaseUrl/subjects/${subject.value}$suffix/versions/latest/schema"
    val pf: PartialFunction[Throwable, Try[A]] = {
      case e: ValidationExtraFieldsError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: InvalidLogicalTypeError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e => Failure(e)
    }
    pf
  }

  private def getSchemas(request: V2IngestRequest, topic: Subject): F[(GenericRecord, Option[GenericRecord])] = {
    val useStrictValidation = request.validationStrategy.getOrElse(ValidationStrategy.Strict) == ValidationStrategy.Strict
    def getRecord(payload: String, schema: Schema): Try[GenericRecord] =
      payload.toGenericRecord(schema, useStrictValidation)
    for {
      kSchema <- getSchemaWrapper(topic, isKey = true)
      vSchema <- getSchemaWrapper(topic, isKey = false)
      k <- MonadError[F, Throwable].fromTry(
        getRecord(request.keyPayload, kSchema.schema).recoverWith(recover(topic, isKey = true)))
      v <- MonadError[F, Throwable].fromTry(
        request.valPayload.traverse(getRecord(_, vSchema.schema)).recoverWith(recover(topic, isKey = false)))
    } yield (k, v)
  }

  def ingest(request: V2IngestRequest, topic: Subject): F[PublishResponse] = {
    getSchemas(request, topic).flatMap { case (key, value) =>
      kafkaClient.publishMessage((key, value), topic.value).rethrow
    }
  }
}

object IngestionFlowV2 {
  final case class V2IngestRequest(keyPayload: String, valPayload: Option[String], validationStrategy: Option[ValidationStrategy])

  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 63
Source File: IngestionFlow.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import com.pluralsight.hydra.avro.JsonToAvroConversionException
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.ingest.HydraRequest
import hydra.core.ingest.RequestParams.{HYDRA_KAFKA_TOPIC_PARAM, HYDRA_RECORD_KEY_PARAM}
import hydra.core.transport.{AckStrategy, ValidationStrategy}
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.producer.AvroRecord
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

final class IngestionFlow[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String
                                                                  ) {

  import IngestionFlow._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getValueSchema(topicName: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(topicName + "-value")
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(topicName)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, topicName))
      }
  }

  private def getValueSchemaWrapper(topicName: String): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    getValueSchema(topicName).map { valueSchema =>
      SchemaWrapper.from(valueSchema)
    }
  }

  def ingest(request: HydraRequest): F[Unit] = {
    request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match {
      case Some(topic) => getValueSchemaWrapper(topic).flatMap { schemaWrapper =>
        val useStrictValidation = request.validationStrategy == ValidationStrategy.Strict
        val payloadTryMaybe: Try[Option[GenericRecord]] = Option(request.payload) match {
          case Some(p) => convertToAvro(topic, schemaWrapper, useStrictValidation, p).map(avroRecord => Some(avroRecord.payload))
          case None => Success(None)
        }
        val v1Key = getV1RecordKey(schemaWrapper, payloadTryMaybe, request)
        MonadError[F, Throwable].fromTry(payloadTryMaybe).flatMap { payloadMaybe =>
          kafkaClient.publishStringKeyMessage((v1Key, payloadMaybe), topic).void
        }
      }
      case None => MonadError[F, Throwable].raiseError(MissingTopicNameException(request))
    }
  }

  private def getV1RecordKey(schemaWrapper: SchemaWrapper, payloadTryMaybe: Try[Option[GenericRecord]], request: HydraRequest): Option[String] = {
    val headerV1Key = request.metadata.get(HYDRA_RECORD_KEY_PARAM)
    val optionString = schemaWrapper.primaryKeys.toList match {
      case Nil => None
      case l => l.flatMap(pkName => payloadTryMaybe match {
        case Success(payloadMaybe) =>
          payloadMaybe.flatMap(p => Try(p.get(pkName)).toOption)
        case Failure(_) => None
      }).mkString("|").some
    }
    headerV1Key.orElse(optionString)
  }

  private def convertToAvro(topic: String, schemaWrapper: SchemaWrapper, useStrictValidation: Boolean, payloadString: String): Try[AvroRecord] = {
    Try(AvroRecord(topic, schemaWrapper.schema, None, payloadString, AckStrategy.Replicated, useStrictValidation)).recoverWith {
      case e: JsonToAvroConversionException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getMessage} [$location]"))
      case e => Failure(e)
    }
  }
}

object IngestionFlow {
  final case class MissingTopicNameException(request: HydraRequest)
    extends Exception(s"Missing the topic name in request with correlationId ${request.correlationId}")
  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 64
Source File: StringToGenericRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.convert

import java.util.UUID

import org.apache.avro.{LogicalTypes, Schema}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import cats.implicits._
import org.apache.avro.util.Utf8

import scala.util.{Failure, Success, Try}

object StringToGenericRecord {

  final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException(
    s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy"
  )

  final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException(
    s"Invalid logical type. Expected $expected but received $received"
  )

  implicit class ConvertToGenericRecord(s: String) {

    private def isUuidValid(s: String): Boolean =
      Try(UUID.fromString(s)).isSuccess

    private def checkLogicalTypes(record: GenericRecord): Try[Unit] = {
      import collection.JavaConverters._
      def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] = avroField match {
        case g: GenericRecord => g.getSchema.getFields.asScala.toList
          .traverse(f => checkAll(g.get(f.name), f.schema.some)).void
        case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) =>
          if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString))
        case _ => Success(())
      }
      val fields = record.getSchema.getFields.asScala.toList
      fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void
    }

    private def getAllPayloadFieldNames: Set[String] = {
      import spray.json._
      def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match {
        case JsObject(f) => f.flatMap { case (k: String, v: JsValue) =>
          loop(v, k.some) ++ Set(extraName.getOrElse("") + k)
        }.toSet
        case _ => Set.empty
      }
      loop(s.parseJson, None)
    }

    private def getAllSchemaFieldNames(schema: Schema): Set[String] = {
      import Schema.Type._
      import collection.JavaConverters._
      def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match {
        case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field =>
          loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name)
        }
        case _ => Set.empty
      }
      loop(schema, None)
    }

    def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try {
      if (useStrictValidation) {
        val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema)
        if (diff.nonEmpty) throw ValidationExtraFieldsError(diff)
      }
      val decoderFactory = new DecoderFactory
      val decoder = decoderFactory.jsonDecoder(schema, s)
      val reader = new GenericDatumReader[GenericRecord](schema)
      reader.read(null, decoder)
    }.flatTap(checkLogicalTypes)
  }

} 
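A minimal sketch of the implicit conversion above in use. The schema and JSON payload are illustrative assumptions; toGenericRecord returns a Try, so failures such as extra fields under strict validation surface as a Failure.

import hydra.avro.convert.StringToGenericRecord._
import org.apache.avro.SchemaBuilder

object StringToGenericRecordSketch extends App {
  // Hypothetical schema, only for illustration.
  val schema = SchemaBuilder.record("Person").fields()
    .requiredString("name")
    .requiredLong("age")
    .endRecord()

  val json = """{"name": "jane", "age": 30}"""

  // Strict validation rejects payload fields that do not appear in the schema.
  val result = json.toGenericRecord(schema, useStrictValidation = true)
  result.foreach(record => println(record.get("name")))
}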
Example 65
Source File: AvroParquetReaderFnTest.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet

import java.util.UUID

import io.eels.component.avro.AvroSchemaFns
import io.eels.component.parquet.avro.AvroParquetReaderFn
import io.eels.schema.{DoubleType, Field, LongType, StructType}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec}

class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(new Configuration())

  private val path = new Path(UUID.randomUUID().toString())

  override def afterAll(): Unit = {
    val fs = FileSystem.get(new Configuration())
    fs.delete(path, false)
  }

  private val avroSchema = SchemaBuilder.record("com.chuckle").fields()
    .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord()

  private val writer = AvroParquetWriter.builder[GenericRecord](path)
    .withSchema(avroSchema)
    .build()

  private val record = new GenericData.Record(avroSchema)
  record.put("str", "wibble")
  record.put("looong", 999L)
  record.put("dooble", 12.34)
  writer.write(record)
  writer.close()

  val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true))

  "AvroParquetReaderFn" should {
    "support projections on doubles" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong"))))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("dooble") shouldBe 12.34
    }
    "support projections on longs" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str"))))
      val record = reader.read()
      reader.close()

      record.get("looong") shouldBe 999L
    }
    "support full projections" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema)))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("looong") shouldBe 999L
      record.get("dooble") shouldBe 12.34

    }
    "support non projections" in {

      val reader = AvroParquetReaderFn(path, None, None)
      val group = reader.read()
      reader.close()

      group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      group.get("looong") shouldBe 999L
      group.get("dooble") shouldBe 12.34

    }
  }
} 
Example 66
Source File: AvroParquetReaderFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import io.eels.Predicate
import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.hadoop.ParquetReader

// Note: the enclosing object declaration and its reader config value are truncated in this
// listing; a plausible reconstruction, assumed from the imports and the `config` reference below:
object AvroParquetReaderFn {

  private val config = ParquetReaderConfig()
  def apply(path: Path,
            predicate: Option[Predicate],
            projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = {

    // The parquet reader can use a projection by setting a projected schema onto a conf object
    def configuration(): Configuration = {
      val newconf = new Configuration(conf)
      projectionSchema.foreach { it =>
        AvroReadSupport.setAvroReadSchema(newconf, it)
        AvroReadSupport.setRequestedProjection(newconf, it)
      }
      //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true")
      newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString)
      newconf
    }

    // a filter is set when we have a predicate for the read
    def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build)
      .map(FilterCompat.get)
      .getOrElse(FilterCompat.NOOP)

    AvroParquetReader.builder[GenericRecord](path)
      .withCompatibility(false)
      .withConf(configuration())
      .withFilter(filter())
      .build()
      .asInstanceOf[ParquetReader[GenericRecord]]
  }
} 
Example 67
Source File: AvroParquetWriterFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}


object AvroParquetWriterFn extends Logging {
  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
} 
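A short sketch of calling the helper above. The schema, record values, and output path are illustrative assumptions; the writer is created in CREATE mode, so the target file must not already exist.

import io.eels.component.parquet.avro.AvroParquetWriterFn
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path

object AvroParquetWriterFnSketch extends App {
  val schema = SchemaBuilder.record("person").fields().requiredString("name").endRecord()

  val writer = AvroParquetWriterFn(new Path("person.pq"), schema)
  val record = new GenericData.Record(schema)
  record.put("name", "grace hopper")
  writer.write(record)
  writer.close()
}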
Example 68
Source File: AvroParquetRowWriter.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}


class AvroParquetRowWriter(path: Path,
                           avroSchema: Schema)(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val skipCrc = config.getBoolean("eel.parquet.skipCrc")
  logger.info(s"Parquet writer will skipCrc = $skipCrc")

  private val writer = AvroParquetWriterFn(path, avroSchema)

  def write(record: GenericRecord): Unit = {
    writer.write(record)
  }

  def close(): Unit = {
    writer.close()
    if (skipCrc) {
      val crc = new Path("." + path.toString() + ".crc")
      logger.debug("Deleting crc $crc")
      if (fs.exists(crc))
        fs.delete(crc, false)
    }
  }
} 
Example 69
Source File: AvroWriter.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.avro

import java.io.OutputStream
import java.util.concurrent.atomic.AtomicInteger

import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic
import org.apache.avro.generic.GenericRecord

class AvroWriter(structType: StructType, out: OutputStream) {
  
  private val schema = AvroSchemaFns.toAvroSchema(structType)
  private val datumWriter = new generic.GenericDatumWriter[GenericRecord](schema)
  private val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
  private val serializer = new RowSerializer(schema)
  private val _records = new AtomicInteger(0)

  dataFileWriter.create(schema, out)

  def write(row: Row): Unit = {
    val record = serializer.serialize(row)
    dataFileWriter.append(record)
    _records.incrementAndGet()
  }

  def records: Int = _records.get()

  def close(): Unit = {
    dataFileWriter.flush()
    dataFileWriter.close()
  }
} 
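A minimal sketch of writing rows with the class above, assuming eel's reference configuration is on the classpath. The schema, row values, and in-memory output stream are illustrative assumptions.

import java.io.ByteArrayOutputStream

import io.eels.Row
import io.eels.component.avro.AvroWriter
import io.eels.schema.{Field, StringType, StructType}

object AvroWriterSketch extends App {
  val structType = StructType(Field("name", StringType), Field("job", StringType))
  val out = new ByteArrayOutputStream()

  val writer = new AvroWriter(structType, out)
  writer.write(Row(structType, Vector("grace hopper", "engineer")))
  writer.close()

  println(s"wrote ${writer.records} record(s), ${out.size()} bytes")
}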
Example 70
Source File: AvroDeserializer.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.avro

import com.typesafe.config.ConfigFactory
import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.Schema.Field
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8

import scala.collection.JavaConverters._


class AvroDeserializer(useJavaString: Boolean = ConfigFactory.load().getBoolean("eel.avro.java.string")) {

  val config = ConfigFactory.load()
  val deserializeAsNullable = config.getBoolean("eel.avro.deserializeAsNullable")
  var schema: StructType = null
  var fields: Array[Field] = null
  var range: Range = null

  def toScala(value: Any): Any = {
    value match {
      case record: GenericRecord => toValues(record)
      case utf8: Utf8 if useJavaString => utf8.toString
      case col: java.util.Collection[Any] => col.asScala.toVector.map(toScala)
      case map: java.util.Map[_, _] => map.asScala.toMap.map { case (k, v) => toScala(k) -> toScala(v) }
      case other => other
    }
  }

  def toValues(record: GenericRecord): Vector[Any] = {
    val vector = Vector.newBuilder[Any]
    for (k <- 0 until record.getSchema.getFields.size) {
      val value = record.get(k)
      vector += toScala(value)
    }
    vector.result
  }

  def toRow(record: GenericRecord): Row = {
    // take the schema from the first record
    if (schema == null) {
      schema = AvroSchemaFns.fromAvroSchema(record.getSchema, deserializeAsNullable)
    }
    Row(schema, toValues(record))
  }
} 
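A minimal sketch of converting a GenericRecord into an eel Row with the class above, assuming eel's reference configuration (which supplies the eel.avro.* keys) is on the classpath. The schema and values are illustrative assumptions.

import io.eels.component.avro.AvroDeserializer
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData

object AvroDeserializerSketch extends App {
  val avroSchema = SchemaBuilder.record("person").fields()
    .requiredString("name")
    .requiredString("job")
    .endRecord()

  val record = new GenericData.Record(avroSchema)
  record.put("name", "grace hopper")
  record.put("job", "engineer")

  // The schema of the first record seen is reused for all subsequent rows.
  val deserializer = new AvroDeserializer()
  val row = deserializer.toRow(record)
  println(row.values) // Vector(grace hopper, engineer)
}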
Example 71
Source File: IndexWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder}
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.LogManager
import org.joda.time.format.ISODateTimeFormat

import scala.util.control.NonFatal


case class IndexWithKeyFields(uuid: String,
                              lastModified: java.sql.Timestamp,
                              path: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "lastModified" => java.lang.Long.valueOf(lastModified.getTime)
    case "path" => path
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => java.lang.Long.valueOf(lastModified.getTime)
    case 2 => path
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithSystemFields.schema

  override def csv: String =
    (if (uuid == null) "" else uuid) + "," +
      (if (lastModified == null) "" else ISODateTimeFormat.dateTime.print(lastModified.getTime)) + "," +
      (if (path == null) "" else path)
}

object IndexWithKeyFields extends ObjectExtractor[IndexWithKeyFields] {

  private val logger = LogManager.getLogger(IndexWithSystemFields.getClass)

  // AVRO-2065 - doesn't allow union over logical type, so we can't make timestamp column nullable.
  val timestampMilliType: Schema = LogicalTypes.timestampMillis.addToSchema(Schema.create(Schema.Type.LONG))

  val schema: Schema = SchemaBuilder
    .record("IndexWithSystemFields").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("lastModified").`type`(timestampMilliType).noDefault
    .name("path").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-uuid-lastModified-path")

  def includeFields: String = {
    // Note that 'quad' is not included in this list
    val fields = "uuid,lastModified,path"
      .split(",")
      .map(name => s""""system.$name"""")
      .mkString(",")

    s""""_source": [$fields]"""
  }

  def extractFromJson(hit: JsonNode): IndexWithKeyFields = {

    val system = hit.findValue("_source").findValue("system")

    def extractString(name: String): String = system.findValue(name) match {
      case x: JsonNode => x.asText
      case _ => null
    }

    // Extract date values as java.sql.Timestamp - a java.sql.Date might be better
    def extractDate(name: String): java.sql.Timestamp = system.findValue(name) match {
      case x: JsonNode =>
        try {
          new java.sql.Timestamp(ISODateTimeFormat.dateTime.parseDateTime(x.asText).getMillis)
        }
        catch {
          case NonFatal(ex) =>
            logger.warn(s"Failed conversion of date value: $x", ex)
            throw ex
        }
      case _ => null
    }

    IndexWithKeyFields(
      uuid = extractString("uuid"),
      lastModified = extractDate("lastModified"),
      path = extractString("path"))
  }
} 
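A small sketch of driving extractFromJson above from a raw Elasticsearch hit parsed with Jackson. The JSON document is an illustrative assumption shaped to match the fields requested by includeFields.

import cmwell.analytics.data.IndexWithKeyFields
import com.fasterxml.jackson.databind.ObjectMapper

object IndexWithKeyFieldsSketch extends App {
  // Hypothetical hit, only for illustration.
  val hitJson =
    """{"_source": {"system": {
      |  "uuid": "abc123",
      |  "lastModified": "2020-01-01T00:00:00.000Z",
      |  "path": "/example/path"
      |}}}""".stripMargin

  val hit = new ObjectMapper().readTree(hitJson)
  val row = IndexWithKeyFields.extractFromJson(hit)
  println(row.csv) // uuid,lastModified,path as a CSV line
}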
Example 72
Source File: AvroSEBasicTest.scala    From akka-serialization-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.serializer.avro4s

import com.github.dnvriend.TestSpec
import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 }
import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 }
import com.sksamuel.avro4s.{ AvroSchema, RecordFormat }
import org.apache.avro.Schema
import org.apache.avro.file.SeekableByteArrayInput
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.io.DecoderFactory

// SE stands for Schema Evolution
class AvroSEBasicTest extends TestSpec {

  def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = {
    val serveReader = new GenericDatumReader[GenericRecord](schema)
    serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  }

  val title = "Moby-Dick; or, The Whale"
  val year = 1851
  val editor = "Scala Books"

  "AvroSEBasicTest" should "deserialize old class with renamed field" in {
    // in this case, two different serializers can be used

    val obj = ChangedBookV1(title, year)
    val serializerV1 = new BookSerializerV1
    val bytes: Array[Byte] = serializerV1.toBinary(obj)
    val serializerV2 = new BookSerializerV2

    serializerV2.fromBinary(bytes) should matchPattern {
      case ChangedBookV2(`title`, `year`) ⇒
    }
  }

  it should "deserialize old class without new field" in {

    val obj = ChangedBookV2(title, year)
    val serializerV2 = new BookSerializerV2
    val bytes: Array[Byte] = serializerV2.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema2 = AvroSchema[ChangedBookV2]
    val schema3 = AvroSchema[ChangedBookV3]

    val gdr = new GenericDatumReader[GenericRecord](schema2, schema3)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV3]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV3(`title`, `year`, "") ⇒
    }

  }

  it should "deserialize old class with dropped field" in {

    val obj = ChangedBookV3(title, year, editor)
    val serializerV3 = new BookSerializerV3
    val bytes: Array[Byte] = serializerV3.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema3 = AvroSchema[ChangedBookV3]
    val schema4 = AvroSchema[ChangedBookV4]

    val gdr = new GenericDatumReader[GenericRecord](schema3, schema4)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV4]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV4(`title`, `editor`) ⇒
    }

  }

} 
Example 73
Source File: AvroIOTest.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File}

import com.spotify.ratatool.Schemas
import com.spotify.ratatool.avro.specific.TestRecord
import org.apache.avro.generic.GenericRecord
import com.spotify.ratatool.scalacheck._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class AvroIOTest extends AnyFlatSpec with Matchers {

  private val genericSchema = Schemas.avroSchema
  private val genericGen = genericRecordOf(genericSchema)
  private val genericData = (1 to 100).flatMap(_ => genericGen.sample)

  private val specificSchema = TestRecord.getClassSchema
  private val specificGen = specificRecordOf[TestRecord]
  private val specificData = (1 to 100).flatMap(_ => specificGen.sample)

  "AvroIO" should "work with generic record and stream" in {
    val out = new ByteArrayOutputStream()
    AvroIO.writeToOutputStream(genericData, genericSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = AvroIO.readFromInputStream[GenericRecord](in).toList
    result should equal (genericData)
  }

  it should "work with generic record and file" in {
    val file = File.createTempFile("ratatool-", ".avro")
    file.deleteOnExit()
    AvroIO.writeToFile(genericData, genericSchema, file)
    val result = AvroIO.readFromFile[GenericRecord](file).toList
    result should equal (genericData)
  }

  it should "work with specific record and stream" in {
    val out = new ByteArrayOutputStream()
    AvroIO.writeToOutputStream(specificData, specificSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = AvroIO.readFromInputStream[TestRecord](in).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
  }

  it should "work with specific record and file" in {
    val file = File.createTempFile("ratatool-", ".avro")
    file.deleteOnExit()
    AvroIO.writeToFile(specificData, specificSchema, file)
    val result = AvroIO.readFromFile[TestRecord](file).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
  }
} 
Example 74
Source File: AvroIO.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.io

import java.io.{File, InputStream, OutputStream}
import java.nio.ByteBuffer
import java.nio.channels.SeekableByteChannel

import com.google.common.io.ByteStreams
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DatumReader, DatumWriter}
import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.io.fs.MatchResult.Metadata

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag


object AvroIO {

  def writeToOutputStream[T: ClassTag](data: Iterable[T],
                                       schema: Schema,
                                       os: OutputStream): Unit = {
    val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os)
    data.foreach(fileWriter.append)
    fileWriter.close()
  }
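
  // createDatumWriter (used above) is elided from this excerpt. A plausible sketch,
  // assuming the usual split between specific and generic records; this is not
  // necessarily the project's exact implementation:
  private def createDatumWriter[T: ClassTag]: DatumWriter[T] =
    if (classOf[SpecificRecord].isAssignableFrom(implicitly[ClassTag[T]].runtimeClass)) {
      new SpecificDatumWriter[T]()
    } else {
      new GenericDatumWriter[T]()
    }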

  def getAvroSchemaFromFile(path: String): Schema = {
    require(FileStorage(path).exists, s"File `$path` does not exist!")
    val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro"))
    require(files.nonEmpty, s"File `$path` does not contain avro files")
    val reader = new GenericDatumReader[GenericRecord]()
    val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader)
    dfr.getSchema
  }

  private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput {
    require(meta.isReadSeekEfficient)
    private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel]
    override def read(b: Array[Byte], off: Int, len: Int): Int =
      in.read(ByteBuffer.wrap(b, off, len))
    override def tell(): Long = in.position()
    override def length(): Long = in.size()
    override def seek(p: Long): Unit = in.position(p)
    override def close(): Unit = in.close()
  }

} 
Example 75
Source File: ParquetSampler.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.samplers

import com.spotify.ratatool.io.ParquetIO
import org.apache.avro.generic.GenericRecord
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.mutable.ListBuffer


class ParquetSampler(path: String, protected val seed: Option[Long] = None)
  extends Sampler[GenericRecord] {

  private val logger: Logger = LoggerFactory.getLogger(classOf[ParquetSampler])

  override def sample(n: Long, head: Boolean): Seq[GenericRecord] = {
    require(n > 0, "n must be > 0")
    require(head, "Parquet can only be used with --head")
    logger.info("Taking a sample of {} from Parquet {}", n, path)

    val result = ListBuffer.empty[GenericRecord]
    val iterator = ParquetIO.readFromFile(path)
    while (result.length < n && iterator.hasNext) {
      result.append(iterator.next())
    }
    result.toList
  }

} 
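
A minimal usage sketch for the sampler above; the file path is made up for illustration:

val sampler = new ParquetSampler("data/events.parquet")
val sample: Seq[GenericRecord] = sampler.sample(100, head = true)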
Example 76
Source File: CSVAutoReadersTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.readers

import com.salesforce.op.test.PassengerSparkFixtureTest
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

import scala.collection.JavaConverters._


@RunWith(classOf[JUnitRunner])
class CSVAutoReadersTest extends FlatSpec with PassengerSparkFixtureTest {

  private val expectedSchema = new Schema.Parser().parse(resourceFile(name = "PassengerAuto.avsc"))
  private val allFields = expectedSchema.getFields.asScala.map(_.name())
  private val keyField: String = allFields.head

  Spec[CSVAutoReader[_]] should "read in data correctly and infer schema" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvWithHeaderPath),
      key = _.get(keyField).toString
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema
  }

  it should "read in data correctly and infer schema based with headers provided" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvPath),
      key = _.get(keyField).toString,
      headers = allFields
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema

  }

} 
Example 77
Source File: CSVReaders.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.readers

import com.salesforce.op.OpParams
import com.salesforce.op.utils.io.csv.{CSVInOut, CSVOptions, CSVToAvro}
import org.apache.avro.generic.GenericRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.WeakTypeTag


class ConditionalCSVReader[T <: GenericRecord : ClassTag : WeakTypeTag]
(
  readPath: Option[String],
  key: T => String,
  schema: String,
  options: CSVOptions = CSVDefaults.CSVOptions,
  timeZone: String = CSVDefaults.TimeZone,
  val conditionalParams: ConditionalParams[T]
) extends CSVReader[T](readPath = readPath, key = key,
  schema = schema, options = options, timeZone = timeZone) with ConditionalDataReader[T] 
Example 78
Source File: AvroInOutTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.io.avro

import java.io.{File, FileNotFoundException, FileWriter}
import java.nio.file.Paths

import com.salesforce.op.test.TestSparkContext
import com.salesforce.op.utils.io.avro.AvroInOut._
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class AvroInOutTest extends FlatSpec with TestSparkContext {
  val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc"
  val avroFilePath = s"$testDataDir/PassengerDataAll.avro"
  val avroFileRecordCount = 891
  val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration)
  lazy val avroTemp: String = tempDir + "/avro-inout-test"

  Spec(AvroInOut.getClass) should "creates RDD from an avro file" in {
    val res = readPathSeq(avroFilePath, withCount = true, deepCopy = true, persist = false)
    res shouldBe a[RDD[_]]
    res.count shouldBe avroFileRecordCount
  }

  it should "creates RDD from a sequence of avro files" in {
    val res = readPathSeq(s"$avroFilePath,$avroFilePath")
    res.count shouldBe avroFileRecordCount*2
  }

  it should "create RDD from a mixed sequence of valid and invalid avro files" in {
    val res = readPathSeq(s"badfile/path1,$avroFilePath,badfile/path2,$avroFilePath,badfile/path3")
    res.count shouldBe avroFileRecordCount*2
  }

  it should "throw an error if passed in avro files are invalid" in {
    val error = intercept[IllegalArgumentException](readPathSeq("badfile/path1,badfile/path2"))
    error.getMessage shouldBe "No valid directory found in path 'badfile/path1,badfile/path2'"
  }

  it should "creates Some(RDD) from an avro file" in {
    val res = read(avroFilePath)
    res.size shouldBe 1
    res.get shouldBe an[RDD[_]]
    res.get.count shouldBe avroFileRecordCount
  }

  it should "create None from an invalid avro file" in {
    val res = read("badfile/path")
    res shouldBe None
  }

  Spec[AvroWriter[_]] should "writeAvro to filesystem" in {
    val avroData = readPathSeq(avroFilePath).asInstanceOf[RDD[GenericRecord]]
    val avroSchema = loadFile(avroSchemaPath)

    val error = intercept[FileNotFoundException](hdfs.listStatus(new Path(avroTemp)))
    error.getMessage shouldBe s"File $avroTemp does not exist"

    AvroWriter(avroData).writeAvro(avroTemp, avroSchema)
    val hdfsFiles = hdfs.listStatus(new Path(avroTemp)) filter (x => x.getPath.getName.contains("part"))
    val res = readPathSeq((for { x <- hdfsFiles } yield avroTemp + "/" + x.getPath.getName).mkString(","))
    res.count shouldBe avroFileRecordCount
  }

  it should "checkPathsExist" in {
    val tmpDir = Paths.get(File.separator, "tmp").toFile
    val f1 = new File(tmpDir, "avroinouttest")
    f1.delete()
    val w = new FileWriter(f1)
    w.write("just checking")
    w.close()
    val f2 = new File(tmpDir, "thisfilecannotexist")
    f2.delete()
    val f3 = new File(tmpDir, "this file cannot exist")
    f3.delete()
    assume(f1.exists && !f2.exists && !f3.exists)

    // check for one dir being invalid in the path amongst two
    selectExistingPaths(s"$f1,$f2") shouldBe f1.toString

    // check if all dirs in the path are invalid then we get an exception
    intercept[IllegalArgumentException] { selectExistingPaths(f2.toString) }

    // also, check if all dirs in the path are invalid ( in a different way ) then we get an exception
    intercept[IllegalArgumentException] { selectExistingPaths(f3.toString) }

    // check for one dir being invalid ( in a different way ) in the path amongst the two dirs in it
    selectExistingPaths(s"$f1,$f3") shouldBe f1.toString

    // check for paths order insensitivity
    selectExistingPaths(s"$f3,$f1") shouldBe f1.toString

    // check for an exception if the path is an empty string
    intercept[IllegalArgumentException] { selectExistingPaths("") }
  }

} 
Example 79
Source File: RichGenericRecordTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.avro

import com.salesforce.op.test.{TestCommon, TestSparkContext}
import com.salesforce.op.utils.io.avro.AvroInOut
import org.apache.avro.generic.GenericRecord
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}


@RunWith(classOf[JUnitRunner])
class RichGenericRecordTest extends FlatSpec with Matchers with TestSparkContext with TestCommon {

  import com.salesforce.op.utils.avro.RichGenericRecord._

  val dataPath = resourceFile(parent = "../test-data", name = s"PassengerData.avro").getPath
  val passengerData = AvroInOut.read[GenericRecord](dataPath).getOrElse(throw new Exception("Couldn't read data"))
  val firstRow = passengerData.sortBy(_.get("passengerId").toString.toInt).first

  Spec[RichGenericRecord] should "get value of Int" in {
    val id = firstRow.getValue[Int]("passengerId")
    id shouldBe Some(1)
  }

  it should "get value of Double" in {
    val survived = firstRow.getValue[Double]("survived")
    survived shouldBe Some(0.0)
  }

  it should "get value of Long" in {
    val height = firstRow.getValue[Long]("height")
    height shouldBe Some(168L)
  }

  it should "get value of String" in {
    val gender = firstRow.getValue[String]("gender")
    gender shouldBe Some("Female")
  }

  it should "get value of Char" in {
    val gender = firstRow.getValue[Char]("gender")
    gender shouldBe Some("Female")
  }

  it should "get value of Float" in {
    val age = firstRow.getValue[Float]("age")
    age shouldBe Some(32.0)
  }

  it should "get value of Short" in {
    val weight = firstRow.getValue[Short]("weight")
    weight shouldBe Some(67)
  }

  it should "throw error for invalid field" in {
    val error = intercept[IllegalArgumentException](firstRow.getValue[Short]("invalidField"))
    error.getMessage shouldBe "requirement failed: invalidField is not found in Avro schema!"
  }
} 
Example 80
Source File: RichGenericRecord.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.avro

import org.apache.avro.generic.GenericRecord

object RichGenericRecord {

  
  private def javaConvert(in: Any): Any = {
    in match {
      case s: java.lang.String => s
      case s: org.apache.avro.util.Utf8 => s.toString
      case i: java.lang.Integer => i.toInt
      case d: java.lang.Double => d.toDouble
      case l: java.lang.Long => l.toLong
      case b: java.lang.Boolean => b
      case f: java.lang.Float => f.toFloat
      case s: java.lang.Short => s.toShort
      case c: java.lang.Character => c.toChar
      case x => throw new NotImplementedError(s"${x.getClass} is not an implemented type")
    }
  }
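
  // The implicit wrapper exercised by RichGenericRecordTest above is elided from
  // this excerpt. A plausible sketch, not necessarily the project's exact code:
  // wrap a GenericRecord and expose a typed, Option-returning field accessor.
  implicit class RichGenericRecord(val record: GenericRecord) extends AnyVal {
    def getValue[T](fieldName: String): Option[T] = {
      require(record.getSchema.getField(fieldName) != null, s"$fieldName is not found in Avro schema!")
      Option(record.get(fieldName)).map(v => javaConvert(v).asInstanceOf[T])
    }
  }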

} 
Example 81
Source File: IndexWithCompleteDocument.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.generic.GenericRecord
import org.apache.avro.{Schema, SchemaBuilder}

case class IndexWithCompleteDocument(uuid: String, document: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "document" => document
    case _ => throw new IllegalArgumentException
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => document
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithCompleteDocument.schema

  // Specifically don't implement CsvGenerator.csv since it is guaranteed to be invalid CSV - force use of Parquet.
}

object IndexWithCompleteDocument extends ObjectExtractor[IndexWithCompleteDocument] {

  val schema: Schema = SchemaBuilder
    .record("IndexWithCompleteDocument").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("document").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-complete-document")

  def includeFields: String = s""""_source": "*""""

  def extractFromJson(hit: JsonNode): IndexWithCompleteDocument =
    IndexWithCompleteDocument(
      uuid = hit.findValue("_id").asText,
      document = hit.findValue("_source").toString)
} 
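
A small usage sketch showing how extractFromJson maps an Elasticsearch-style hit onto the record above; the JSON values are made up:

import com.fasterxml.jackson.databind.ObjectMapper

val hit = new ObjectMapper().readTree(
  """{"_id": "0f5a9c1d", "_source": {"system.path": "/example/infoton"}}""")
val row = IndexWithCompleteDocument.extractFromJson(hit)
// row.get("uuid") == "0f5a9c1d"; row.get("document") is the raw _source JSON as text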
Example 82
Source File: DataWriterFactory.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import java.io.File
import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.util.Shard
import org.apache.avro.generic.GenericRecord
import org.apache.commons.io.FileUtils
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import scala.concurrent.ExecutionContextExecutor

trait DataWriterFactory[T <: GenericRecord] {
  def apply(shard: Shard): DataWriter[T]
}

object DataWriterFactory {

  private val compressionCodec = CompressionCodecName.SNAPPY


  def file[T <: GenericRecord with CsvGenerator](format: String,
                                                 objectExtractor: ObjectExtractor[T],
                                                 outDirectory: String): Shard => DataWriter[T] = {

    val extension = s".$format" + (if (format == "parquet") s"${compressionCodec.getExtension}" else "")

    // Generate a meaningful name for the target file, based on the source shard's index name and shard number.
    (sourceShard: Shard) => {
      val outFile: File = Paths.get(outDirectory, s"part-r-${sourceShard.indexName}.${sourceShard.shard}$extension").toFile

      if (outFile.exists)
        FileUtils.forceDelete(outFile)

      new File(outFile.getParent).mkdirs()

      FileDataWriter[T](format, objectExtractor.schema, outFile.toString, compressionCodec)
    }
  }

  
  def index[T <: GenericRecord](indexMap: Map[String, String], // source-index -> target-index
                                esEndpoint: String)
                               (implicit system: ActorSystem,
                                executionContext: ExecutionContextExecutor,
                                actorMaterializer: ActorMaterializer
                               ): Shard => DataWriter[T] = {

    (sourceShard: Shard) => {
      val targetIndex = indexMap(sourceShard.indexName)
      new IndexDataWriter[T](indexName = targetIndex, esEndpoint = esEndpoint)
    }
  }
} 
Example 83
Source File: AvroDataOutputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.OutputStream

import org.apache.avro.Schema
import org.apache.avro.file.{CodecFactory, DataFileWriter}
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}


case class AvroDataOutputStream[T](os: OutputStream,
                                   codec: CodecFactory)
                                  (implicit encoder: Encoder[T]) extends AvroOutputStream[T] {

  val resolved = encoder.resolveEncoder()

  val (writer, writeFn) = resolved.schema.getType match {
    case Schema.Type.DOUBLE | Schema.Type.LONG | Schema.Type.BOOLEAN | Schema.Type.STRING | Schema.Type.INT | Schema.Type.FLOAT =>
      val datumWriter = new GenericDatumWriter[T](resolved.schema)
      val dataFileWriter = new DataFileWriter[T](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => dataFileWriter.append(t))
    case _ =>
      val datumWriter = new GenericDatumWriter[GenericRecord](resolved.schema)
      val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => {
        val record = resolved.encode(t).asInstanceOf[GenericRecord]
        dataFileWriter.append(record)
      })
  }

  override def close(): Unit = {
    flush()
    writer.close()
  }

  override def write(t: T): Unit = {
    writeFn(t)
  }

  override def flush(): Unit = writer.flush()
  override def fSync(): Unit = writer.fSync()
} 
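
A usage sketch for the stream above, assuming this avro4s version derives Encoder instances for case classes automatically; the case class and values are made up:

import java.io.ByteArrayOutputStream

import org.apache.avro.file.CodecFactory

case class Pizza(name: String, vegetarian: Boolean)

val baos = new ByteArrayOutputStream()
val out = AvroDataOutputStream[Pizza](baos, CodecFactory.nullCodec())
out.write(Pizza("margherita", vegetarian = true))
out.close()
// baos.toByteArray now contains an Avro data file with a single Pizza record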
Example 84
Source File: Job.scala    From spark-avro-compactor   with Apache License 2.0 5 votes vote down vote up
package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)

    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for file systems with the s3 prefix the default block size is 64MB and can be overridden with fs.s3.block.size
    // for file systems with the s3a prefix the default block size is 32MB and can be overridden with fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // Where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | splitSize: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
} 
Example 85
Source File: Utils.scala    From spark-avro-compactor   with Apache License 2.0 5 votes vote down vote up
package ie.ianduffy.spark.avro.compactor

import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession

object Utils {

  def createSparkSession: SparkSession =
    SparkSession
      .builder
      .appName("avro-compactor")
      .getOrCreate


  def readHadoopFile(spark: SparkSession, path: String)(implicit sparkConfig: Configuration) = {
    spark.sparkContext.newAPIHadoopFile(
      path,
      classOf[AvroKeyInputFormat[GenericRecord]],
      classOf[AvroKey[GenericRecord]],
      classOf[NullWritable],
      sparkConfig
    )
  }

} 
Example 86
Source File: AvroToParquetWriter.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // the writer is created lazily on the first write, once the event's schema is known
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")
  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }

    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }
    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }

} 
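
A usage sketch for the writer above; the schema, record and paths are made up for illustration:

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData

val schema = SchemaBuilder.record("Event").fields().requiredString("id").endRecord()
val event = new GenericData.Record(schema)
event.put("id", "e-1")

val writer = new AvroToParquetWriter("file:///tmp/etl/work/events-0", "hdfs:///data/events/events-0")
writer.write(event)
writer.commit()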
Example 87
Source File: JsonToParquetPipelineFactory.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.pipeline

import org.apache.avro.generic.GenericRecord
import yamrcraft.etlite.PipelineSettings
import yamrcraft.etlite.transformers.{JsonToAvroTransformer, Message}
import yamrcraft.etlite.writers.{AvroToParquetWriter, TimePartitioningWriter}

class JsonToParquetPipelineFactory extends PipelineFactory[Message[GenericRecord]] {

  def createPipeline(settings: PipelineSettings, jobId: Long, partitionId: Int): Pipeline[Message[GenericRecord]] =
    new Pipeline(
      new JsonToAvroTransformer(settings.transformerConfig),
      new TimePartitioningWriter(
        settings.writerConfig,
        jobId,
        partitionId,
        (tempFile, outputFile) => new AvroToParquetWriter(tempFile, outputFile))
    )

} 
Example 88
Source File: JsonToAvroTransformer.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.transformers

import com.typesafe.config.Config
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import play.api.libs.json.Json
import yamrcraft.etlite.utils.ConfigConversions._
import yamrcraft.etlite.utils.{FileUtils, JsonAvroConverter, TimeUtils}
import yamrcraft.etlite.{ErrorType, EtlException}

class JsonToAvroTransformer(config: Config) extends Transformer[Message[GenericRecord]] {

  val converter = new JsonAvroConverter()

  // config settings
  val timestampField = config.getString("timestamp-field")
  val timestampFieldFormat = config.getString("timestamp-field-format")
  val defaultSchemaFileName = config.getString("default-schema-file")
  val (schemaSelectionField, schemas) = {
    config.hasPath("schema-selection") match {
      case true =>
        (Some(config.getString("schema-selection.field")),
          Some(config.getConfig("schema-selection.schemas").asMap.map {case (k,v) => (k, createSchema(v))}) )
      case false => (None, None)
    }
  }

  val defaultSchema: Schema = createSchema(defaultSchemaFileName)

  @throws(classOf[EtlException])
  override def transform(inbound: InboundMessage): Message[GenericRecord] = {

    try {
      val schema = getSchema(inbound.msg)
      val record = converter.convertToGenericDataRecord(inbound.msg, schema)

      Message[GenericRecord](
        record,
        schema.getName,
        extractTimestamp(record)
      )

    } catch {
      case e: EtlException => throw e
      case e: Exception => throw new EtlException(ErrorType.TransformationError, e)
    }
  }

  private def createSchema(path: String): Schema = new Schema.Parser().parse(FileUtils.readContent(path))

  private def getSchema(msg: Array[Byte]): Schema = {
    if (schemaSelectionField.isEmpty) {
      defaultSchema
    } else {
      val msgAsString = new String(msg, "UTF8")
      val msgJson = Json.parse(msgAsString)
      val selectionValue = (msgJson \ schemaSelectionField.get).asOpt[String]
      schemas.get.getOrElse(selectionValue.get, defaultSchema)
    }
  }

  @throws(classOf[EtlException])
  private def extractTimestamp(event: GenericRecord): Long = {
    try {
      (event.get(timestampField): Any) match {
        case ts: Long => ts
        case ts: String => TimeUtils.stringTimeToLong(ts, timestampFieldFormat)
        case _ => throw new RuntimeException("timestamp field is neither a Long nor a String.")
      }
    } catch {
      case e: Exception => throw new EtlException(ErrorType.PartitionTimestampError, e)
    }
  }
} 
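
The transformer above is driven entirely by its Typesafe config. A hypothetical configuration covering the keys it reads; the field names, format and schema paths are made up:

import com.typesafe.config.ConfigFactory

val transformerConfig = ConfigFactory.parseString(
  """
    |timestamp-field = "timestamp"
    |timestamp-field-format = "yyyy-MM-dd'T'HH:mm:ss"
    |default-schema-file = "file:///etc/etl/schemas/default.avsc"
    |schema-selection {
    |  field = "type"
    |  schemas {
    |    click = "file:///etc/etl/schemas/click.avsc"
    |    impression = "file:///etc/etl/schemas/impression.avsc"
    |  }
    |}
  """.stripMargin)

val transformer = new JsonToAvroTransformer(transformerConfig)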
Example 89
Source File: AvroDecoder.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import java.util.Arrays.copyOfRange

import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}


sealed trait AvroDecoder[T] extends Decoder[T] {

  def props: VerifiableProperties

  protected val schema = new Schema.Parser().parse(props.getString(Avro.SCHEMA))
  protected val skipBytes = props.getInt(Avro.SKIP_BYTES, 0)

  protected val reader = new GenericDatumReader[GenericRecord](schema)
  protected val decoder = Avro.recordDecoder(reader)

  private def skip(bytes: Array[Byte], size: Int): Array[Byte] = {
    val length = bytes.length
    length - size match {
      case remaining if remaining > 0 => copyOfRange(bytes, size, length)
      case _ => new Array[Byte](0)
    }
  }

  def parse(bytes: Array[Byte]): GenericRecord = {
    val data = if (skipBytes == 0) bytes else skip(bytes, skipBytes)
    decoder(data)
  }
}

class AvroRecordDecoder(val props: VerifiableProperties) extends AvroDecoder[GenericRecord] {
  override def fromBytes(bytes: Array[Byte]): GenericRecord = parse(bytes)
}

class AvroMapDecoder(val props: VerifiableProperties) extends AvroDecoder[Map[String, Any]] {
  override def fromBytes(bytes: Array[Byte]): Map[String, Any] = Avro.toMap(parse(bytes))
}

class AvroJsonDecoder(val props: VerifiableProperties) extends AvroDecoder[String] {
  override def fromBytes(bytes: Array[Byte]): String = Avro.toJson(parse(bytes))
} 
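
The Avro companion helpers referenced above (Avro.SCHEMA, Avro.recordDecoder, Avro.toMap, Avro.toJson) are not part of this excerpt. A minimal sketch of what the record decoder could look like, assuming the standard binary decoder:

import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory

def recordDecoder(reader: GenericDatumReader[GenericRecord]): Array[Byte] => GenericRecord =
  (bytes: Array[Byte]) => reader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))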
Example 90
Source File: AvroTypeSpec.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {
  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)
  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
} 
Example 91
Source File: AvroType.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import shapeless._

import scala.reflect.runtime.universe._

class AvroType[A] extends Serializable {
  def fromGenericRecord[L <: HList](
    m: GenericRecord
  )(implicit gen: LabelledGeneric.Aux[A, L], fromL: FromAvroRecord[L]): Option[A] =
    fromL(Right(m)).map(gen.from)
  def toGenericRecord[L <: HList](
    a: A
  )(implicit gen: LabelledGeneric.Aux[A, L], toL: ToAvroRecord[L], tt: TypeTag[A]): GenericRecord =
    toL(gen.to(a)).left.get.build(AvroSchema[A])
}

object AvroType {
  def apply[A: TypeTag]: AvroType[A] = new AvroType[A]

  def at[V: TypeTag](
    schemaType: Schema.Type
  )(fromFn: Any => V, toFn: V => Any): BaseAvroMappableType[V] = {
    AvroSchema.register(implicitly[TypeTag[V]].tpe, schemaType)
    new BaseAvroMappableType[V] {
      override def from(value: Any): V = fromFn(value)
      override def to(value: V): Any = toFn(value)
    }
  }
} 
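
A round-trip sketch using the AvroType API exercised in the spec above; the case class and values are made up, and the required type class instances are assumed to be derivable by shapeless-datatype:

import org.apache.avro.generic.GenericRecord
import shapeless.datatype.avro.AvroType

case class User(name: String, favoriteNumber: Int)

val avroT = AvroType[User]
val record: GenericRecord = avroT.toGenericRecord(User("Alyssa", 256))
val back: Option[User] = avroT.fromGenericRecord(record) // Some(User("Alyssa", 256))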
Example 92
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 93
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Vector(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

}