org.apache.avro.io.DecoderFactory Scala Examples
The following examples show how to use org.apache.avro.io.DecoderFactory.
Each example lists its source file, originating project, and license in the header above the code.
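Most of the examples below follow the same basic pattern: obtain a Decoder from the shared DecoderFactory instance, then hand it to a DatumReader together with the record schema. Here is a minimal sketch of that round trip; the one-field Book schema is a hypothetical stand-in used only for illustration.

import java.io.ByteArrayOutputStream
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object DecoderFactorySketch extends App {
  // Hypothetical schema, used only for illustration.
  val schema: Schema = new Schema.Parser().parse(
    """{"type":"record","name":"Book","fields":[{"name":"title","type":"string"}]}""")

  // Encode a record to a byte array.
  val record = new GenericData.Record(schema)
  record.put("title", "Moby-Dick")
  val out = new ByteArrayOutputStream()
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  new GenericDatumWriter[GenericRecord](schema).write(record, encoder)
  encoder.flush()

  // Decode: the second argument to binaryDecoder is an optional decoder to reuse (none here).
  val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
  val decoded = new GenericDatumReader[GenericRecord](schema).read(null, decoder)
  println(decoded.get("title")) // Moby-Dick
}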
Example 1
Source File: AvroSEBasicTest.scala From akka-serialization-test with Apache License 2.0
package com.github.dnvriend.serializer.avro4s

import com.github.dnvriend.TestSpec
import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 }
import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 }
import com.sksamuel.avro4s.{ AvroSchema, RecordFormat }
import org.apache.avro.Schema
import org.apache.avro.file.SeekableByteArrayInput
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.io.DecoderFactory

// SE stands for Schema Evolution
class AvroSEBasicTest extends TestSpec {

  def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = {
    val serveReader = new GenericDatumReader[GenericRecord](schema)
    serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  }

  val title = "Moby-Dick; or, The Whale"
  val year = 1851
  val editor = "Scala Books"

  "AvroSEBasicTest" should "deserialize old class with renamed field" in {
    // in this case, two different serializers can be used
    val obj = ChangedBookV1(title, year)
    val serializerV1 = new BookSerializerV1
    val bytes: Array[Byte] = serializerV1.toBinary(obj)
    val serializerV2 = new BookSerializerV2

    serializerV2.fromBinary(bytes) should matchPattern {
      case ChangedBookV2(`title`, `year`) ⇒
    }
  }

  it should "deserialize old class without new field" in {
    val obj = ChangedBookV2(title, year)
    val serializerV2 = new BookSerializerV2
    val bytes: Array[Byte] = serializerV2.toBinary(obj)
    val in = new SeekableByteArrayInput(bytes)
    val schema2 = AvroSchema[ChangedBookV2]
    val schema3 = AvroSchema[ChangedBookV3]
    val gdr = new GenericDatumReader[GenericRecord](schema2, schema3)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV3]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV3(`title`, `year`, "") ⇒
    }
  }

  it should "deserialize old class with dropped field" in {
    val obj = ChangedBookV3(title, year, editor)
    val serializerV3 = new BookSerializerV3
    val bytes: Array[Byte] = serializerV3.toBinary(obj)
    val in = new SeekableByteArrayInput(bytes)
    val schema3 = AvroSchema[ChangedBookV3]
    val schema4 = AvroSchema[ChangedBookV4]
    val gdr = new GenericDatumReader[GenericRecord](schema3, schema4)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV4]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV4(`title`, `editor`) ⇒
    }
  }
}
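The pattern to note in the last two tests is the two-argument GenericDatumReader constructor: the first schema is the writer's schema (how the bytes were produced) and the second is the reader's schema (the shape the caller wants), and Avro's schema resolution bridges the added and dropped fields between the two versions.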
Example 2
Source File: StringToGenericRecord.scala From hydra with Apache License 2.0
package hydra.avro.convert

import java.util.UUID

import org.apache.avro.{LogicalTypes, Schema}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import cats.implicits._
import org.apache.avro.util.Utf8

import scala.util.{Failure, Success, Try}

object StringToGenericRecord {

  final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException(
    s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy"
  )

  final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException(
    s"Invalid logical type. Expected $expected but received $received"
  )

  implicit class ConvertToGenericRecord(s: String) {

    private def isUuidValid(s: String): Boolean =
      Try(UUID.fromString(s)).isSuccess

    private def checkLogicalTypes(record: GenericRecord): Try[Unit] = {
      import collection.JavaConverters._
      def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] =
        avroField match {
          case g: GenericRecord =>
            g.getSchema.getFields.asScala.toList
              .traverse(f => checkAll(g.get(f.name), f.schema.some)).void
          case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) =>
            if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString))
          case _ => Success(())
        }
      val fields = record.getSchema.getFields.asScala.toList
      fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void
    }

    private def getAllPayloadFieldNames: Set[String] = {
      import spray.json._
      def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match {
        case JsObject(f) => f.flatMap { case (k: String, v: JsValue) =>
          loop(v, k.some) ++ Set(extraName.getOrElse("") + k)
        }.toSet
        case _ => Set.empty
      }
      loop(s.parseJson, None)
    }

    private def getAllSchemaFieldNames(schema: Schema): Set[String] = {
      import Schema.Type._
      import collection.JavaConverters._
      def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match {
        case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field =>
          loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name)
        }
        case _ => Set.empty
      }
      loop(schema, None)
    }

    def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try {
      if (useStrictValidation) {
        val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema)
        if (diff.nonEmpty) throw ValidationExtraFieldsError(diff)
      }
      val decoderFactory = new DecoderFactory
      val decoder = decoderFactory.jsonDecoder(schema, s)
      val reader = new GenericDatumReader[GenericRecord](schema)
      reader.read(null, decoder)
    }.flatTap(checkLogicalTypes)
  }
}
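Unlike most examples on this page, this one instantiates its own factory with new DecoderFactory instead of calling DecoderFactory.get(). Both work, since jsonDecoder is an instance method, but DecoderFactory.get() returns a shared immutable instance and is the more common choice.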
Example 3
Source File: AvroTransformer.scala From streamliner-examples with Apache License 2.0
package com.memsql.spark.examples.avro

import com.memsql.spark.etl.api.{UserTransformConfig, Transformer, PhaseConfig}
import com.memsql.spark.etl.utils.PhaseLogger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.avro.io.DecoderFactory
import org.apache.avro.specific.SpecificDatumReader

// Takes DataFrames of byte arrays, where each row is a serialized Avro record.
// Returns DataFrames of deserialized data, where each field has its own column.
class AvroTransformer extends Transformer {
  var avroSchemaStr: String = null
  var sparkSqlSchema: StructType = null

  def AvroRDDToDataFrame(sqlContext: SQLContext, rdd: RDD[Row]): DataFrame = {
    val rowRDD: RDD[Row] = rdd.mapPartitions({ partition => {
      // Create per-partition copies of non-serializable objects
      val parser: Schema.Parser = new Schema.Parser()
      val avroSchema = parser.parse(avroSchemaStr)
      val reader = new SpecificDatumReader[GenericData.Record](avroSchema)

      partition.map({ rowOfBytes =>
        val bytes = rowOfBytes(0).asInstanceOf[Array[Byte]]
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val avroToRow = new AvroToRow()
        avroToRow.getRow(record)
      })
    }})
    sqlContext.createDataFrame(rowRDD, sparkSqlSchema)
  }

  override def initialize(sqlContext: SQLContext, config: PhaseConfig, logger: PhaseLogger): Unit = {
    val userConfig = config.asInstanceOf[UserTransformConfig]
    val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match {
      case Some(s) => s
      case None => throw new IllegalArgumentException("avroSchema must be set in the config")
    }
    avroSchemaStr = avroSchemaJson.toString

    val parser = new Schema.Parser()
    val avroSchema = parser.parse(avroSchemaJson.toString)
    sparkSqlSchema = AvroToSchema.getSchema(avroSchema)
  }

  override def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = {
    AvroRDDToDataFrame(sqlContext, df.rdd)
  }
}
Example 4
Source File: SpecificTestUtil.scala From sbt-avrohugger with Apache License 2.0
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase }
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }
}
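Note how the read helper recovers the schema: it first opens the file with a plain GenericDatumReader just to read the writer schema embedded in the Avro container, then re-opens the same file with a SpecificDatumReader parameterized by that schema.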
Example 5
Source File: DefaultRowReader.scala From mleap with Apache License 2.0
package ml.combust.mleap.avro

import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader}
import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
import SchemaConverter._
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, RowReader}
import ml.combust.mleap.core.types.StructType
import ml.combust.mleap.runtime.frame.{ArrayRow, Row}

import scala.util.Try

class DefaultRowReader(override val schema: StructType) extends RowReader {
  val valueConverter = ValueConverter()
  lazy val readers = schema.fields.map(_.dataType).map(valueConverter.avroToMleap)
  val avroSchema = schema: Schema
  val datumReader = new GenericDatumReader[GenericData.Record](avroSchema)
  var decoder: BinaryDecoder = null
  var record = new GenericData.Record(avroSchema)

  override def fromBytes(bytes: Array[Byte], charset: Charset = BuiltinFormats.charset): Try[Row] = Try {
    decoder = DecoderFactory.get().binaryDecoder(bytes, decoder)
    record = datumReader.read(record, decoder)
    val row = ArrayRow(new Array[Any](schema.fields.length))
    for (i <- schema.fields.indices) { row.set(i, readers(i)(record.get(i))) }
    row
  }
}
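This reader differs from the earlier examples by passing the previous BinaryDecoder and GenericData.Record back in as the reuse arguments of binaryDecoder and read, so repeated fromBytes calls avoid reallocating either object. The trade-off is mutable state: a DefaultRowReader instance is not safe to share across threads.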
Example 6
Source File: AvroTypeSpec.scala From shapeless-datatype with Apache License 2.0
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {
  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)

  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
}
Example 7
Source File: GithubIssue235.scala From avro4s with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{Decoder, Encoder, RecordFormat, SchemaFor}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

case class Label(value: String) extends AnyVal
case class Value[A](label: Label, value: A)

sealed trait OneOrTwo[A]
case class One[A](value: Value[A]) extends OneOrTwo[A]
case class Two[A](first: Value[A], second: Value[A]) extends OneOrTwo[A]

case class OneOrTwoWrapper[A](t: OneOrTwo[A])

object Bug {
  def apply[T <: Product](a: T)(
    implicit schemaFor: SchemaFor[T],
    encoder: Encoder[T],
    decoder: Decoder[T]
  ): Unit = {
    val format = RecordFormat[T]
    val schema = schemaFor.schema
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val datumWriter = new GenericDatumWriter[GenericRecord](schema)

    val stream = new ByteArrayOutputStream()
    val bEncoder = EncoderFactory.get().binaryEncoder(stream, null)
    datumWriter.write(format.to(a), bEncoder)
    bEncoder.flush()

    val bytes = stream.toByteArray
    val bDecoder = DecoderFactory.get().binaryDecoder(bytes, null)
    val record = datumReader.read(null, bDecoder)
    require(format.from(record) == a)
  }
}

class GithubIssue235 extends AnyFunSuite with Matchers {
  test("Broken typeclass derivation upgrading from 1.9.0 to 2.0.1 #235") {
    val o = OneOrTwoWrapper(One(Value(Label("lbl"), "foo")))
    Bug(o)
  }
}
Example 8
Source File: OutputStreamTest.scala From avro4s with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s._
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

trait OutputStreamTest extends AnyFunSuite with Matchers {

  def readData[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readData(out.toByteArray)
  def readData[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes), datumReader)
    dataFileReader.next
  }

  def writeData[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.data[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readBinary[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readBinary(out.toByteArray)
  def readBinary[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val decoder = DecoderFactory.get().binaryDecoder(new SeekableByteArrayInput(bytes), null)
    datumReader.read(null, decoder)
  }

  def writeBinary[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.binary[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readJson[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readJson(out.toByteArray)
  def readJson[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val schema = AvroSchema[T]
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().jsonDecoder(schema, new SeekableByteArrayInput(bytes))
    datumReader.read(null, decoder)
  }

  def writeJson[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.json[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def writeRead[T: Encoder : SchemaFor](t: T)(fn: GenericRecord => Any): Unit = {
    {
      val out = writeData(t)
      val record = readData(out)
      fn(record)
    }
    {
      val out = writeBinary(t)
      val record = readBinary(out)
      fn(record)
    }
    {
      val out = writeJson(t)
      val record = readJson(out)
      fn(record)
    }
  }
}
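The same GenericDatumReader serves all three encodings here; only the input side differs. DataFileReader parses the Avro container format, which embeds the writer schema, while binaryDecoder and jsonDecoder consume raw binary and Avro JSON output respectively, for which the schema must be supplied by the caller.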
Example 9
Source File: Decoding.scala From avro4s with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import java.util.Collections

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.util.ByteBufferInputStream
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Decoding extends BenchmarkHelpers {

  @State(Scope.Thread)
  class Setup {
    val avroBytes = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t)).toByteBuffer
    }

    val avro4sBytes = encode(RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t)))

    val (handrolledDecoder, handrolledReader) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: Codec[AttributeValue[Int]] = AttributeValueCodec[Int]
      implicit val schemaFor: SchemaFor[AttributeValue[Int]] = SchemaFor[AttributeValue[Int]](codec.schema)
      val recordSchemaFor = SchemaFor[RecordWithUnionAndTypeField]
      val decoder = Decoder[RecordWithUnionAndTypeField].withSchema(recordSchemaFor)
      val reader = new GenericDatumReader[GenericRecord](recordSchemaFor.schema)
      (decoder, reader)
    }

    val (avro4sDecoder, avro4sReader) = {
      val decoder = Decoder[RecordWithUnionAndTypeField]
      val reader = new GenericDatumReader[GenericRecord](decoder.schema)
      (decoder, reader)
    }
  }

  def encode[T: Encoder: SchemaFor](value: T): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val encoder = Encoder[T]
    val schema = AvroSchema[T]
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }
}

class Decoding extends CommonParams with BenchmarkHelpers {
  import Decoding._

  def decode[T](bytes: ByteBuffer, decoder: Decoder[T], reader: GenericDatumReader[GenericRecord]): T = {
    val dec =
      DecoderFactory.get().binaryDecoder(new ByteBufferInputStream(Collections.singletonList(bytes.duplicate)), null)
    val record = reader.read(null, dec)
    decoder.decode(record)
  }

  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = {
    import benchmarks.record.generated._
    blackhole.consume(RecordWithUnionAndTypeField.fromByteBuffer(setup.avroBytes.duplicate))
  }

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.handrolledDecoder, setup.handrolledReader))

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.avro4sDecoder, setup.avro4sReader))
}
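The encode helper uses directBinaryEncoder, an unbuffered variant that writes straight to the underlying stream; that is why, unlike the buffered binaryEncoder used elsewhere on this page, no flush() call is needed before reading back outputStream.toByteArray.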
Example 10
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T] = implicitly[Encoder[T]]
  val sqlSchema: StructType = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)

  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder = DecoderFactory.get

  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }
}

case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder = implicitly[Encoder[T]]
  val sqlSchema = encoder.schema

  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName = "topLevelRecord" // ???
  val recordNamespace = "recordNamespace" // ???

  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder = EncoderFactory.get
    val byteArrOS = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row = rowEncoder.fromRow(internalRow)
      val bytes = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }
}
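The @transient lazy val fields are the Spark-specific detail worth noting: the parsed Schema, the GenericDatumReader, and the converters are excluded from the serialized closure and rebuilt lazily on each executor on first use, instead of being shipped with the task.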
Example 11
Source File: AvroSerializer.scala From kafka-connect-common with Apache License 2.0
package com.datamountaineer.streamreactor.connect.serialization

import java.io.{ByteArrayOutputStream, InputStream, OutputStream}

import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroSerializer {
  def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit =
    write(apply(t), schemaFor())

  def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = {
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val encoder = EncoderFactory.get().binaryEncoder(os, null)
    writer.write(record, encoder)
    encoder.flush()
    os.flush()
  }

  def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] =
    getBytes(recordFormat.to(t), schemaFor())

  def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = {
    implicit val output = new ByteArrayOutputStream()
    write(record, schema)
    output.toByteArray
  }

  def read(is: InputStream, schema: Schema): GenericRecord = {
    val reader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().binaryDecoder(is, null)
    reader.read(null, decoder)
  }

  def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T =
    recordFormat.from(read(is, schemaFor()))

  def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t)
}
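Note that binaryDecoder also accepts an InputStream, not just a byte array, which is what lets read stream straight from the source. A possible round trip through this helper, assuming the avro4s version in use derives SchemaFor implicitly (the Book case class is a hypothetical stand-in):

case class Book(title: String)
implicit val bookFormat: RecordFormat[Book] = RecordFormat[Book]

val bytes = AvroSerializer.getBytes(Book("Moby-Dick"))
val back: Book = AvroSerializer.read[Book](new java.io.ByteArrayInputStream(bytes))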
Example 12
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0
package com.datamountaineer.streamreactor.connect.converters.source

import java.io.File
import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.avro.{Schema => AvroSchema}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException

class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty

  override def convert(kafkaTopic: String,
                       sourceTopic: String,
                       messageId: String,
                       bytes: Array[Byte],
                       keys: Seq[String] = Seq.empty,
                       keyDelimiter: String = "."): SourceRecord = {
    Option(bytes) match {
      case None =>
        new SourceRecord(
          Collections.singletonMap(Converter.TopicKey, sourceTopic),
          null,
          kafkaTopic,
          avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)),
          null)
      case Some(_) =>
        val reader = avroReadersMap.getOrElse(
          sourceTopic.toLowerCase,
          throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic"))
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record)
        val value = schemaAndValue.value()
        value match {
          case s: Struct if keys.nonEmpty =>
            val keysValue = keys.flatMap { key =>
              Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString)
            }.mkString(keyDelimiter)
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              Schema.STRING_SCHEMA,
              keysValue,
              schemaAndValue.schema(),
              schemaAndValue.value())
          case _ =>
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              MsgKey.schema,
              MsgKey.getStruct(sourceTopic, messageId),
              schemaAndValue.schema(),
              schemaAndValue.value())
        }
    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sourceToSchemaMap = AvroConverter.getSchemas(config)
    avroReadersMap = sourceToSchemaMap.map {
      case (key, schema) => key -> new GenericDatumReader[GenericRecord](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.source.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(source, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = source.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other =>
          throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE")
      }.toMap
  }
}
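The getSchemas parser defines the expected configuration format: semicolon-separated source=path pairs, where source is the (case-insensitive) source topic and path points to an Avro schema file on disk. A configuration entry could therefore look like this (topic names and paths are hypothetical):

connect.source.converter.avro.schemas=/mqtt/sensors=/etc/schemas/sensors.avsc;/mqtt/logs=/etc/schemas/logs.avsc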