org.apache.avro.generic.GenericRecord Scala Examples
The following examples show how to use org.apache.avro.generic.GenericRecord.
Each example is taken from an open-source project; the source file, project, and license are noted above it.
Example 1
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0 | 6 votes |
package io.eels.component.parquet import java.nio.file.Paths import io.eels.component.parquet.avro.AvroParquetSource import io.eels.component.parquet.util.ParquetLogMute import io.eels.schema._ import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{Matchers, WordSpec} class AvroParquetSourceTest extends WordSpec with Matchers { ParquetLogMute() private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(conf) private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI) private val resourcesDir = personFile.getParent "AvroParquetSource" should { "read schema" in { val people = AvroParquetSource(personFile) people.schema shouldBe StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) } "read parquet files" in { val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } "read multiple parquet files using file expansion" in { import io.eels.FilePattern._ val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values) people shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner"), Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) } // todo add merge to parquet source "merge schemas" ignore { try { fs.delete(new Path("merge1.pq"), false) } catch { case t: Throwable => } try { fs.delete(new Path("merge2.pq"), false) } catch { case t: Throwable => } val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord() val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord() val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build() val record1 = new GenericData.Record(schema1) record1.put("a", "aaaaa") record1.put("b", 124.3) writer1.write(record1) writer1.close() val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build() val record2 = new GenericData.Record(schema2) record2.put("a", 111) record2.put("c", true) writer2.write(record2) writer2.close() ParquetSource(new Path("merge*")).schema shouldBe StructType( Field("a", StringType, nullable = false), Field("b", DoubleType, nullable = false), Field("c", BooleanType, nullable = false) ) fs.delete(new Path(".merge1.pq.crc"), false) fs.delete(new Path(".merge2.pq.crc"), false) fs.delete(new Path("merge1.pq"), false) fs.delete(new Path("merge2.pq"), false) } } }
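The test reads a pre-built person.avro.pq resource. The write side it relies on is the standard AvroParquetWriter pattern that also appears in the ignored "merge schemas" test; the sketch below is a minimal, self-contained illustration of producing such a file from GenericRecords (the object name and output path are invented for the example).

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter

object WritePeopleParquet extends App {
  // Schema matching the three string fields the test expects
  val schema = SchemaBuilder.builder().record("person").fields()
    .requiredString("name")
    .requiredString("job")
    .requiredString("location")
    .endRecord()

  val writer = AvroParquetWriter.builder[GenericRecord](new Path("person.avro.pq"))
    .withSchema(schema)
    .build()

  val record = new GenericData.Record(schema)
  record.put("name", "clint eastwood")
  record.put("job", "actor")
  record.put("location", "carmel")

  writer.write(record)
  writer.close()
}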
Example 2
Source File: FieldMapperEncoderTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{Encoder, SchemaFor, SnakeCase}
import org.apache.avro.generic.GenericRecord
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class FieldMapperEncoderTest extends AnyFunSuite with Matchers {
  test("adding an in-scope FieldMapper should override the fields in an encoder") {
    implicit val fieldMapper = SnakeCase
    val schema: SchemaFor[NamingTest] = SchemaFor[NamingTest]
    val encoder = Encoder[NamingTest]
    val record = encoder.encode(NamingTest("Foo")).asInstanceOf[GenericRecord]
    record.get("camel_case")
  }
}

case class NamingTest(camelCase: String)
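A quick way to see what the in-scope FieldMapper changes is to look at the derived schema rather than the encoded record. The sketch below assumes, as the test implies, that schema derivation picks up the same implicit mapper; the demo case class and object name are invented for the example.

import com.sksamuel.avro4s.{AvroSchema, FieldMapper, SnakeCase}

import scala.collection.JavaConverters._

object FieldMapperSchemaDemo extends App {
  case class NamingDemo(camelCaseField: String)

  implicit val fieldMapper: FieldMapper = SnakeCase
  val schema = AvroSchema[NamingDemo]
  // With SnakeCase in scope the field should be named camel_case_field
  println(schema.getFields.asScala.map(_.name))
}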
Example 3
Source File: Codecs.scala From embedded-kafka-schema-registry with MIT License | 5 votes |
package net.manub.embeddedkafka.schemaregistry.avro

import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

@deprecated(
  "Avro-related classes will be removed soon",
  since = "5.5.0"
)
object Codecs {

  implicit def stringKeyAvroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, V) =
    cr => (cr.key, cr.value)

  implicit def avroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => V =
    _.value

  implicit def stringKeyAvroValueTopicCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, String, V) =
    cr => (cr.topic, cr.key, cr.value)

  implicit def stringKeyGenericValueCrDecoder
      : ConsumerRecord[String, GenericRecord] => (String, GenericRecord) =
    cr => (cr.key, cr.value)

  implicit def genericKeyGenericValueCrDecoder
      : ConsumerRecord[GenericRecord, GenericRecord] => (GenericRecord, GenericRecord) =
    cr => (cr.key, cr.value)
}
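Because these implicits are plain functions from a ConsumerRecord to a tuple, they can also be applied explicitly. A small, hypothetical usage (the topic, key and record value are placeholders):

import org.apache.avro.generic.GenericRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

object CodecsDemo {
  def keyAndValue(value: GenericRecord): (String, GenericRecord) = {
    val cr = new ConsumerRecord[String, GenericRecord]("some-topic", 0, 0L, "some-key", value)
    // Explicit call; in embedded-kafka test helpers the implicit is resolved for you
    Codecs.stringKeyGenericValueCrDecoder(cr)
  }
}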
Example 4
Source File: AvroSerdes.scala From embedded-kafka-schema-registry with MIT License | 5 votes |
package net.manub.embeddedkafka.schemaregistry.avro import io.confluent.kafka.serializers.{ AbstractKafkaSchemaSerDeConfig, KafkaAvroDeserializerConfig, KafkaAvroDeserializer => ConfluentKafkaAvroDeserializer, KafkaAvroSerializer => ConfluentKafkaAvroSerializer } import net.manub.embeddedkafka.schemaregistry.EmbeddedKafkaConfig import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.SpecificRecord import org.apache.kafka.common.serialization.{Serde, Serdes} import scala.jdk.CollectionConverters._ @deprecated( "Avro-related classes will be removed soon", since = "5.5.0" ) object AvroSerdes { protected def configForSchemaRegistry( implicit config: EmbeddedKafkaConfig ): Map[String, Object] = Map( AbstractKafkaSchemaSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> s"http://localhost:${config.schemaRegistryPort}" ) protected def specificAvroReaderConfigForSchemaRegistry( implicit config: EmbeddedKafkaConfig ): Map[String, Object] = configForSchemaRegistry ++ Map( KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG -> true.toString ) def specific[T <: SpecificRecord]( isKey: Boolean = false, extraConfig: Map[String, Object] = Map.empty )( implicit config: EmbeddedKafkaConfig ): Serde[T] = serdeFrom[T]( configForSchemaRegistry ++ extraConfig, specificAvroReaderConfigForSchemaRegistry ++ extraConfig, //need this to support SpecificRecord isKey ) def generic( isKey: Boolean = false, extraConfig: Map[String, Object] = Map.empty )( implicit config: EmbeddedKafkaConfig ): Serde[GenericRecord] = serdeFrom[GenericRecord]( configForSchemaRegistry ++ extraConfig, configForSchemaRegistry ++ extraConfig, isKey ) private def serdeFrom[T]( serConfig: Map[String, Object], deserConfig: Map[String, Object], isKey: Boolean ): Serde[T] = { val ser = new ConfluentKafkaAvroSerializer ser.configure(serConfig.asJava, isKey) val deser = new ConfluentKafkaAvroDeserializer deser.configure(deserConfig.asJava, isKey) Serdes.serdeFrom(ser, deser).asInstanceOf[Serde[T]] } }
Example 5
Source File: AvroCodecsSpecification.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.avro import org.scalatest._ import com.twitter.bijection.Injection import com.twitter.bijection.avro.GenericAvroCodecs import org.apache.avro.Schema import org.apache.avro.generic.{GenericData, GenericRecord} class GenericAvroCodecsSpecification extends WordSpec with Matchers { val testSchema = new Schema.Parser().parse("""{ "type":"record", "name":"FiscalRecord", "namespace":"avro", "fields":[ { "name":"calendarDate", "type":"string" }, { "name":"fiscalWeek", "type":[ "int", "null" ] }, { "name":"fiscalYear", "type":[ "int", "null" ] } ] }""") "Generic Avro codec" should { "Round trip generic record using Generic Injection" in { implicit val genericInjection = GenericAvroCodecs[GenericRecord](testSchema) val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12)) val bytes = Injection[GenericRecord, Array[Byte]](testRecord) val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes) assert(attempt.get == testRecord) } "Round trip generic record using Binary Injection" in { implicit val genericBinaryInjection = GenericAvroCodecs.toBinary[GenericRecord](testSchema) val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12)) val bytes = Injection[GenericRecord, Array[Byte]](testRecord) val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes) assert(attempt.get == testRecord) } "Round trip generic record using Json Injection" in { implicit val genericJsonInjection = GenericAvroCodecs.toJson[GenericRecord](testSchema) val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12)) val jsonString = Injection[GenericRecord, String](testRecord) val attempt = Injection.invert[GenericRecord, String](jsonString) assert(attempt.get == testRecord) } } def buildGenericAvroRecord(i: (String, Int, Int)): GenericRecord = { val fiscalRecord = new GenericData.Record(testSchema) fiscalRecord.put("calendarDate", i._1) fiscalRecord.put("fiscalWeek", i._2) fiscalRecord.put("fiscalYear", i._3) fiscalRecord } }
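For reference, the bijection Injections above perform a standard Avro binary round trip under the hood. A rough equivalent using only the Avro API, for any GenericRecord (the object name is invented):

import java.io.ByteArrayOutputStream

import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object PlainAvroRoundTrip {
  def roundTrip(record: GenericRecord): GenericRecord = {
    // write: GenericRecord -> Array[Byte]
    val out = new ByteArrayOutputStream()
    val writer = new GenericDatumWriter[GenericRecord](record.getSchema)
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()

    // read: Array[Byte] -> GenericRecord
    val reader = new GenericDatumReader[GenericRecord](record.getSchema)
    val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
    reader.read(null, decoder)
  }
}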
Example 6
Source File: AvroSerializer.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.bloomberg.avro import java.io.ByteArrayOutputStream import com.datamountaineer.streamreactor.connect.bloomberg.BloombergData import com.datamountaineer.streamreactor.connect.bloomberg.avro.AvroSchemaGenerator._ import org.apache.avro.Schema import org.apache.avro.generic.GenericData.Record import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord} import org.apache.avro.io.EncoderFactory import scala.collection.JavaConverters._ object AvroSerializer { private def recursive(record: GenericData.Record, schema: Schema, fieldName: String, value: Any): Unit = { value match { case _: Boolean => record.put(fieldName, value) case _: Int => record.put(fieldName, value) case _: Long => record.put(fieldName, value) case _: Double => record.put(fieldName, value) case _: Char => record.put(fieldName, value) case _: Float => record.put(fieldName, value) case _: String => record.put(fieldName, value) case list: java.util.List[_] => val tmpSchema = schema.getField(fieldName).schema() val itemSchema = if (tmpSchema.getType == Schema.Type.UNION) tmpSchema.getTypes.get(1) else tmpSchema require(itemSchema.getType == Schema.Type.ARRAY) //we might have a record not a primitive if (itemSchema.getElementType.getType == Schema.Type.RECORD) { val items = new GenericData.Array[GenericData.Record](list.size(), itemSchema) list.asScala.foreach { i => //only map is allowed val m = i.asInstanceOf[java.util.Map[String, Any]] items.add(m.toAvroRecord(itemSchema.getElementType)) } record.put(fieldName, items) } else { val items = new GenericData.Array[Any](list.size(), itemSchema) items.addAll(list) record.put(fieldName, items) } case map: java.util.LinkedHashMap[String @unchecked, _] => //record schema val fieldSchema = schema.getField(fieldName).schema() val nestedSchema = if (fieldSchema.getType == Schema.Type.UNION) fieldSchema.getTypes.get(1) else fieldSchema val nestedRecord = new Record(nestedSchema) map.entrySet().asScala.foreach(e => recursive(nestedRecord, nestedSchema, e.getKey, e.getValue)) record.put(fieldName, nestedRecord) } } } }
Example 7
Source File: AvroRecordRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.hbase.util.Bytes import org.apache.kafka.connect.sink.SinkRecord import org.mockito.MockitoSugar import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar { val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema) "AvroRecordRowKeyBuilder" should { "extract the values from the avro record and create the key" in { val keys = Seq("firstName", "lastName", "age") val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys) val sinkRecord = mock[SinkRecord] val firstName = "Jack" val lastName = "Smith" val age = 29 val record = new GenericRecord { val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age)) override def get(key: String): AnyRef = values(key) override def put(key: String, v: scala.Any): Unit = sys.error("not supported") override def get(i: Int): AnyRef = sys.error("not supported") override def put(i: Int, v: scala.Any): Unit = sys.error("not supported") override def getSchema: Schema = sys.error("not supported") } val expectedValue = Bytes.add( Array( firstName.fromString(), rowKeyBuilder.delimBytes, lastName.fromString(), rowKeyBuilder.delimBytes, age.fromInt())) rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue } } }
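The test hand-rolls a GenericRecord stub so it can return canned values without going through a schema. In ordinary code the same record is more easily built with GenericRecordBuilder against the parsed schema; a sketch, assuming the schema declares the firstName, lastName and age fields used above:

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}

object PersonRecords {
  def person(schema: Schema, firstName: String, lastName: String, age: Int): GenericRecord =
    new GenericRecordBuilder(schema)
      .set("firstName", firstName)
      .set("lastName", lastName)
      .set("age", age)
      .build()
}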
Example 8
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark.avro import org.apache.log4j.Logger import java.io.ByteArrayOutputStream import scala.reflect.runtime.universe._ import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord } import org.apache.avro.io.{ DecoderFactory, EncoderFactory } import org.apache.spark.sql.{ Dataset, Encoder, Row } import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder } import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.StructType import org.apache.avro.Schema import cloudflow.spark.sql.SQLImplicits._ case class EncodedKV(key: String, value: Array[Byte]) case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) { val encoder: Encoder[T] = implicitly[Encoder[T]] val sqlSchema: StructType = encoder.schema val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema) @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema) @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema) @transient lazy val decoder = DecoderFactory.get def decode(bytes: Array[Byte]): Row = { val binaryDecoder = decoder.binaryDecoder(bytes, null) val record = datumReader.read(null, binaryDecoder) rowConverter(record).asInstanceOf[GenericRow] } } case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) { @transient lazy val log = Logger.getLogger(getClass.getName) val BufferSize = 5 * 1024 // 5 Kb val encoder = implicitly[Encoder[T]] val sqlSchema = encoder.schema @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) val recordName = "topLevelRecord" // ??? val recordNamespace = "recordNamespace" // ??? @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace) // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage def rowToBytes(row: Row): Array[Byte] = { val genRecord = converter(row).asInstanceOf[GenericRecord] if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord") val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema) val avroEncoder = EncoderFactory.get val byteArrOS = new ByteArrayOutputStream(BufferSize) val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null) datumWriter.write(genRecord, binaryEncoder) binaryEncoder.flush() byteArrOS.toByteArray } def encode(dataset: Dataset[T]): Dataset[Array[Byte]] = dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]] // Note to self: I'm not sure how heavy this chain of transformations is def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = { val encoder = encoderFor[T] implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind() dataset.map { value ⇒ val key = keyFun(value) val internalRow = encoder.toRow(value) val row = rowEncoder.fromRow(internalRow) val bytes = rowToBytes(row) EncodedKV(key, bytes) } } }
Example 9
Source File: Avro4sConsumerImpl.scala From kafka4s with Apache License 2.0 | 5 votes |
package com.banno.kafka.consumer import cats.implicits._ import java.util.regex.Pattern import scala.concurrent.duration._ import org.apache.kafka.common._ import org.apache.kafka.clients.consumer._ import org.apache.avro.generic.GenericRecord import com.sksamuel.avro4s.FromRecord import cats.Functor import com.banno.kafka._ //this is a Bifunctor[ConsumerApi] case class Avro4sConsumerImpl[F[_]: Functor, K: FromRecord, V: FromRecord]( c: ConsumerApi[F, GenericRecord, GenericRecord] ) extends ConsumerApi[F, K, V] { def assign(partitions: Iterable[TopicPartition]): F[Unit] = c.assign(partitions) def assignment: F[Set[TopicPartition]] = c.assignment def beginningOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] = c.beginningOffsets(partitions) def beginningOffsets( partitions: Iterable[TopicPartition], timeout: FiniteDuration ): F[Map[TopicPartition, Long]] = c.beginningOffsets(partitions, timeout) def close: F[Unit] = c.close def close(timeout: FiniteDuration): F[Unit] = c.close(timeout) def commitAsync: F[Unit] = c.commitAsync def commitAsync( offsets: Map[TopicPartition, OffsetAndMetadata], callback: OffsetCommitCallback ): F[Unit] = c.commitAsync(offsets, callback) def commitAsync(callback: OffsetCommitCallback): F[Unit] = c.commitAsync(callback) def commitSync: F[Unit] = c.commitSync def commitSync(offsets: Map[TopicPartition, OffsetAndMetadata]): F[Unit] = c.commitSync(offsets) def committed(partition: Set[TopicPartition]): F[Map[TopicPartition, OffsetAndMetadata]] = c.committed(partition) def endOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] = c.endOffsets(partitions) def endOffsets( partitions: Iterable[TopicPartition], timeout: FiniteDuration ): F[Map[TopicPartition, Long]] = c.endOffsets(partitions, timeout) def listTopics: F[Map[String, Seq[PartitionInfo]]] = c.listTopics def listTopics(timeout: FiniteDuration): F[Map[String, Seq[PartitionInfo]]] = c.listTopics(timeout) def metrics: F[Map[MetricName, Metric]] = c.metrics def offsetsForTimes( timestampsToSearch: Map[TopicPartition, Long] ): F[Map[TopicPartition, OffsetAndTimestamp]] = c.offsetsForTimes(timestampsToSearch) def offsetsForTimes( timestampsToSearch: Map[TopicPartition, Long], timeout: FiniteDuration ): F[Map[TopicPartition, OffsetAndTimestamp]] = c.offsetsForTimes(timestampsToSearch, timeout) def partitionsFor(topic: String): F[Seq[PartitionInfo]] = c.partitionsFor(topic) def partitionsFor(topic: String, timeout: FiniteDuration): F[Seq[PartitionInfo]] = c.partitionsFor(topic, timeout) def pause(partitions: Iterable[TopicPartition]): F[Unit] = c.pause(partitions) def paused: F[Set[TopicPartition]] = c.paused def poll(timeout: FiniteDuration): F[ConsumerRecords[K, V]] = c.poll(timeout).map(_.fromGenericRecords[K, V]) def position(partition: TopicPartition): F[Long] = c.position(partition) def resume(partitions: Iterable[TopicPartition]): F[Unit] = c.resume(partitions) def seek(partition: TopicPartition, offset: Long): F[Unit] = c.seek(partition, offset) def seekToBeginning(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToBeginning(partitions) def seekToEnd(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToEnd(partitions) def subscribe(topics: Iterable[String]): F[Unit] = c.subscribe(topics) def subscribe(topics: Iterable[String], callback: ConsumerRebalanceListener): F[Unit] = c.subscribe(topics, callback) def subscribe(pattern: Pattern): F[Unit] = c.subscribe(pattern) def subscribe(pattern: Pattern, callback: ConsumerRebalanceListener): F[Unit] = 
c.subscribe(pattern, callback) def subscription: F[Set[String]] = c.subscription def unsubscribe: F[Unit] = c.unsubscribe def wakeup: F[Unit] = c.wakeup }
Example 10
Source File: ProducerOps.scala From kafka4s with Apache License 2.0 | 5 votes |
package com.banno.kafka.producer import cats.{Applicative, Foldable, MonadError, Traverse} import cats.implicits._ import fs2._ import org.apache.kafka.common._ import org.apache.kafka.common.errors._ import org.apache.kafka.clients.consumer.OffsetAndMetadata import org.apache.kafka.clients.producer._ case class ProducerOps[F[_], K, V](producer: ProducerApi[F, K, V]) { def sendAndForgetBatch[G[_]: Foldable]( records: G[ProducerRecord[K, V]] )(implicit F: Applicative[F]): F[Unit] = records.traverse_(producer.sendAndForget) def sendSyncBatch[G[_]: Traverse]( records: G[ProducerRecord[K, V]] )(implicit F: Applicative[F]): F[G[RecordMetadata]] = records.traverse(producer.sendSync) def sendAsyncBatch[G[_]: Traverse]( records: G[ProducerRecord[K, V]] )(implicit F: Applicative[F]): F[G[RecordMetadata]] = records.traverse(producer.sendAsync) def pipeSync: Pipe[F, ProducerRecord[K, V], RecordMetadata] = _.evalMap(producer.sendSync) def pipeAsync: Pipe[F, ProducerRecord[K, V], RecordMetadata] = _.evalMap(producer.sendAsync) def sink: Pipe[F, ProducerRecord[K, V], Unit] = _.evalMap(producer.sendAndForget) def sinkSync: Pipe[F, ProducerRecord[K, V], Unit] = pipeSync.apply(_).void def sinkAsync: Pipe[F, ProducerRecord[K, V], Unit] = pipeAsync.apply(_).void def transaction[G[_]: Foldable]( records: G[ProducerRecord[K, V]], offsets: Map[TopicPartition, OffsetAndMetadata], consumerGroupId: String )(implicit F: MonadError[F, Throwable]): F[Unit] = (for { _ <- producer.beginTransaction _ <- sendAndForgetBatch(records) //should be no need to wait for RecordMetadatas or errors, since commitTransaction flushes and throws _ <- producer.sendOffsetsToTransaction(offsets, consumerGroupId) _ <- producer.commitTransaction } yield ()).handleErrorWith { // Exception-handling described in https://kafka.apache.org/10/javadoc/org/apache/kafka/clients/producer/KafkaProducer.html#send-org.apache.kafka.clients.producer.ProducerRecord-org.apache.kafka.clients.producer.Callback- case e: ProducerFencedException => F.raiseError(e) case e: OutOfOrderSequenceException => F.raiseError(e) case e: UnsupportedVersionException => F.raiseError(e) case e: AuthorizationException => F.raiseError(e) case _ => producer.abortTransaction } } import org.apache.avro.generic.GenericRecord import com.sksamuel.avro4s.ToRecord case class GenericProducerOps[F[_]](producer: ProducerApi[F, GenericRecord, GenericRecord]) { def toAvro4s[K: ToRecord, V: ToRecord]: ProducerApi[F, K, V] = Avro4sProducerImpl[F, K, V](producer) }
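GenericProducerOps.toAvro4s is the bridge from the untyped GenericRecord producer to a typed one, driven by avro4s ToRecord instances. A minimal usage sketch; the Key and Value case classes are placeholders:

import com.banno.kafka.producer.{GenericProducerOps, ProducerApi}
import com.sksamuel.avro4s.ToRecord
import org.apache.avro.generic.GenericRecord

object TypedProducerDemo {
  case class Key(id: String)
  case class Value(amount: Long)

  def typedProducer[F[_]](
      generic: ProducerApi[F, GenericRecord, GenericRecord]
  )(implicit k: ToRecord[Key], v: ToRecord[Value]): ProducerApi[F, Key, Value] =
    GenericProducerOps(generic).toAvro4s[Key, Value]
}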
Example 11
Source File: Avro4sProducerImpl.scala From kafka4s with Apache License 2.0 | 5 votes |
package com.banno.kafka.producer import java.util.concurrent.{Future => JFuture} import scala.concurrent.duration._ import org.apache.kafka.common._ import org.apache.kafka.clients.consumer.OffsetAndMetadata import org.apache.kafka.clients.producer._ import org.apache.avro.generic.GenericRecord import com.sksamuel.avro4s.ToRecord import com.banno.kafka._ //this is like Bifunctor[ProducerApi] but is contravariant in both arguments, cats does not seem to have anything like ContravriantBifunctor... case class Avro4sProducerImpl[F[_], K: ToRecord, V: ToRecord]( p: ProducerApi[F, GenericRecord, GenericRecord] ) extends ProducerApi[F, K, V] { def abortTransaction: F[Unit] = p.abortTransaction def beginTransaction: F[Unit] = p.beginTransaction def close: F[Unit] = p.close def close(timeout: FiniteDuration): F[Unit] = p.close(timeout) def commitTransaction: F[Unit] = p.commitTransaction def flush: F[Unit] = p.flush def initTransactions: F[Unit] = p.initTransactions def metrics: F[Map[MetricName, Metric]] = p.metrics def partitionsFor(topic: String): F[Seq[PartitionInfo]] = p.partitionsFor(topic) def sendOffsetsToTransaction( offsets: Map[TopicPartition, OffsetAndMetadata], consumerGroupId: String ): F[Unit] = p.sendOffsetsToTransaction(offsets, consumerGroupId) private[producer] def sendRaw(record: ProducerRecord[K, V]): JFuture[RecordMetadata] = p.sendRaw(record.toGenericRecord) private[producer] def sendRaw( record: ProducerRecord[K, V], callback: Callback ): JFuture[RecordMetadata] = p.sendRaw(record.toGenericRecord, callback) private[producer] def sendRaw( record: ProducerRecord[K, V], callback: Either[Exception, RecordMetadata] => Unit ): Unit = p.sendRaw(record.toGenericRecord, callback) def sendAndForget(record: ProducerRecord[K, V]): F[Unit] = p.sendAndForget(record.toGenericRecord) def sendSync(record: ProducerRecord[K, V]): F[RecordMetadata] = p.sendSync(record.toGenericRecord) def sendAsync(record: ProducerRecord[K, V]): F[RecordMetadata] = p.sendAsync(record.toGenericRecord) }
Example 12
Source File: Decoding.scala From avro4s with Apache License 2.0 | 5 votes |
package benchmarks import java.io.ByteArrayOutputStream import java.nio.ByteBuffer import java.util.Collections import benchmarks.record._ import com.sksamuel.avro4s._ import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io.{DecoderFactory, EncoderFactory} import org.apache.avro.util.ByteBufferInputStream import org.openjdk.jmh.annotations._ import org.openjdk.jmh.infra.Blackhole object Decoding extends BenchmarkHelpers { @State(Scope.Thread) class Setup { val avroBytes = { import benchmarks.record.generated.AttributeValue._ import benchmarks.record.generated._ new RecordWithUnionAndTypeField(new ValidInt(255, t)).toByteBuffer } val avro4sBytes = encode(RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t))) val (handrolledDecoder, handrolledReader) = { import benchmarks.handrolled_codecs._ implicit val codec: Codec[AttributeValue[Int]] = AttributeValueCodec[Int] implicit val schemaFor: SchemaFor[AttributeValue[Int]] = SchemaFor[AttributeValue[Int]](codec.schema) val recordSchemaFor = SchemaFor[RecordWithUnionAndTypeField] val decoder = Decoder[RecordWithUnionAndTypeField].withSchema(recordSchemaFor) val reader = new GenericDatumReader[GenericRecord](recordSchemaFor.schema) (decoder, reader) } val (avro4sDecoder, avro4sReader) = { val decoder = Decoder[RecordWithUnionAndTypeField] val reader = new GenericDatumReader[GenericRecord](decoder.schema) (decoder, reader) } } def encode[T: Encoder: SchemaFor](value: T): ByteBuffer = { val outputStream = new ByteArrayOutputStream(512) val encoder = Encoder[T] val schema = AvroSchema[T] val record = encoder.encode(value).asInstanceOf[GenericRecord] val writer = new GenericDatumWriter[GenericRecord](schema) val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null) writer.write(record, enc) ByteBuffer.wrap(outputStream.toByteArray) } } class Decoding extends CommonParams with BenchmarkHelpers { import Decoding._ def decode[T](bytes: ByteBuffer, decoder: Decoder[T], reader: GenericDatumReader[GenericRecord]): T = { val dec = DecoderFactory.get().binaryDecoder(new ByteBufferInputStream(Collections.singletonList(bytes.duplicate)), null) val record = reader.read(null, dec) decoder.decode(record) } @Benchmark def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = { import benchmarks.record.generated._ blackhole.consume(RecordWithUnionAndTypeField.fromByteBuffer(setup.avroBytes.duplicate)) } @Benchmark def avro4sHandrolled(setup: Setup, blackhole: Blackhole) = blackhole.consume(decode(setup.avro4sBytes, setup.handrolledDecoder, setup.handrolledReader)) @Benchmark def avro4sGenerated(setup: Setup, blackhole: Blackhole) = blackhole.consume(decode(setup.avro4sBytes, setup.avro4sDecoder, setup.avro4sReader)) }
Example 13
Source File: Encoding.scala From avro4s with Apache License 2.0 | 5 votes |
package benchmarks import java.io.ByteArrayOutputStream import java.nio.ByteBuffer import benchmarks.record._ import com.sksamuel.avro4s._ import org.apache.avro.generic.{GenericDatumWriter, GenericRecord} import org.apache.avro.io.EncoderFactory import org.openjdk.jmh.annotations._ import org.openjdk.jmh.infra.Blackhole object Encoding extends BenchmarkHelpers { @State(Scope.Thread) class Setup { val record = RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t)) val specificRecord = { import benchmarks.record.generated.AttributeValue._ import benchmarks.record.generated._ new RecordWithUnionAndTypeField(new ValidInt(255, t)) } val (avro4sEncoder, avro4sWriter) = { val schema = AvroSchema[RecordWithUnionAndTypeField] val encoder = Encoder[RecordWithUnionAndTypeField] val writer = new GenericDatumWriter[GenericRecord](schema) (encoder, writer) } val (handrolledEncoder, handrolledWriter) = { import benchmarks.handrolled_codecs._ implicit val codec: AttributeValueCodec[Int] = AttributeValueCodec[Int] implicit val schemaForValid = codec.schemaForValid val schema = AvroSchema[RecordWithUnionAndTypeField] val encoder = Encoder[RecordWithUnionAndTypeField] val writer = new GenericDatumWriter[GenericRecord](schema) (encoder, writer) } } } class Encoding extends CommonParams with BenchmarkHelpers { import Encoding._ def encode[T](value: T, encoder: Encoder[T], writer: GenericDatumWriter[GenericRecord]): ByteBuffer = { val outputStream = new ByteArrayOutputStream(512) val record = encoder.encode(value).asInstanceOf[GenericRecord] val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null) writer.write(record, enc) ByteBuffer.wrap(outputStream.toByteArray) } @Benchmark def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = blackhole.consume(setup.specificRecord.toByteBuffer) @Benchmark def avro4sGenerated(setup: Setup, blackhole: Blackhole) = blackhole.consume(encode(setup.record, setup.avro4sEncoder, setup.avro4sWriter)) @Benchmark def avro4sHandrolled(setup: Setup, blackhole: Blackhole) = blackhole.consume(encode(setup.record, setup.handrolledEncoder, setup.handrolledWriter)) }
Example 14
Source File: AvroSchema.scala From aloha with MIT License | 5 votes |
package com.eharmony.aloha.semantics.compiled.plugin.avro

import com.eharmony.aloha.reflect.RefInfo
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema.Schema.FieldRetrievalError
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema._
import org.apache.avro
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.GenericRecord

import scala.collection.JavaConversions.asScalaBuffer

// NOTE: this listing is an excerpt - the declaration of the enclosing
// `class AvroSchema(...)` and its other members are omitted here.

  protected[avro] def unionField(name: String, index: Int, fieldSchema: avro.Schema, reqField: Boolean): Result = {
    val union = fieldSchema.getTypes

    // If there's only one item in the union, treat the union as if it didn't exist.
    if (1 == union.size) extract(name, index, union.head, reqField)
    else {
      val nonNull = union.filter(t => t.getType != NULL)
      if (1 == nonNull.size) extract(name, index, nonNull.head, nullable = true)
      else Left(FieldRetrievalError("Only UNION fields of one type or two types where one is NULL are allowed."))
    }
  }
}

object AvroSchema {
  def apply(rootSchema: avro.Schema): AvroSchema = new AvroSchema(rootSchema, rootSchema)
}
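unionField treats a two-branch union containing NULL as a nullable field. The sketch below just shows what such a schema looks like and how the non-null branch is picked out with the same Avro calls the plugin uses; the schema literal and object name are invented for the example.

import org.apache.avro.Schema

import scala.collection.JavaConversions.asScalaBuffer

object NullableUnionDemo {
  // A record with a classic nullable field: a union of null and int
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"R","fields":[{"name":"age","type":["null","int"]}]}""")

  val fieldSchema = schema.getField("age").schema()                      // UNION schema
  val nonNull = fieldSchema.getTypes.filter(_.getType != Schema.Type.NULL)
  // nonNull has exactly one element, so unionField above would treat age as a nullable int
}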
Example 15
Source File: BasicEncoderTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.record.encoder import com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage import com.sksamuel.avro4s._ import org.apache.avro.Schema import org.apache.avro.generic.{GenericFixed, GenericRecord} import org.apache.avro.util.Utf8 import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class BasicEncoderTest extends AnyWordSpec with Matchers { "Encoder" should { "encode strings as UTF8" in { case class Foo(s: String) val schema = AvroSchema[Foo] val record = Encoder[Foo].encode(Foo("hello")) record shouldBe ImmutableRecord(schema, Vector(new Utf8("hello"))) } "encode strings as GenericFixed and pad bytes when schema is fixed" in { case class Foo(s: String) val fixedSchema = SchemaFor[String](Schema.createFixed("FixedString", null, null, 7)) implicit val fixedStringEncoder: Encoder[String] = Encoder.StringEncoder.withSchema(fixedSchema) val record = Encoder[Foo].encode(Foo("hello")).asInstanceOf[GenericRecord] record.get("s").asInstanceOf[GenericFixed].bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0) // the fixed should have the right size record.get("s").asInstanceOf[GenericFixed].bytes().length shouldBe 7 } "encode longs" in { case class Foo(l: Long) val schema = AvroSchema[Foo] Encoder[Foo].encode(Foo(123456L)) shouldBe ImmutableRecord(schema, Vector(java.lang.Long.valueOf(123456L))) } "encode doubles" in { case class Foo(d: Double) val schema = AvroSchema[Foo] Encoder[Foo].encode(Foo(123.435)) shouldBe ImmutableRecord(schema, Vector(java.lang.Double.valueOf(123.435D))) } "encode booleans" in { case class Foo(d: Boolean) val schema = AvroSchema[Foo] Encoder[Foo].encode(Foo(true)) shouldBe ImmutableRecord(schema, Vector(java.lang.Boolean.valueOf(true))) } "encode floats" in { case class Foo(d: Float) val schema = AvroSchema[Foo] Encoder[Foo].encode(Foo(123.435F)) shouldBe ImmutableRecord(schema, Vector(java.lang.Float.valueOf(123.435F))) } "encode ints" in { case class Foo(i: Int) val schema = AvroSchema[Foo] Encoder[Foo].encode(Foo(123)) shouldBe ImmutableRecord(schema, Vector(java.lang.Integer.valueOf(123))) } "support uppercase packages" in { val schema = AvroSchema[ClassInUppercasePackage] val t = com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage("hello") schema.getFullName shouldBe "com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage" Encoder[ClassInUppercasePackage].encode(t) shouldBe ImmutableRecord(schema, Vector(new Utf8("hello"))) } } }
Example 16
Source File: FixedEncoderTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.record.encoder import com.sksamuel.avro4s.{AvroFixed, Encoder, SchemaFor} import org.apache.avro.generic.{GenericFixed, GenericRecord} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers @AvroFixed(8) case class QuarterSHA256(bytes: Array[Byte]) extends AnyVal case class FixedString(@AvroFixed(7) mystring: String) case class AvroMessage(q: QuarterSHA256, payload: Array[Byte]) @AvroFixed(8) case class FixedValueType(z: String) extends AnyVal case class OptionFixedWrapper(opt: Option[FixedValueType]) class FixedEncoderTest extends AnyFunSuite with Matchers { val m = AvroMessage( QuarterSHA256(Array[Byte](0, 1, 2, 3, 4, 5, 6)), Array[Byte](0, 1, 2, 3) ) test("encode fixed when used on a value type") { val schema = SchemaFor[AvroMessage] val record = Encoder[AvroMessage].encode(m).asInstanceOf[GenericRecord] record.get("q").asInstanceOf[GenericFixed].bytes().toVector shouldBe Vector(0, 1, 2, 3, 4, 5, 6, 0) } test("encode fixed when used on a field in a case class") { val schema = SchemaFor[FixedString] val record = Encoder[FixedString].encode(FixedString("sam")).asInstanceOf[GenericRecord] record.get("mystring").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0) } test("support options of fixed") { val schema = SchemaFor[OptionFixedWrapper] val record = Encoder[OptionFixedWrapper].encode(OptionFixedWrapper(Some(FixedValueType("sam")))).asInstanceOf[GenericRecord] record.get("opt").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0, 0) } }
Example 17
Source File: ByteArrayEncoderTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.record.encoder import java.nio.ByteBuffer import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericFixed, GenericRecord} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers class ByteArrayEncoderTest extends AnyFunSuite with Matchers { test("encode byte arrays as BYTES type") { case class Test(z: Array[Byte]) val schema = AvroSchema[Test] Encoder[Test].encode(Test(Array[Byte](1, 4, 9))) .asInstanceOf[GenericRecord] .get("z") .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode byte vectors as BYTES type") { case class Test(z: Vector[Byte]) val schema = AvroSchema[Test] Encoder[Test].encode(Test(Vector[Byte](1, 4, 9))) .asInstanceOf[GenericRecord] .get("z") .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode byte seq as BYTES type") { case class Test(z: Seq[Byte]) val schema = AvroSchema[Test] Encoder[Test].encode(Test(Seq[Byte](1, 4, 9))) .asInstanceOf[GenericRecord] .get("z") .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode byte list as BYTES type") { case class Test(z: List[Byte]) val schema = AvroSchema[Test] Encoder[Test].encode(Test(List[Byte](1, 4, 9))) .asInstanceOf[GenericRecord] .get("z") .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode top level byte arrays") { val schema = AvroSchema[Array[Byte]] Encoder[Array[Byte]].encode(Array[Byte](1, 4, 9)) .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode ByteBuffers as BYTES type") { case class Test(z: ByteBuffer) val schema = AvroSchema[Test] Encoder[Test].encode(Test(ByteBuffer.wrap(Array[Byte](1, 4, 9)))) .asInstanceOf[GenericRecord] .get("z") .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("encode top level ByteBuffers") { val schema = AvroSchema[ByteBuffer] Encoder[ByteBuffer].encode(ByteBuffer.wrap(Array[Byte](1, 4, 9))) .asInstanceOf[ByteBuffer] .array().toList shouldBe List[Byte](1, 4, 9) } test("support FIXED") { val schema = SchemaBuilder.fixed("foo").size(7) val fixed = Encoder.ByteArrayEncoder.withSchema(SchemaFor(schema)).encode("hello".getBytes).asInstanceOf[GenericFixed] fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0) fixed.bytes().length shouldBe 7 } }
Example 18
Source File: TupleEncoderTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.record.encoder import com.sksamuel.avro4s.{AvroSchema, Encoder} import org.apache.avro.generic.GenericRecord import org.apache.avro.util.Utf8 import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers class TupleEncoderTest extends AnyFunSuite with Matchers { test("encode tuple2") { case class Test(z: (String, Option[Int])) val schema = AvroSchema[Test] val record = Encoder[Test].encode(Test("hello", Some(55))).asInstanceOf[GenericRecord] val z = record.get("z").asInstanceOf[GenericRecord] z.get("_1") shouldBe new Utf8("hello") z.get("_2") shouldBe 55 } test("encode tuple3") { case class Test(z: (String, Option[Int], Long)) val schema = AvroSchema[Test] val record = Encoder[Test].encode(Test("hello", Some(55), 9999999L)).asInstanceOf[GenericRecord] val z = record.get("z").asInstanceOf[GenericRecord] z.get("_1") shouldBe new Utf8("hello") z.get("_2") shouldBe 55 z.get("_3") shouldBe 9999999L } test("encode tuple4") { case class Test(z: (String, Option[Int], Boolean, Double)) val schema = AvroSchema[Test] val record = Encoder[Test].encode(Test("hello", Some(55), true, 0.24)).asInstanceOf[GenericRecord] val z = record.get("z").asInstanceOf[GenericRecord] z.get("_1") shouldBe new Utf8("hello") z.get("_2") shouldBe 55 z.get("_3") shouldBe true z.get("_4") shouldBe 0.24 } test("encode tuple5") { case class Test(z: (String, Option[Int], String, Boolean, String)) val schema = AvroSchema[Test] val record = Encoder[Test].encode(Test("a", Some(55), "b", true, "c")).asInstanceOf[GenericRecord] val z = record.get("z").asInstanceOf[GenericRecord] z.get("_1") shouldBe new Utf8("a") z.get("_2") shouldBe 55 z.get("_3") shouldBe new Utf8("b") z.get("_4") shouldBe true z.get("_5") shouldBe new Utf8("c") } }
Example 19
Source File: OptionOutputStreamTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.streams.output import org.apache.avro.generic.GenericRecord import org.apache.avro.util.Utf8 class OptionOutputStreamTest extends OutputStreamTest { test("options of booleans") { case class Test(z: Option[Boolean]) writeRead(Test(Some(true))) { record => record.get("z") shouldBe true } writeRead(Test(None)) { record => record.get("z") shouldBe null } } test("options of ints") { case class Test(z: Option[Int]) writeRead(Test(Some(43242))) { record => record.get("z") shouldBe 43242 } writeRead(Test(None)) { record => record.get("z") shouldBe null } } test("options of longs") { case class Test(z: Option[Long]) writeRead(Test(Some(43242L))) { record => record.get("z") shouldBe 43242L } writeRead(Test(None)) { record => record.get("z") shouldBe null } } test("options of doubles") { case class Test(z: Option[Double]) writeRead(Test(Some(123.34))) { record => record.get("z") shouldBe java.lang.Double.valueOf(123.34) } writeRead(Test(None)) { record => record.get("z") shouldBe null } } test("options of strings") { case class Test(z: Option[String]) writeRead(Test(Some("hello"))) { record => record.get("z") shouldBe new Utf8("hello") } writeRead(Test(None)) { record => record.get("z") shouldBe null } } test("options of classes") { case class Foo(s: String) case class Test(z: Option[Foo]) writeRead(Test(Some(Foo("hello")))) { record => record.get("z").asInstanceOf[GenericRecord].get("s") shouldBe new Utf8("hello") } writeRead(Test(None)) { record => record.get("z") shouldBe null } } }
Example 20
Source File: OutputStreamTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.streams.output import java.io.ByteArrayOutputStream import com.sksamuel.avro4s._ import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput} import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers trait OutputStreamTest extends AnyFunSuite with Matchers { def readData[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readData(out.toByteArray) def readData[T: SchemaFor](bytes: Array[Byte]): GenericRecord = { val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T]) val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes), datumReader) dataFileReader.next } def writeData[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = { val out = new ByteArrayOutputStream val avro = AvroOutputStream.data[T].to(out).build() avro.write(t) avro.close() out } def readBinary[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readBinary(out.toByteArray) def readBinary[T: SchemaFor](bytes: Array[Byte]): GenericRecord = { val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T]) val decoder = DecoderFactory.get().binaryDecoder(new SeekableByteArrayInput(bytes), null) datumReader.read(null, decoder) } def writeBinary[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = { val out = new ByteArrayOutputStream val avro = AvroOutputStream.binary[T].to(out).build() avro.write(t) avro.close() out } def readJson[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readJson(out.toByteArray) def readJson[T: SchemaFor](bytes: Array[Byte]): GenericRecord = { val schema = AvroSchema[T] val datumReader = new GenericDatumReader[GenericRecord](schema) val decoder = DecoderFactory.get().jsonDecoder(schema, new SeekableByteArrayInput(bytes)) datumReader.read(null, decoder) } def writeJson[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = { val out = new ByteArrayOutputStream val avro = AvroOutputStream.json[T].to(out).build() avro.write(t) avro.close() out } def writeRead[T: Encoder : SchemaFor](t: T)(fn: GenericRecord => Any): Unit = { { val out = writeData(t) val record = readData(out) fn(record) } { val out = writeBinary(t) val record = readBinary(out) fn(record) } { val out = writeJson(t) val record = readJson(out) fn(record) } } }
Example 21
Source File: EitherOutputStreamTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.streams.output import java.util import com.sksamuel.avro4s.schema.Wine import org.apache.avro.AvroRuntimeException import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 class EitherOutputStreamTest extends OutputStreamTest { import scala.collection.JavaConverters._ test("write out either of primitives") { case class Test(z: Either[String, Int]) writeRead(Test(Left("hello"))) { record => record.get("z") shouldBe new Utf8("hello") } writeRead(Test(Right(45))) { record => record.get("z") shouldBe 45 } } test("write out either of Array") { case class Test(z: Either[Array[Int], String]) writeRead(Test(Left(Array(1, 3, 4)))) { record => record.get("z").asInstanceOf[GenericData.Array[Int]].asScala shouldBe List(1, 3, 4) } } test("write out either of Seq") { case class Test(z: Either[String, Seq[String]]) writeRead(Test(Right(Seq("c", "d")))) { record => record.get("z").asInstanceOf[GenericData.Array[String]].asScala shouldBe List(new Utf8("c"), new Utf8("d")) } } test("write out either of enum") { case class Test(z: Either[Wine, Seq[String]]) writeRead(Test(Left(Wine.Malbec))) { record => record.get("z").asInstanceOf[GenericData.EnumSymbol].toString shouldBe "Malbec" } } test("write out either of Maps") { case class Test(z: Either[Array[Int], Map[String, Boolean]]) writeRead(Test(Right(Map("a" -> true, "b" -> false)))) { record => record.get("z").asInstanceOf[util.HashMap[String, Boolean]].asScala shouldBe Map(new Utf8("a") -> true, new Utf8("b") -> false) } } test("write out case classes") { case class Foo(a: String) case class Bar(b: Boolean) case class Test(z: Either[Foo, Bar]) writeRead(Test(Left(Foo("hello")))) { record => record.get("z").asInstanceOf[GenericRecord].get("a") shouldBe new Utf8("hello") } writeRead(Test(Right(Bar(true)))) { record => record.get("z").asInstanceOf[GenericRecord].get("b") shouldBe true } } test("throw an exception if trying to use two collection types in an either") { intercept[AvroRuntimeException] { case class Test(z: Either[Seq[String], List[Int]]) writeRead(Test(Left(Seq("hello")))) { record => } } } }
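avro4s maps Either[A, B] to an Avro union of the two branch schemas, which is why each branch can be read back from the same z field above, and why the last test fails: Seq[String] and List[Int] both become Avro arrays, and an Avro union may not contain two array branches. A tiny sketch that just prints the derived union (the demo class is invented):

import com.sksamuel.avro4s.AvroSchema

object EitherSchemaDemo extends App {
  case class EitherHolder(z: Either[String, Int])
  // Expect z to be a union of string and int in the printed schema
  println(AvroSchema[EitherHolder])
}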
Example 22
Source File: BasicOutputStreamTest.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.streams.output import com.sksamuel.avro4s.{Encoder, SchemaFor} import org.apache.avro.Schema.Parser import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} import org.apache.avro.util.Utf8 class BasicOutputStreamTest extends OutputStreamTest { test("write out booleans") { case class Test(z: Boolean) writeRead(Test(true)) { record => record.get("z") shouldBe true } } test("write out strings") { case class Test(z: String) writeRead(Test("Hello world")) { record => record.get("z") shouldBe new Utf8("Hello world") } } test("write out longs") { case class Test(z: Long) writeRead(Test(65653L)) { record => record.get("z") shouldBe 65653L } } test("write out ints") { case class Test(z: Int) writeRead(Test(44)) { record => record.get("z") shouldBe 44 } } test("write out doubles") { case class Test(z: Double) writeRead(Test(3.235)) { record => record.get("z") shouldBe 3.235 } } test("write out floats") { case class Test(z: Float) writeRead(Test(3.4F)) { record => record.get("z") shouldBe 3.4F } } test("write out generic record") { val recordSchema = new Parser().parse( """{"type":"record","name":"Test","fields":[{"name":"field","type":"string"}]}""" ) implicit val recordSchemaFor: SchemaFor[GenericRecord] = SchemaFor(recordSchema) implicit val encoder: Encoder[GenericRecord] = new Encoder[GenericRecord] { def schemaFor = recordSchemaFor def encode(value: GenericRecord): AnyRef = value } val record: GenericRecord = new GenericRecordBuilder(recordSchema).set("field", "value").build() writeRead(record) { rec => rec.get("field") shouldBe new Utf8("value") } } }
Example 23
Source File: GithubIssue235.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.github import java.io.ByteArrayOutputStream import com.sksamuel.avro4s.{Decoder, Encoder, RecordFormat, SchemaFor} import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io.{DecoderFactory, EncoderFactory} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers case class Label(value: String) extends AnyVal case class Value[A](label: Label, value: A) sealed trait OneOrTwo[A] case class One[A](value: Value[A]) extends OneOrTwo[A] case class Two[A](first: Value[A], second: Value[A]) extends OneOrTwo[A] case class OneOrTwoWrapper[A](t: OneOrTwo[A]) object Bug { def apply[T <: Product](a: T)( implicit schemaFor: SchemaFor[T], encoder: Encoder[T], decoder: Decoder[T] ): Unit = { val format = RecordFormat[T] val schema = schemaFor.schema val datumReader = new GenericDatumReader[GenericRecord](schema) val datumWriter = new GenericDatumWriter[GenericRecord](schema) val stream = new ByteArrayOutputStream() val bEncoder = EncoderFactory.get().binaryEncoder(stream, null) datumWriter.write(format.to(a), bEncoder) bEncoder.flush() val bytes = stream.toByteArray val bDecoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = datumReader.read(null, bDecoder) require(format.from(record) == a) } } class GithubIssue235 extends AnyFunSuite with Matchers { test("Broken typeclass derivation upgrading from 1.9.0 to 2.0.1 #235") { val o = OneOrTwoWrapper(One(Value(Label("lbl"), "foo"))) Bug(o) } }
Example 24
Source File: GithubIssue191.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s.github import java.io.ByteArrayOutputStream import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema} import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput} import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.util.Utf8 import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers final case class SN(value: String) extends AnyVal final case class SimpleUser(name: String, sn: Option[SN]) class GithubIssue191 extends AnyFunSuite with Matchers { test("writing out AnyVal in an option") { implicit val schema = AvroSchema[SimpleUser] val bytes = new ByteArrayOutputStream val out = AvroOutputStream.data[SimpleUser].to(bytes).build() out.write(SimpleUser("Tom", Some(SN("123")))) out.close() val datumReader = new GenericDatumReader[GenericRecord](schema) val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes.toByteArray), datumReader) val record = new Iterator[GenericRecord] { override def hasNext: Boolean = dataFileReader.hasNext override def next(): GenericRecord = dataFileReader.next }.toList.head record.getSchema shouldBe schema record.get("name") shouldBe new Utf8("Tom") record.get("sn") shouldBe new Utf8("123") } }
Example 25
Source File: AvroSerializer.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.coders.instances.kryo import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.twitter.chill.KSerializer import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.SpecificRecordBase import org.apache.beam.sdk.coders.AvroCoder import scala.collection.mutable.{Map => MMap} import scala.util.Try private[coders] class GenericAvroSerializer extends KSerializer[GenericRecord] { private lazy val cache: MMap[String, AvroCoder[GenericRecord]] = MMap() private def getCoder(schemaStr: String): AvroCoder[GenericRecord] = cache.getOrElseUpdate(schemaStr, AvroCoder.of(new Schema.Parser().parse(schemaStr))) private def getCoder(schemaStr: String, schema: Schema): AvroCoder[GenericRecord] = cache.getOrElseUpdate(schemaStr, AvroCoder.of(schema)) override def write(kryo: Kryo, out: Output, obj: GenericRecord): Unit = { val schemaStr = obj.getSchema.toString val coder = this.getCoder(schemaStr, obj.getSchema) // write schema before every record in case it's not in reader serializer's cache out.writeString(schemaStr) coder.encode(obj, out) } override def read(kryo: Kryo, in: Input, cls: Class[GenericRecord]): GenericRecord = { val coder = this.getCoder(in.readString()) coder.decode(in) } } private[coders] class SpecificAvroSerializer[T <: SpecificRecordBase] extends KSerializer[T] { private lazy val cache: MMap[Class[T], AvroCoder[T]] = MMap() private def getCoder(cls: Class[T]): AvroCoder[T] = cache.getOrElseUpdate( cls, Try(cls.getConstructor().newInstance().getSchema) .map(AvroCoder.of(cls, _)) .getOrElse(AvroCoder.of(cls)) ) override def write(kser: Kryo, out: Output, obj: T): Unit = this.getCoder(obj.getClass.asInstanceOf[Class[T]]).encode(obj, out) override def read(kser: Kryo, in: Input, cls: Class[T]): T = this.getCoder(cls).decode(in) }
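GenericAvroSerializer writes the schema string ahead of every record so the reading side can rebuild an AvroCoder even with a cold cache. A rough round-trip sketch using Kryo's Output and Input directly; note the class is private[coders], so this only compiles inside that package (the object name is invented):

// Placed inside the scio project so the private[coders] class is visible
package com.spotify.scio.coders.instances.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.generic.GenericRecord

object KryoAvroRoundTrip {
  def roundTrip(record: GenericRecord): GenericRecord = {
    val kryo = new Kryo()
    val ser = new GenericAvroSerializer

    val out = new Output(4 * 1024, -1) // growable buffer
    ser.write(kryo, out, record)
    out.flush()

    val in = new Input(out.toBytes)
    ser.read(kryo, in, classOf[GenericRecord])
  }
}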
Example 26
Source File: TestUtilsBase.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

// NOTE: this listing is an excerpt - the enclosing trait and the signature of the
// method that builds the mocked SourceTaskContext (and the lookupPartitionKey, table,
// offsetColumn and offsetValue parameters it uses) are omitted here.

    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = Collections.singletonMap(offsetColumn, offsetValue)
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava
    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)
    taskContext
  }
}
Example 27
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.io.File import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import io.confluent.connect.avro.AvroData import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import org.apache.avro.{Schema => AvroSchema} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException class AvroConverter extends Converter { private val avroData = new AvroData(8) private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { Option(bytes) match { case None => new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)), null) case Some(_) => val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic")) val decoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = reader.read(null, decoder) val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record) val value = schemaAndValue.value() value match { case s: Struct if keys.nonEmpty => val keysValue = keys.flatMap { key => Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString) }.mkString(keyDelimiter) new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, Schema.STRING_SCHEMA, keysValue, schemaAndValue.schema(), schemaAndValue.value()) case _ => new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), schemaAndValue.schema(), schemaAndValue.value()) } } } override def initialize(config: Map[String, String]): Unit = { sourceToSchemaMap = AvroConverter.getSchemas(config) avroReadersMap = sourceToSchemaMap.map { case (key, schema) => key -> new GenericDatumReader[GenericRecord](schema) } } } object AvroConverter { val SCHEMA_CONFIG = "connect.source.converter.avro.schemas" def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = { config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided")) .toString .split(';') .filter(_.trim.nonEmpty) .map(_.split("=")) .map { case Array(source, path) => val file = new File(path) if (!file.exists()) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!") } val s = source.trim.toLowerCase() if (s.isEmpty) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path") } s -> new AvroSchema.Parser().parse(file) case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE") }.toMap } }
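Configuration is a single property that maps each source topic to an Avro schema file; per the parsing code above, entries are source=path pairs separated by semicolons. A hypothetical initialization and conversion (topic names and schema path are placeholders):

import com.datamountaineer.streamreactor.connect.converters.source.AvroConverter
import org.apache.kafka.connect.source.SourceRecord

object AvroConverterUsage {
  val converter = new AvroConverter
  converter.initialize(Map(
    AvroConverter.SCHEMA_CONFIG -> "sensor_topic=/etc/schemas/sensor.avsc"
  ))

  // bytes must be the Avro binary encoding of a record conforming to sensor.avsc
  def toSourceRecord(bytes: Array[Byte]): SourceRecord =
    converter.convert("kafka-sensors", "sensor_topic", "message-1", bytes)
}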
Example 28
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.sink import com.datamountaineer.streamreactor.connect.converters.MsgKey import io.confluent.connect.avro.AvroData import java.io.ByteArrayOutputStream import java.io.File import org.apache.avro.{Schema => AvroSchema} import org.apache.avro.generic.GenericRecord import org.apache.avro.io.EncoderFactory import org.apache.avro.reflect.ReflectDatumWriter import org.apache.kafka.connect.sink.SinkRecord import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException class AvroConverter extends Converter { private val avroData = new AvroData(8) private var sinkToSchemaMap: Map[String, AvroSchema] = Map.empty private var avroWritersMap: Map[String, ReflectDatumWriter[Object]] = Map.empty override def convert(sinkTopic: String, data: SinkRecord): SinkRecord = { Option(data) match { case None => new SinkRecord( sinkTopic, 0, null, null, avroData.toConnectSchema(sinkToSchemaMap(sinkTopic)), null, 0 ) case Some(_) => val kafkaTopic = data.topic() val writer = avroWritersMap.getOrElse(kafkaTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $kafkaTopic")) val output = new ByteArrayOutputStream(); val decoder = EncoderFactory.get().binaryEncoder(output, null) output.reset() val avro = avroData.fromConnectData(data.valueSchema(), data.value()) avro.asInstanceOf[GenericRecord] val record = writer.write(avro, decoder) decoder.flush() val arr = output.toByteArray new SinkRecord( kafkaTopic, data.kafkaPartition(), MsgKey.schema, MsgKey.getStruct(sinkTopic, data.key().toString()), data.valueSchema(), arr, 0 ) } } override def initialize(config: Map[String, String]): Unit = { sinkToSchemaMap = AvroConverter.getSchemas(config) avroWritersMap = sinkToSchemaMap.map { case (key, schema) => key -> new ReflectDatumWriter[Object](schema) } } } object AvroConverter { val SCHEMA_CONFIG = "connect.converter.avro.schemas" def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = { config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided")) .toString .split(';') .filter(_.trim.nonEmpty) .map(_.split("=")) .map { case Array(sink, path) => val file = new File(path) if (!file.exists()) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!") } val s = sink.trim.toLowerCase() if (s.isEmpty) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path") } s -> new AvroSchema.Parser().parse(file) case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Sink->AVRO_FILE") }.toMap } }
Example 29
Source File: AvroSerializer.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.serialization import java.io.{ByteArrayOutputStream, InputStream, OutputStream} import com.sksamuel.avro4s.{RecordFormat, SchemaFor} import org.apache.avro.Schema import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io.{DecoderFactory, EncoderFactory} object AvroSerializer { def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit = write(apply(t), schemaFor()) def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = { val writer = new GenericDatumWriter[GenericRecord](schema) val encoder = EncoderFactory.get().binaryEncoder(os, null) writer.write(record, encoder) encoder.flush() os.flush() } def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] = getBytes(recordFormat.to(t), schemaFor()) def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = { implicit val output = new ByteArrayOutputStream() write(record, schema) output.toByteArray } def read(is: InputStream, schema: Schema): GenericRecord = { val reader = new GenericDatumReader[GenericRecord](schema) val decoder = DecoderFactory.get().binaryDecoder(is, null) reader.read(null, decoder) } def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T = recordFormat.from(read(is, schemaFor())) def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t) }
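A round-trip sketch for the helper above, assuming the older avro4s API this project targets (where SchemaFor[T] can be invoked as schemaFor()); Device is a made-up case class used only for illustration.

import java.io.ByteArrayInputStream
import com.datamountaineer.streamreactor.connect.serialization.AvroSerializer
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}

case class Device(id: Long, name: String)

// Derivation is provided by avro4s; the exact call shape depends on the avro4s version in use.
implicit val deviceFormat: RecordFormat[Device] = RecordFormat[Device]
implicit val deviceSchemaFor: SchemaFor[Device] = SchemaFor[Device]

val bytes = AvroSerializer.getBytes(Device(1L, "sensor-a"))
val roundTripped = AvroSerializer.read[Device](new ByteArrayInputStream(bytes))
// roundTripped == Device(1L, "sensor-a")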
Example 30
Source File: FixAvroIO.scala From scio with Apache License 2.0 | 5 votes |
package fix package v0_7_0 import com.spotify.scio.ContextAndArgs import org.apache.avro.generic.GenericRecord import com.spotify.scio.testing.{AvroIO, BigQueryIO, PipelineSpec, TextIO} case class InputClass(s: String, i: Int) extends GenericRecord { def getSchema(): org.apache.avro.Schema = ??? def get(x$1: String): Object = ??? def put(x$1: String,x$2: Any): Unit = ??? def get(x$1: Int): Object = ??? def put(x$1: Int,x$2: Any): Unit = ??? } case class OutputClass(result: String) extends GenericRecord { def getSchema(): org.apache.avro.Schema = ??? def get(x$1: String): Object = ??? def put(x$1: String,x$2: Any): Unit = ??? def get(x$1: Int): Object = ??? def put(x$1: Int,x$2: Any): Unit = ??? } object TestJob class ValidationJobTest extends PipelineSpec { val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) } val inputs2 = (1 to 10).zip(inputs).toMap val inputs3 = inputs2.values val expected = List(OutputClass("result")) "TestJob" should "run" in { JobTest[TestJob.type] .input(AvroIO("current"), inputs) .input(AvroIO("reference"), inputs2.values) .input(AvroIO("reference2"), inputs3) .input(AvroIO[InputClass]("donttouch"), inputs) .output[OutputClass](AvroIO("foo")){ coll => coll should containInAnyOrder(expected) () } .run() } }
Example 31
Source File: FixAvroIO.scala From scio with Apache License 2.0 | 5 votes |
package fix package v0_7_0 import com.spotify.scio.ContextAndArgs import org.apache.avro.generic.GenericRecord import com.spotify.scio.testing.PipelineSpec import com.spotify.scio.avro._ import com.spotify.scio.bigquery._ import com.spotify.scio.io._ case class InputClass(s: String, i: Int) extends GenericRecord { def getSchema(): org.apache.avro.Schema = ??? def get(x$1: String): Object = ??? def put(x$1: String,x$2: Any): Unit = ??? def get(x$1: Int): Object = ??? def put(x$1: Int,x$2: Any): Unit = ??? } case class OutputClass(result: String) extends GenericRecord { def getSchema(): org.apache.avro.Schema = ??? def get(x$1: String): Object = ??? def put(x$1: String,x$2: Any): Unit = ??? def get(x$1: Int): Object = ??? def put(x$1: Int,x$2: Any): Unit = ??? } object TestJob class ValidationJobTest extends PipelineSpec { val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) } val inputs2 = (1 to 10).zip(inputs).toMap val inputs3 = inputs2.values val expected = List(OutputClass("result")) "TestJob" should "run" in { JobTest[TestJob.type] .input(AvroIO[InputClass]("current"), inputs) .input(AvroIO[GenericRecord]("reference"), inputs2.values) .input(AvroIO[InputClass]("reference2"), inputs3) .input(AvroIO[InputClass]("donttouch"), inputs) .output[OutputClass](AvroIO("foo")){ coll => coll should containInAnyOrder(expected) () } .run() } }
Example 32
Source File: CoderTestUtils.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.coders import com.spotify.scio.avro.TestRecord import org.apache.beam.sdk.coders.{Coder => BCoder} import org.apache.avro.generic.GenericRecord import org.apache.beam.sdk.util.CoderUtils object CoderTestUtils { case class Pair(name: String, size: Int) case class CaseClassWithGenericRecord(name: String, size: Int, record: GenericRecord) case class CaseClassWithSpecificRecord(name: String, size: Int, record: TestRecord) def testRoundTrip[T](coder: BCoder[T], value: T): Boolean = testRoundTrip(coder, coder, value) def testRoundTrip[T](writer: BCoder[T], reader: BCoder[T], value: T): Boolean = { val bytes = CoderUtils.encodeToByteArray(writer, value) val result = CoderUtils.decodeFromByteArray(reader, bytes) result == value } }
Example 33
Source File: ProtobufUtilTest.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.util import java.io.File import java.nio.channels.Channels import java.nio.file.Files import com.spotify.scio.ScioContext import com.spotify.scio.avro._ import com.spotify.scio.coders.Coder import com.spotify.scio.proto.Track.TrackPB import org.apache.avro.file.DataFileStream import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.beam.sdk.io.{FileSystems, LocalResources} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import scala.jdk.CollectionConverters._ class ProtobufUtilTest extends AnyFlatSpec with Matchers { "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in { val sc = ScioContext() val dir = Files.createTempDirectory("protobuf-util-") val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2")) path1.deleteOnExit() path2.deleteOnExit() dir.toFile.deleteOnExit() implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder val messages = sc .parallelize(1 to 10) .map(i => TrackPB.newBuilder().setTrackId(i.toString).build()) messages .map(ProtobufUtil.toAvro[TrackPB]) .saveAsAvroFile( path1.getPath, suffix = ".protobuf", metadata = ProtobufUtil.schemaMetadataOf[TrackPB], schema = ProtobufUtil.AvroMessageSchema, numShards = 1 ) val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1) val result = sc.run().waitUntilDone() val (tapFromAvroWrite, tapFromProtoWrite) = ( ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)), protoWriteTap.get(result) ) tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList getMetadata(path1) should contain theSameElementsAs getMetadata(path2) } private def getMetadata(dir: File): Map[String, AnyRef] = { val files = dir.listFiles() if (files.length != 1) { fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}") } val dfs = new DataFileStream[GenericRecord]( Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))), new GenericDatumReader[GenericRecord] ) dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap } }
Example 34
Source File: Pretty.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.testing import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.SpecificRecordBase import scala.jdk.CollectionConverters._ import com.spotify.scio.{registerSysProps, SysProp} import scala.util.Try @registerSysProps object PrettySysProps { val PrettyPrint = SysProp("tests.prettyprint.colors", "Should pretty printed values be rendered with colors") } object Pretty { import pprint.Tree import fansi.{Color, Str} private def renderFieldName(n: String) = Tree.Lazy(ctx => List(Color.LightBlue(n).toString).iterator) private def renderGenericRecord: PartialFunction[GenericRecord, Tree] = { case g => val renderer = new pprint.Renderer( printer.defaultWidth, printer.colorApplyPrefix, printer.colorLiteral, printer.defaultIndent ) def render(tree: Tree): Str = Str.join(renderer.rec(tree, 0, 0).iter.toSeq: _*) Tree.Lazy { ctx => val fields = for { f <- g.getSchema().getFields().asScala } yield Str.join( render(renderFieldName(f.name)), ": ", render(treeifyAvro(g.get(f.name()))) ) List( Color.LightGray("{ ").toString + fields.reduce((a, b) => Str.join(a, ", ", b)) + Color.LightGray(" }") ).iterator } } private def renderSpecificRecord: PartialFunction[SpecificRecordBase, Tree] = { case x => val fs = for { f <- x.getSchema().getFields().asScala } yield Tree.Infix(renderFieldName(f.name), "=", treeifyAvro(x.get(f.name()))) Tree.Apply(x.getClass().getSimpleName(), fs.iterator) } private def treeifyAvro: PartialFunction[Any, Tree] = { case x: SpecificRecordBase => renderSpecificRecord(x) case g: GenericRecord => renderGenericRecord(g) case x => printer.treeify(x) } private val handlers: PartialFunction[Any, Tree] = { case x: GenericRecord => treeifyAvro(x) } private val useColors = PrettySysProps.PrettyPrint.valueOption .flatMap(x => Try(x.toBoolean).toOption) .getOrElse { // Crude test to check if the terminal seems to support colors (System.console() != null) && (System.getenv().get("TERM") != null) } val printer = if (useColors) { pprint.PPrinter( additionalHandlers = handlers ) } else { pprint.PPrinter( additionalHandlers = handlers, colorLiteral = fansi.Attrs.Empty, colorApplyPrefix = fansi.Attrs.Empty ) } }
Example 35
Source File: AvroInstances.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.schemas.instances import com.spotify.scio.schemas.{RawRecord, Schema} import org.apache.avro.specific.SpecificRecord import org.apache.avro.generic.{GenericRecord, IndexedRecord} import org.apache.beam.sdk.schemas.utils.AvroUtils import org.apache.beam.sdk.schemas.{AvroRecordSchema, Schema => BSchema} import org.apache.beam.sdk.transforms.SerializableFunction import org.apache.beam.sdk.values.{Row, TypeDescriptor} import scala.jdk.CollectionConverters._ import scala.reflect.{classTag, ClassTag} trait AvroInstances { implicit def avroSchema[T <: SpecificRecord: ClassTag]: Schema[T] = { // TODO: broken because of a bug upstream https://issues.apache.org/jira/browse/BEAM-6742 // RawRecord[T](new AvroRecordSchema()) import org.apache.avro.reflect.ReflectData val rc = classTag[T].runtimeClass.asInstanceOf[Class[T]] val provider = new AvroRecordSchema() val td = TypeDescriptor.of(rc) val schema = provider.schemaFor(td) val avroSchema = new AvroInstances.SerializableSchema(ReflectData.get().getSchema(td.getRawType)) def fromRow = provider.fromRowFunction(td) val toRow: SerializableFunction[T, Row] = new SerializableFunction[T, Row] { def apply(t: T): Row = AvroInstances.recordtoRow(schema, avroSchema, t) } RawRecord[T](schema, fromRow, toRow) } def fromAvroSchema(schema: org.apache.avro.Schema): Schema[GenericRecord] = { val beamSchema = AvroUtils.toBeamSchema(schema) val avroSchema = new AvroInstances.SerializableSchema(schema) val toRow = new SerializableFunction[GenericRecord, Row] { def apply(t: GenericRecord): Row = AvroInstances.recordtoRow[GenericRecord](beamSchema, avroSchema, t) } val fromRow = new SerializableFunction[Row, GenericRecord] { def apply(t: Row): GenericRecord = AvroUtils.toGenericRecord(t, avroSchema.get) } RawRecord[GenericRecord](beamSchema, fromRow, toRow) } } object AvroInstances { private class SerializableSchema(@transient private val schema: org.apache.avro.Schema) extends Serializable { private[this] val stringSchema = schema.toString def get: org.apache.avro.Schema = new org.apache.avro.Schema.Parser().parse(stringSchema) } // Workaround BEAM-6742 private def recordtoRow[T <: IndexedRecord]( schema: BSchema, avroSchema: SerializableSchema, t: T ): Row = { val row = Row.withSchema(schema) schema.getFields.asScala.zip(avroSchema.get.getFields.asScala).zipWithIndex.foreach { case ((f, a), i) => val value = t.get(i) val v = AvroUtils.convertAvroFieldStrict(value, a.schema, f.getType) row.addValue(v) } row.build() } }
Example 36
Source File: AvroCoders.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.coders.instances import java.io.{InputStream, OutputStream} import com.spotify.scio.coders.{AvroCoderMacros, Coder} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.{SpecificData, SpecificFixed} import org.apache.beam.sdk.coders.Coder.NonDeterministicException import org.apache.beam.sdk.coders.{AtomicCoder, AvroCoder, StringUtf8Coder} import org.apache.beam.sdk.util.common.ElementByteSizeObserver import scala.reflect.{classTag, ClassTag} final private class SlowGenericRecordCoder extends AtomicCoder[GenericRecord] { // TODO: can we find something more efficient than String ? private[this] val sc = StringUtf8Coder.of() override def encode(value: GenericRecord, os: OutputStream): Unit = { val schema = value.getSchema val coder = AvroCoder.of(schema) sc.encode(schema.toString, os) coder.encode(value, os) } override def decode(is: InputStream): GenericRecord = { val schemaStr = sc.decode(is) val schema = new Schema.Parser().parse(schemaStr) val coder = AvroCoder.of(schema) coder.decode(is) } // delegate methods for determinism and equality checks override def verifyDeterministic(): Unit = throw new NonDeterministicException( this, "Coder[GenericRecord] without schema is non-deterministic" ) override def consistentWithEquals(): Boolean = false override def structuralValue(value: GenericRecord): AnyRef = AvroCoder.of(value.getSchema).structuralValue(value) // delegate methods for byte size estimation override def isRegisterByteSizeObserverCheap(value: GenericRecord): Boolean = AvroCoder.of(value.getSchema).isRegisterByteSizeObserverCheap(value) override def registerByteSizeObserver( value: GenericRecord, observer: ElementByteSizeObserver ): Unit = AvroCoder.of(value.getSchema).registerByteSizeObserver(value, observer) } // TODO: Use a coder that does not serialize the schema def avroGenericRecordCoder(schema: Schema): Coder[GenericRecord] = Coder.beam(AvroCoder.of(schema)) // XXX: similar to GenericAvroSerializer def avroGenericRecordCoder: Coder[GenericRecord] = Coder.beam(new SlowGenericRecordCoder) import org.apache.avro.specific.SpecificRecordBase implicit def genAvro[T <: SpecificRecordBase]: Coder[T] = macro AvroCoderMacros.staticInvokeCoder[T] implicit def avroSpecificFixedCoder[T <: SpecificFixed: ClassTag]: Coder[T] = SpecificFixedCoder[T] }
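The two avroGenericRecordCoder variants above trade off knowing the schema up front against serializing the schema string with every element (the SlowGenericRecordCoder path). A short sketch of wiring the implicit coder into a Scio pipeline, mirroring how the benchmark and BigQuery examples later on this page use it; the schema literal here is just an illustration.

import com.spotify.scio.coders.Coder
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

val avroSchema: Schema = new Schema.Parser().parse(
  """{"type":"record","name":"Event","fields":[{"name":"id","type":"string"}]}"""
)

// Schema known up front: the coder does not embed the schema with each element.
implicit val genericRecordCoder: Coder[GenericRecord] =
  Coder.avroGenericRecordCoder(avroSchema)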
Example 37
Source File: StdAvroModelFactory.scala From aloha with MIT License | 5 votes |
package com.eharmony.aloha.factory.avro import java.io.File import org.apache.commons.{vfs => vfs1, vfs2} import com.eharmony.aloha.io.vfs.{Vfs1, Vfs2} import com.eharmony.aloha.audit.impl.avro.Score import com.eharmony.aloha.factory.ModelFactory import org.apache.avro.generic.GenericRecord import scala.util.Try @deprecated(message = "Prefer StdAvroModelFactory.fromConfig(conf: FactoryConfig)", since = "4.0.1") def apply(modelDomainSchemaVfsUrl: String, modelCodomainRefInfoStr: String, imports: Seq[String] = Nil, classCacheDir: Option[File] = None, dereferenceAsOptional: Boolean = true, useVfs2: Boolean = true): Try[ModelFactory[GenericRecord, Score]] = { val vfs = url(modelDomainSchemaVfsUrl, useVfs2) vfs.flatMap { u => UrlConfig( u, modelCodomainRefInfoStr, imports, classCacheDir, dereferenceAsOptional )() } } private[this] def url(modelDomainSchemaVfsUrl: String, useVfs2: Boolean) = { val u = if (useVfs2) Try { Vfs2(vfs2.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) } else Try { Vfs1(vfs1.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) } FactoryConfig.wrapException(u) } }
Example 38
Source File: AvroBytesUtil.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.coders import java.nio.ByteBuffer import org.apache.avro.{Schema => ASchema} import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.beam.sdk.coders.{Coder => BCoder} import org.apache.beam.sdk.util.CoderUtils import scala.jdk.CollectionConverters._ private[scio] object AvroBytesUtil { val schema: ASchema = { val s = ASchema.createRecord("AvroBytesRecord", null, null, false) s.setFields( List( new ASchema.Field( "bytes", ASchema.create(ASchema.Type.BYTES), null, null.asInstanceOf[Object] ) ).asJava ) s } def encode[T](coder: BCoder[T], obj: T): GenericRecord = { val bytes = CoderUtils.encodeToByteArray(coder, obj) val record = new GenericData.Record(schema) record.put("bytes", ByteBuffer.wrap(bytes)) record } def decode[T](coder: BCoder[T], record: GenericRecord): T = { val bb = record.get("bytes").asInstanceOf[ByteBuffer] val bytes = java.util.Arrays.copyOfRange(bb.array(), bb.position(), bb.limit()) CoderUtils.decodeFromByteArray(coder, bytes) } }
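A round-trip sketch for the helper above. Note the object is private[scio], so code like this has to live inside the com.spotify.scio.coders package; the wrapped value here is a plain String encoded with Beam's StringUtf8Coder.

package com.spotify.scio.coders

import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.coders.StringUtf8Coder

object AvroBytesUtilExample {
  def main(args: Array[String]): Unit = {
    val coder = StringUtf8Coder.of()
    // Wrap the coder's byte output in the single-field "bytes" record...
    val record: GenericRecord = AvroBytesUtil.encode(coder, "hello avro")
    // ...and decode it back to the original value.
    val decoded: String = AvroBytesUtil.decode(coder, record)
    println(decoded) // "hello avro"
  }
}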
Example 39
Source File: GroupByBenchmark.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.jmh import com.spotify.scio.{ScioContext, ScioExecutionContext} import com.spotify.scio.avro._ import com.spotify.scio.coders._ import org.apache.beam.sdk.coders.{KvCoder, Coder => BCoder} import org.apache.beam.sdk.values.KV import org.apache.beam.sdk.transforms.GroupByKey import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory} import java.util.concurrent.TimeUnit import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.openjdk.jmh.annotations._ import scala.jdk.CollectionConverters._ @BenchmarkMode(Array(Mode.AverageTime)) @OutputTimeUnit(TimeUnit.SECONDS) @State(Scope.Thread) class GroupByBenchmark { val schema = """ { "type": "record", "name": "Event", "namespace": "smbjoin", "fields": [ { "name": "id", "type": "string" }, { "name": "value", "type": "double" } ] } """ val avroSchema = new Schema.Parser().parse(schema) private def runWithContext[T](fn: ScioContext => T): ScioExecutionContext = { val opts = PipelineOptionsFactory.as(classOf[PipelineOptions]) val sc = ScioContext(opts) fn(sc) sc.run() } val source = "src/test/resources/events-10000-0.avro" implicit val coderGenericRecord: Coder[GenericRecord] = Coder.avroGenericRecordCoder(avroSchema) val charCoder = CoderMaterializer.beamWithDefault(Coder[Char]) val doubleCoder = CoderMaterializer.beamWithDefault(Coder[Double]) val kvCoder: BCoder[KV[Char, Double]] = KvCoder.of(charCoder, doubleCoder) @Benchmark def testScioGroupByKey: ScioExecutionContext = runWithContext { sc => sc.avroFile(source, schema = avroSchema) .map(rec => (rec.get("id").toString.head, rec.get("value").asInstanceOf[Double])) .groupByKey } @Benchmark def testBeamGroupByKey: ScioExecutionContext = runWithContext { sc => sc.wrap { sc.avroFile(source, schema = avroSchema) .map { rec => KV.of(rec.get("id").toString.head, rec.get("value").asInstanceOf[Double]) } .internal .setCoder(kvCoder) .apply(GroupByKey.create[Char, Double]) }.map(kv => (kv.getKey, kv.getValue.asScala)) } }
Example 40
Source File: BigQueryIT.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.extra.bigquery import java.{util => ju} import com.google.protobuf.ByteString import com.spotify.scio.avro.types.AvroType import com.spotify.scio.bigquery.client.BigQuery import com.spotify.scio.bigquery.Table import com.spotify.scio.bigquery.TableRow import com.spotify.scio.coders._ import com.spotify.scio.ContextAndArgs import org.apache.avro.generic.GenericRecord import org.apache.beam.sdk.io.gcp.bigquery.BigQueryAvroUtilsWrapper import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers object BigQueryIT { @AvroType.fromSchema("""{ | "type":"record", | "name":"Account", | "namespace":"com.spotify.scio.avro", | "doc":"Record for an account", | "fields":[ | {"name":"id","type":"long"}, | {"name":"type","type":"string"}, | {"name":"name","type":"string"}, | {"name":"amount","type":"double"}, | {"name":"secret","type":"bytes"}]} """.stripMargin) class Account implicit def genericCoder = Coder.avroGenericRecordCoder(Account.schema) } final class BigQueryIT extends AnyFlatSpec with Matchers { import BigQueryIT._ it should "save avro to BigQuery" in { val args = Array( "--project=data-integration-test", "--tempLocation=gs://data-integration-test-eu/temp" ) val (sc, _) = ContextAndArgs(args) val prefix = ju.UUID.randomUUID().toString.replaceAll("-", "") val table = Table.Spec(s"data-integration-test:bigquery_avro_it.${prefix}_accounts") val data: Seq[GenericRecord] = (1 to 100).map { i => Account.toGenericRecord( Account(i, "checking", s"account$i", i.toDouble, ByteString.copyFromUtf8("%20cフーバー")) ) } val tap = sc .parallelize(data) .saveAvroAsBigQuery( table.ref, Account.schema, writeDisposition = WriteDisposition.WRITE_EMPTY, createDisposition = CreateDisposition.CREATE_IF_NEEDED ) val result = sc.run().waitUntilDone() val ts = BigQuery.defaultInstance().tables.schema(table.ref) val expected: Seq[TableRow] = data.map { gr => BigQueryAvroUtilsWrapper.convertGenericRecordToTableRow(gr, ts) } result.tap(tap).value.toSet shouldEqual expected.toSet } }
Example 41
Source File: AvroUtils.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.avro import org.apache.avro.Schema import org.apache.avro.generic.{GenericData, GenericRecord} import scala.jdk.CollectionConverters._ object AvroUtils { private def f(name: String, tpe: Schema.Type) = new Schema.Field( name, Schema.createUnion(List(Schema.create(Schema.Type.NULL), Schema.create(tpe)).asJava), null: String, null: AnyRef ) private def fArr(name: String, tpe: Schema.Type) = new Schema.Field(name, Schema.createArray(Schema.create(tpe)), null: String, null: AnyRef) val schema = Schema.createRecord("GenericTestRecord", null, null, false) schema.setFields( List( f("int_field", Schema.Type.INT), f("long_field", Schema.Type.LONG), f("float_field", Schema.Type.FLOAT), f("double_field", Schema.Type.DOUBLE), f("boolean_field", Schema.Type.BOOLEAN), f("string_field", Schema.Type.STRING), fArr("array_field", Schema.Type.STRING) ).asJava ) def newGenericRecord(i: Int): GenericRecord = { val r = new GenericData.Record(schema) r.put("int_field", 1 * i) r.put("long_field", 1L * i) r.put("float_field", 1f * i) r.put("double_field", 1.0 * i) r.put("boolean_field", true) r.put("string_field", "hello") r.put("array_field", List[CharSequence]("a", "b", "c").asJava) r } def newSpecificRecord(i: Int): TestRecord = new TestRecord( i, i.toLong, i.toFloat, i.toDouble, true, "hello", List[CharSequence]("a", "b", "c").asJava ) }
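A quick check of the factory above: newGenericRecord scales the numeric fields by the given index while the string and boolean fields stay constant.

import org.apache.avro.generic.GenericRecord

val r: GenericRecord = AvroUtils.newGenericRecord(3)
assert(r.get("int_field") == 3)
assert(r.get("double_field") == 3.0)
assert(r.get("string_field") == "hello")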
Example 42
Source File: MagnolifyAvroExampleTest.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.examples.extra import com.spotify.scio.avro.AvroIO import com.spotify.scio.io._ import com.spotify.scio.testing._ import org.apache.avro.generic.{GenericData, GenericRecord} class MagnolifyAvroExampleTest extends PipelineSpec { import MagnolifyAvroExample._ val textIn = Seq("a b c d e", "a b a b") val wordCount = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) val records: Seq[GenericRecord] = wordCount.map { kv => val r = new GenericData.Record(wordCountType.schema) r.put("word", kv._1) r.put("count", kv._2) r } val textOut = wordCount.map(kv => kv._1 + ": " + kv._2) "MagnolifyAvroWriteExample" should "work" in { JobTest[com.spotify.scio.examples.extra.MagnolifyAvroWriteExample.type] .args("--input=in.txt", "--output=wc.avro") .input(TextIO("in.txt"), textIn) .output(AvroIO[GenericRecord]("wc.avro"))(coll => coll should containInAnyOrder(records)) .run() } "MagnolifyAvroReadExample" should "work" in { JobTest[com.spotify.scio.examples.extra.MagnolifyAvroReadExample.type] .args("--input=wc.avro", "--output=out.txt") .input(AvroIO[GenericRecord]("wc.avro"), records) .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(textOut)) .run() } }
Example 43
Source File: Utils.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import java.util import java.util.Comparator import org.apache.avro.generic.GenericRecord import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.catalyst.expressions.MutableRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.SparkSqlSerializer import org.apache.spark.sql.types._ import scala.collection.mutable.ArrayBuffer import scala.math.Ordering object Utils { def setRowCol( row: MutableRow, field: (Field, Int), src: HBaseType, offset: Int, length: Int): Unit = { val index = field._2 val f = field._1 if (f.sedes.isDefined) { // If we already have sedes defined , use it. val m = f.sedes.get.deserialize(src, offset, length) row.update(index, m) } else if (f.exeSchema.isDefined) { // println("avro schema is defined to do deserialization") // If we have avro schema defined, use it to get record, and then covert them to catalyst data type val m = AvroSedes.deserialize(src, f.exeSchema.get) // println(m) val n = f.avroToCatalyst.map(_(m)) row.update(index, n.get) } else { // Fall back to atomic type f.dt match { case BooleanType => row.setBoolean(index, toBoolean(src, offset)) case ByteType => row.setByte(index, src(offset)) case DoubleType => row.setDouble(index, Bytes.toDouble(src, offset)) case FloatType => row.setFloat(index, Bytes.toFloat(src, offset)) case IntegerType => row.setInt(index, Bytes.toInt(src, offset)) case LongType => row.setLong(index, Bytes.toLong(src, offset)) case ShortType => row.setShort(index, Bytes.toShort(src, offset)) case StringType => row.update(index, toUTF8String(src, offset, length)) case BinaryType => val newArray = new Array[Byte](length) System.arraycopy(src, offset, newArray, 0, length) row.update(index, newArray) case _ => row.update(index, SparkSqlSerializer.deserialize[Any](src)) //TODO } } } // convert input to data type def toBytes(input: Any, field: Field): Array[Byte] = { if (field.sedes.isDefined) { field.sedes.get.serialize(input) } else if (field.schema.isDefined) { // Here we assume the top level type is structType val record = field.catalystToAvro(input) AvroSedes.serialize(record, field.schema.get) } else { input match { case data: Boolean => Bytes.toBytes(data) case data: Byte => Array(data) case data: Array[Byte] => data case data: Double => Bytes.toBytes(data) case data: Float => Bytes.toBytes(data) case data: Int => Bytes.toBytes(data) case data: Long => Bytes.toBytes(data) case data: Short => Bytes.toBytes(data) case data: UTF8String => data.getBytes case data: String => Bytes.toBytes(data) //Bytes.toBytes(input.asInstanceOf[String])//input.asInstanceOf[UTF8String].getBytes case _ => throw new Exception(s"unsupported data type ${field.dt}") //TODO } } } def toBoolean(input: HBaseType, offset: Int): Boolean = { input(offset) != 0 } def toUTF8String(input: HBaseType, offset: Int, length: Int): UTF8String = { UTF8String(input.slice(offset, offset + length)) } }
Example 44
Source File: Sedes.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import java.io.ByteArrayInputStream import org.apache.avro.Schema import org.apache.avro.Schema.Type._ import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io._ import org.apache.commons.io.output.ByteArrayOutputStream import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ trait Sedes { def serialize(value: Any): Array[Byte] def deserialize(bytes: Array[Byte], start: Int, end: Int): Any } class DoubleSedes extends Sedes { override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double]) override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = { Bytes.toLong(bytes, start) } }
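Because the trait only has two methods, additional serdes stay small. An illustrative (hypothetical) string variant built the same way as the DoubleSedes above, assuming the start/end offsets mark an exclusive byte range:

import org.apache.hadoop.hbase.util.Bytes

class StringSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] =
    Bytes.toBytes(value.asInstanceOf[String])

  // Assumes end is exclusive, i.e. the value occupies bytes(start until end).
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any =
    Bytes.toString(bytes, start, end - start)
}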
Example 45
Source File: StdAvroModelFactoryTest.scala From aloha with MIT License | 5 votes |
package com.eharmony.aloha.factory.avro import com.eharmony.aloha.audit.impl.avro.Score import com.eharmony.aloha.factory.ModelFactory import com.eharmony.aloha.io.vfs.Vfs1 import com.eharmony.aloha.models.Model import org.apache.avro.Schema import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.commons.io.IOUtils import org.junit.Assert.assertEquals import org.junit.Test import org.junit.runner.RunWith import org.junit.runners.BlockJUnit4ClassRunner import scala.util.Try private[this] def record = { val r = new GenericData.Record(TheSchema) r.put("req_str_1", "smart handsome stubborn") r } } object StdAvroModelFactoryTest { private lazy val TheSchema = { val is = getClass.getClassLoader.getResourceAsStream(SchemaUrlResource) try new Schema.Parser().parse(is) finally IOUtils.closeQuietly(is) } private val ExpectedResult = 7d private val SchemaUrlResource = "avro/class7.avpr" private val SchemaUrl = s"res:$SchemaUrlResource" private val SchemaFile = new java.io.File(getClass.getClassLoader.getResource(SchemaUrlResource).getFile) private val SchemaVfs1FileObject = org.apache.commons.vfs.VFS.getManager.resolveFile(SchemaUrl) private val SchemaVfs2FileObject = org.apache.commons.vfs2.VFS.getManager.resolveFile(SchemaUrl) private val Imports = Seq("com.eharmony.aloha.feature.BasicFunctions._", "scala.math._") private val ReturnType = "Double" private val ModelJson = """ |{ | "modelType": "Regression", | "modelId": { "id": 0, "name": "" }, | "features" : { | "my_attributes": "${req_str_1}.split(\"\\\\W+\").map(v => (s\"=$v\", 1.0))" | }, | "weights": { | "my_attributes=handsome": 1, | "my_attributes=smart": 2, | "my_attributes=stubborn": 4 | } |} """.stripMargin }
Example 46
Source File: ImplicitsTest.scala From aloha with MIT License | 5 votes |
package com.eharmony.aloha.audit.impl.avro import com.google.common.collect.Lists import org.junit.Assert.assertEquals import org.junit.Test import org.junit.runner.RunWith import org.junit.runners.BlockJUnit4ClassRunner import scala.collection.JavaConverters.seqAsJavaListConverter import com.eharmony.aloha.audit.impl.avro.Implicits.{RichFlatScore, RichScore} import java.{lang => jl, util => ju} import org.apache.avro.generic.GenericRecord @Test def testAllFieldsAppear(): Unit = { val s = filledInScore assertEquals(s, s.toFlatScore.toScore) } @Test def testSameFieldsInGenericRecord(): Unit = { val s = filledInScore val s1 = s.asInstanceOf[GenericRecord] val s2 = s.toFlatScore.asInstanceOf[GenericRecord] testStuff(s1, s2, Map( "model" -> modelId, "value" -> value, "errorMsgs" -> errors, "missingVarNames" -> missing, "prob" -> prob )) } private[this] def testStuff(r1: GenericRecord, r2: GenericRecord, data: Map[String, Any]): Unit = { data.foreach { case (k, v) => val v1 = r1.get(k) val v2 = r2.get(k) assertEquals(s"for r1('$k') = $v1. Expected $v", v, r1.get(k)) assertEquals(s"for r2('$k') = $v2. Expected $v", v, r2.get(k)) } } } object ImplicitsTest { private def filledInScore = new Score(modelId, value, subvalues, errors, missing, prob) private def modelId = new ModelId(5L, "five") private def value: jl.Double = 13d private def subvalues = Lists.newArrayList(scr(12L, 8)) private def errors: ju.List[CharSequence] = Lists.newArrayList("one error", "two errors") private def missing: ju.List[CharSequence] = Lists.newArrayList("some feature", "another feature", "yet another feature") private def prob: jl.Float = 1f private lazy val score: Score = scr(1, 1, scr(2L, 2, scr(4f, 4), scr(5, 5) ), scr(3d, 3, scr(6d, 6), scr(7L, 7) ) ) private lazy val irregularTree: Score = scr(1, 1, scr(2L, 2), scr(3d, 3, scr(5d, 5), scr(6L, 6) ), scr(4d, 4, scr(7L, 7) ) ) private[this] def scr(value: Any, id: Long, children: Score*): Score = { new Score( new ModelId(id, ""), value, Lists.newArrayList(children.asJava), java.util.Collections.emptyList(), java.util.Collections.emptyList(), null ) } }
Example 47
Source File: AvroDataInputStream.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s import java.io.InputStream import org.apache.avro.Schema import org.apache.avro.file.DataFileStream import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.io.DatumReader import scala.util.Try class AvroDataInputStream[T](in: InputStream, writerSchema: Option[Schema]) (implicit decoder: Decoder[T]) extends AvroInputStream[T] { val resolved = decoder.resolveDecoder() // if no reader or writer schema is specified, then we create a reader that uses what's present in the files private val datumReader = writerSchema match { case Some(writer) => GenericData.get.createDatumReader(writer, resolved.schema) case None => GenericData.get.createDatumReader(null, resolved.schema) } private val dataFileReader = new DataFileStream[GenericRecord](in, datumReader.asInstanceOf[DatumReader[GenericRecord]]) override def iterator: Iterator[T] = new Iterator[T] { override def hasNext: Boolean = dataFileReader.hasNext override def next(): T = { val record = dataFileReader.next resolved.decode(record) } } override def tryIterator: Iterator[Try[T]] = new Iterator[Try[T]] { override def hasNext: Boolean = dataFileReader.hasNext override def next(): Try[T] = Try { val record = dataFileReader.next resolved.decode(record) } } override def close(): Unit = in.close() }
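A sketch of reading a previously written Avro data file back as case classes with the stream above; the file path is made up, and the implicit Decoder is assumed to come from avro4s' usual case-class derivation.

import java.io.FileInputStream
import com.sksamuel.avro4s.AvroDataInputStream

case class Person(name: String, age: Int)

// Hypothetical path to an Avro data file written with a compatible schema.
val in = new FileInputStream("/tmp/people.avro")
// No writer schema supplied, so the schema embedded in the file is used.
val stream = new AvroDataInputStream[Person](in, writerSchema = None)
try {
  val people: List[Person] = stream.iterator.toList
  println(people)
} finally stream.close()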
Example 48
Source File: AvroRecord.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.kafka.producer import com.pluralsight.hydra.avro.JsonConverter import hydra.core.transport.AckStrategy import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.commons.lang3.StringUtils case class AvroRecord( destination: String, schema: Schema, key: String, payload: GenericRecord, ackStrategy: AckStrategy ) extends KafkaRecord[String, GenericRecord] object AvroRecord { def apply( destination: String, schema: Schema, key: Option[String], json: String, ackStrategy: AckStrategy, useStrictValidation: Boolean = false ): AvroRecord = { val payload: GenericRecord = { val converter: JsonConverter[GenericRecord] = new JsonConverter[GenericRecord](schema, useStrictValidation) converter.convert(json) } AvroRecord(destination, schema, key.orNull, payload, ackStrategy) } def apply( destination: String, schema: Schema, key: Option[String], record: GenericRecord, ackStrategy: AckStrategy ): AvroRecord = { AvroRecord(destination, schema, key.orNull, record, ackStrategy) } }
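A sketch of building a record from a JSON payload with the helper above; the schema, destination topic and ack strategy value are illustrative assumptions rather than anything mandated by hydra.

import hydra.core.transport.AckStrategy
import hydra.kafka.producer.AvroRecord
import org.apache.avro.Schema

val schema = new Schema.Parser().parse(
  """{"type":"record","name":"Visitor","fields":[{"name":"id","type":"string"}]}"""
)

// AckStrategy.NoAck is an assumption; substitute whichever strategy the pipeline needs.
val record = AvroRecord(
  destination = "exp.visitors",
  schema = schema,
  key = Some("visitor-1"),
  json = """{"id":"visitor-1"}""",
  ackStrategy = AckStrategy.NoAck
)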
Example 49
Source File: SpecificDefautValuesSpec.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
import test._ import org.specs2.mutable.Specification import java.io.File import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.file.DataFileReader import DefaultEnum._ class SpecificDefaultValuesSpec extends Specification { "A case class with default values" should { "deserialize correctly" in { val record = DefaultTest() val records = List(record) val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() SpecificTestUtil.write(file, records) val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[DefaultTest](schema) val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader) val sameRecord = dataFileReader.next sameRecord.suit === SPADES sameRecord.number === 0 sameRecord.str === "str" sameRecord.optionString === None sameRecord.optionStringValue === Some("default") sameRecord.embedded === Embedded(1) sameRecord.defaultArray === List(1,3,4,5) sameRecord.optionalEnum === None sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas") sameRecord.byt === "\u00FF".getBytes } } }
Example 50
Source File: SpecificTestUtil.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
package test import java.io.File import org.apache.avro.io.{DecoderFactory, EncoderFactory} import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.Schema import org.apache.avro.file.{ DataFileReader, DataFileWriter } import org.specs2.mutable.Specification object SpecificTestUtil extends Specification { def write[T <: SpecificRecordBase](file: File, records: List[T]) = { val userDatumWriter = new SpecificDatumWriter[T]() val dataFileWriter = new DataFileWriter[T](userDatumWriter) dataFileWriter.create(records.head.getSchema, file) records.foreach(record => dataFileWriter.append(record)) dataFileWriter.close() } def read[T <: SpecificRecordBase](file: File, records: List[T]) = { val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[T](schema) val dataFileReader = new DataFileReader[T](file, userDatumReader) // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader. var record: T = null.asInstanceOf[T] var sameRecord: T = null.asInstanceOf[T] val recordIter = records.iterator while (dataFileReader.hasNext) { sameRecord = dataFileReader.next(sameRecord) record = recordIter.next } dataFileReader.close() sameRecord must ===(record) } def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = { val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() write(file, records) read(file, records) } def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = { val schema = record.getSchema val writer = new SpecificDatumWriter[T](schema) val out = new java.io.ByteArrayOutputStream() val encoder = EncoderFactory.get().binaryEncoder(out, null) writer.write(record, encoder) encoder.flush val ba = out.toByteArray ba.size must ===(1) ba(0) must ===(0) out.close val reader = new SpecificDatumReader[T](schema) val decoder = DecoderFactory.get().binaryDecoder(ba, null) val decoded = reader.read(record, decoder) decoded must ===(record) } }
Example 51
Source File: SpecificDefautValuesSpec.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
import test._ import org.specs2.mutable.Specification import java.io.File import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.file.DataFileReader import DefaultEnum._ class SpecificDefaultValuesSpec extends Specification { "A case class with default values" should { "deserialize correctly" in { val record = DefaultTest() val records = List(record) val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() SpecificTestUtil.write(file, records) val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[DefaultTest](schema) val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader) val sameRecord = dataFileReader.next sameRecord.suit === SPADES sameRecord.number === 0 sameRecord.str === "str" sameRecord.optionString === None sameRecord.optionStringValue === Some("default") sameRecord.embedded === Embedded(1) sameRecord.defaultArray === List(1,3,4,5) sameRecord.optionalEnum === None sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas") sameRecord.byt === "\u00FF".getBytes } } }
Example 52
Source File: SpecificTestUtil.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
package test import java.io.File import org.apache.avro.io.{DecoderFactory, EncoderFactory} import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.Schema import org.apache.avro.file.{ DataFileReader, DataFileWriter } import org.specs2.mutable.Specification object SpecificTestUtil extends Specification { def write[T <: SpecificRecordBase](file: File, records: List[T]) = { val userDatumWriter = new SpecificDatumWriter[T] val dataFileWriter = new DataFileWriter[T](userDatumWriter) dataFileWriter.create(records.head.getSchema, file); records.foreach(record => dataFileWriter.append(record)) dataFileWriter.close(); } def read[T <: SpecificRecordBase](file: File, records: List[T]) = { val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[T](schema) val dataFileReader = new DataFileReader[T](file, userDatumReader) // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader. var record: T = null.asInstanceOf[T] var sameRecord: T = null.asInstanceOf[T] val recordIter = records.iterator while (dataFileReader.hasNext) { sameRecord = dataFileReader.next(sameRecord) record = recordIter.next } dataFileReader.close() sameRecord must ===(record) } def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = { val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() write(file, records) read(file, records) } def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = { val schema = record.getSchema val writer = new SpecificDatumWriter[T](schema) val out = new java.io.ByteArrayOutputStream() val encoder = EncoderFactory.get().binaryEncoder(out, null) writer.write(record, encoder) encoder.flush val ba = out.toByteArray ba.size must ===(1) ba(0) must ===(0) out.close val reader = new SpecificDatumReader[T](schema) val decoder = DecoderFactory.get().binaryDecoder(ba, null) val decoded = reader.read(record, decoder) decoded must ===(record) } }
Example 53
Source File: SpecificDefautValuesSpec.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
import test._ import org.specs2.mutable.Specification import java.io.File import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.file.DataFileReader class SpecificDefaultValuesSpec extends Specification { "A case class with default values" should { "deserialize correctly" in { val record = DefaultTest() val records = List(record) val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() SpecificTestUtil.write(file, records) val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[DefaultTest](schema) val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader) val sameRecord = dataFileReader.next sameRecord.suit === "SPADES" sameRecord.number === 0 sameRecord.str === "str" sameRecord.optionString === None sameRecord.optionStringValue === Some("default") sameRecord.embedded === Embedded(1) sameRecord.defaultArray === Array(1,3,4,5) sameRecord.optionalEnum === None sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas") sameRecord.byt === "\u00FF".getBytes } } }
Example 54
Source File: BytesWithSchemaToObject.scala From trucking-iot with Apache License 2.0 | 5 votes |
package com.orendainx.trucking.storm.bolts import java.io.ByteArrayInputStream import java.nio.charset.StandardCharsets import java.util import com.hortonworks.registries.schemaregistry.SchemaMetadata import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData} import com.typesafe.scalalogging.Logger import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.storm.task.{OutputCollector, TopologyContext} import org.apache.storm.topology.OutputFieldsDeclarer import org.apache.storm.topology.base.BaseRichBolt import org.apache.storm.tuple.{Fields, Tuple, Values} import scala.collection.JavaConversions._ // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData = EnrichedTruckData( r.get("eventTime").toString.toLong, r.get("truckId").toString.toInt, r.get("driverId").toString.toInt, r.get("driverName").toString, r.get("routeId").toString.toInt, r.get("routeName").toString, r.get("latitude").toString.toDouble, r.get("longitude").toString.toDouble, r.get("speed").toString.toInt, r.get("eventType").toString, r.get("foggy").toString.toInt, r.get("rainy").toString.toInt, r.get("windy").toString.toInt) // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToTrafficData(r: GenericRecord): TrafficData = TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt) }
Example 55
Source File: NiFiPacketWithSchemaToObject.scala From trucking-iot with Apache License 2.0 | 5 votes |
package com.orendainx.trucking.storm.bolts import java.io.ByteArrayInputStream import java.util import com.hortonworks.registries.schemaregistry.SchemaMetadata import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData} import com.typesafe.scalalogging.Logger import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.nifi.storm.NiFiDataPacket import org.apache.storm.task.{OutputCollector, TopologyContext} import org.apache.storm.topology.OutputFieldsDeclarer import org.apache.storm.topology.base.BaseRichBolt import org.apache.storm.tuple.{Fields, Tuple, Values} import scala.collection.JavaConversions._ class NiFiPacketWithSchemaToObject extends BaseRichBolt { private lazy val log = Logger(this.getClass) private var outputCollector: OutputCollector = _ // Declare schema-related fields to be initialized when this component's prepare() method is called private var schemaRegistryClient: SchemaRegistryClient = _ private var deserializer: AvroSnapshotDeserializer = _ private var truckDataSchemaMetadata: SchemaMetadata = _ private var trafficDataSchemaMetadata: SchemaMetadata = _ override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = { outputCollector = collector val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl) schemaRegistryClient = new SchemaRegistryClient(clientConfig) truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer] deserializer.init(clientConfig) } override def execute(tuple: Tuple): Unit = { val dp = tuple.getValueByField("nifiDataPacket").asInstanceOf[NiFiDataPacket] // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData) val (dataType, data) = dp.getAttributes.get("dataType") match { case typ @ "EnrichedTruckData" => (typ, recordToEnrichedTruckData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record])) case typ @ "TrafficData" => (typ, recordToTrafficData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record])) } outputCollector.emit(new Values(data, dataType)) outputCollector.ack(tuple) } override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType")) // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData = EnrichedTruckData( r.get("eventTime").toString.toLong, r.get("truckId").toString.toInt, r.get("driverId").toString.toInt, r.get("driverName").toString, r.get("routeId").toString.toInt, r.get("routeName").toString, r.get("latitude").toString.toDouble, r.get("longitude").toString.toDouble, r.get("speed").toString.toInt, r.get("eventType").toString, r.get("foggy").toString.toInt, r.get("rainy").toString.toInt, r.get("windy").toString.toInt) // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToTrafficData(r: GenericRecord): TrafficData = TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt) }
Example 56
Source File: SerializedWithSchemaToObject.scala From trucking-iot with Apache License 2.0 | 5 votes |
package com.orendainx.trucking.storm.bolts import java.io.ByteArrayInputStream import java.nio.charset.StandardCharsets import java.util import com.hortonworks.registries.schemaregistry.SchemaMetadata import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData} import com.typesafe.scalalogging.Logger import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.storm.task.{OutputCollector, TopologyContext} import org.apache.storm.topology.OutputFieldsDeclarer import org.apache.storm.topology.base.BaseRichBolt import org.apache.storm.tuple.{Fields, Tuple, Values} import scala.collection.JavaConversions._ class SerializedWithSchemaToObject extends BaseRichBolt { private lazy val log = Logger(this.getClass) private var outputCollector: OutputCollector = _ // Declare schema-related fields to be initialized when this component's prepare() method is called private var schemaRegistryClient: SchemaRegistryClient = _ private var deserializer: AvroSnapshotDeserializer = _ private var truckDataSchemaMetadata: SchemaMetadata = _ private var trafficDataSchemaMetadata: SchemaMetadata = _ override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = { outputCollector = collector val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl) schemaRegistryClient = new SchemaRegistryClient(clientConfig) truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer] deserializer.init(clientConfig) } override def execute(tuple: Tuple): Unit = { // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData) val str = tuple.getStringByField("data").getBytes(StandardCharsets.UTF_8) log.info(s"str2: ${tuple.getStringByField("data")}") val bytes = new ByteArrayInputStream(str) log.info(s"bytes: $bytes") val (dataType, data) = tuple.getStringByField("dataType") match { case typ @ "EnrichedTruckData" => log.info(s"des: ${deserializer.deserialize(bytes, null)}") (typ, recordToEnrichedTruckData(deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record])) case typ @ "TrafficData" => log.info(s"des: ${deserializer.deserialize(bytes, null)}") (typ, recordToTrafficData(deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record])) } outputCollector.emit(new Values(data, dataType)) outputCollector.ack(tuple) } override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType")) // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData = EnrichedTruckData( r.get("eventTime").toString.toLong, r.get("truckId").toString.toInt, r.get("driverId").toString.toInt, r.get("driverName").toString, r.get("routeId").toString.toInt, r.get("routeName").toString, r.get("latitude").toString.toDouble, r.get("longitude").toString.toDouble, r.get("speed").toString.toInt, r.get("eventType").toString, r.get("foggy").toString.toInt, r.get("rainy").toString.toInt, r.get("windy").toString.toInt) // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object private def recordToTrafficData(r: GenericRecord): TrafficData = TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt) }
Example 57
Source File: MetadataAlgebraSpec.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.kafka.algebras import java.time.Instant import cats.data.NonEmptyList import cats.effect.{Concurrent, ContextShift, IO, Sync, Timer} import cats.implicits._ import hydra.avro.registry.SchemaRegistry import hydra.core.marshallers.History import hydra.kafka.algebras.MetadataAlgebra.TopicMetadataContainer import hydra.kafka.model.ContactMethod.Slack import hydra.kafka.model.TopicMetadataV2Request.Subject import hydra.kafka.model.{Public, StreamTypeV2, TopicMetadataV2, TopicMetadataV2Key, TopicMetadataV2Request, TopicMetadataV2Value} import io.chrisdavenport.log4cats.SelfAwareStructuredLogger import io.chrisdavenport.log4cats.slf4j.Slf4jLogger import org.apache.avro.generic.GenericRecord import org.scalatest.Assertion import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpecLike import retry.RetryPolicies._ import retry.syntax.all._ import retry.{RetryPolicy, _} import scala.concurrent.ExecutionContext import scala.concurrent.duration._ class MetadataAlgebraSpec extends AnyWordSpecLike with Matchers { implicit private val contextShift: ContextShift[IO] = IO.contextShift(ExecutionContext.global) private implicit val concurrentEffect: Concurrent[IO] = IO.ioConcurrentEffect private implicit val policy: RetryPolicy[IO] = limitRetries[IO](5) |+| exponentialBackoff[IO](500.milliseconds) private implicit val timer: Timer[IO] = IO.timer(ExecutionContext.global) private implicit def noop[A]: (A, RetryDetails) => IO[Unit] = retry.noop[IO, A] implicit private def unsafeLogger[F[_]: Sync]: SelfAwareStructuredLogger[F] = Slf4jLogger.getLogger[F] private implicit class RetryAndAssert[A](boolIO: IO[A]) { def retryIfFalse(check: A => Boolean): IO[Assertion] = boolIO.map(check).retryingM(identity, policy, noop).map(assert(_)) } private val metadataTopicName = "_internal.metadataTopic" private val consumerGroup = "Consumer Group" (for { kafkaClient <- KafkaClientAlgebra.test[IO] schemaRegistry <- SchemaRegistry.test[IO] metadata <- MetadataAlgebra.make(metadataTopicName, consumerGroup, kafkaClient, schemaRegistry, consumeMetadataEnabled = true) } yield { runTests(metadata, kafkaClient) }).unsafeRunSync() private def runTests(metadataAlgebra: MetadataAlgebra[IO], kafkaClientAlgebra: KafkaClientAlgebra[IO]): Unit = { "MetadataAlgebraSpec" should { "retrieve none for non-existant topic" in { val subject = Subject.createValidated("Non-existantTopic").get metadataAlgebra.getMetadataFor(subject).unsafeRunSync() shouldBe None } "retrieve metadata" in { val subject = Subject.createValidated("subject1").get val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject) (for { record <- genericRecordsIO _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName) _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined) metadata <- metadataAlgebra.getMetadataFor(subject) } yield metadata shouldBe Some(TopicMetadataContainer(key, value, None, None))).unsafeRunSync() } "retrieve all metadata" in { val subject = Subject.createValidated("subject2").get val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject) (for { record <- genericRecordsIO _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName) _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined) allMetadata <- metadataAlgebra.getAllMetadata } yield allMetadata should have length 2).unsafeRunSync() } } } private def getMetadataGenericRecords(subject: Subject): (IO[(GenericRecord, Option[GenericRecord])], TopicMetadataV2Key, TopicMetadataV2Value) = { val key = 
TopicMetadataV2Key(subject) val value = TopicMetadataV2Value( StreamTypeV2.Entity, deprecated = false, Public, NonEmptyList.one(Slack.create("#channel").get), Instant.now, List(), None) (TopicMetadataV2.encode[IO](key, Some(value)), key, value) } }
Example 58
Source File: AvroKeyRecord.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.kafka.producer import com.pluralsight.hydra.avro.JsonConverter import hydra.core.transport.AckStrategy import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord final case class AvroKeyRecord( destination: String, keySchema: Schema, valueSchema: Schema, key: GenericRecord, payload: GenericRecord, ackStrategy: AckStrategy ) extends KafkaRecord[GenericRecord, GenericRecord] object AvroKeyRecord { def apply( destination: String, keySchema: Schema, valueSchema: Schema, keyJson: String, valueJson: String, ackStrategy: AckStrategy ): AvroKeyRecord = { val (key, value): (GenericRecord, GenericRecord) = { val keyConverter: String => GenericRecord = new JsonConverter[GenericRecord](keySchema).convert val valueConverter: String => GenericRecord = new JsonConverter[GenericRecord](valueSchema).convert (keyConverter(keyJson), valueConverter(valueJson)) } AvroKeyRecord(destination, keySchema, valueSchema, key, value, ackStrategy) } def apply( destination: String, keySchema: Schema, valueSchema: Schema, key: GenericRecord, value: GenericRecord, ackStrategy: AckStrategy ): AvroKeyRecord = { new AvroKeyRecord( destination, keySchema, valueSchema, key, value, ackStrategy ) } }
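A minimal usage sketch for the JSON-based factory above. It assumes the hydra producer classes shown in this example are on the classpath; the topic name, schemas and JSON payloads are made up for illustration.

import org.apache.avro.SchemaBuilder
import hydra.core.transport.AckStrategy
import hydra.kafka.producer.AvroKeyRecord

object AvroKeyRecordSketch extends App {
  // Hypothetical key/value schemas, for illustration only
  val keySchema = SchemaBuilder.record("Key").fields().requiredString("id").endRecord()
  val valueSchema = SchemaBuilder.record("Value").fields().requiredString("name").endRecord()

  // The factory parses the raw JSON payloads with JsonConverter internally
  val record = AvroKeyRecord(
    destination = "example-topic",
    keySchema = keySchema,
    valueSchema = valueSchema,
    keyJson = """{"id": "1"}""",
    valueJson = """{"name": "jane"}""",
    ackStrategy = AckStrategy.Replicated
  )
}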
Example 59
Source File: AvroRecordFactory.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.kafka.producer import akka.actor.ActorRef import akka.pattern.ask import akka.util import com.pluralsight.hydra.avro.JsonConverter import hydra.avro.registry.ConfluentSchemaRegistry import hydra.avro.resource.SchemaResource import hydra.avro.util.AvroUtils import hydra.common.config.ConfigSupport import hydra.common.logging.LoggingAdapter import hydra.core.akka.SchemaRegistryActor.{FetchSchemaRequest, FetchSchemaResponse} import hydra.core.ingest.HydraRequest import hydra.core.transport.ValidationStrategy.Strict import org.apache.avro.generic.GenericRecord import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} class AvroRecordFactory(schemaResourceLoader: ActorRef) extends KafkaRecordFactory[String, GenericRecord] with ConfigSupport with LoggingAdapter { private implicit val timeout = util.Timeout(3.seconds) override def build( request: HydraRequest )(implicit ec: ExecutionContext): Future[AvroRecord] = { for { (topic, subject) <- Future.fromTry(getTopicAndSchemaSubject(request)) schemaResource <- (schemaResourceLoader ? FetchSchemaRequest(subject)) .mapTo[FetchSchemaResponse] .map(_.schemaResource) record <- convert(schemaResource, request) } yield AvroRecord( topic, schemaResource.schema, getKey(request, record), record, request.ackStrategy ) } private def convert(schemaResource: SchemaResource, request: HydraRequest)( implicit ec: ExecutionContext ): Future[GenericRecord] = { val converter = new JsonConverter[GenericRecord]( schemaResource.schema, request.validationStrategy == Strict ) Future({ val converted = converter.convert(request.payload) converted }).recover { case ex => throw AvroUtils.improveException(ex, schemaResource, ConfluentSchemaRegistry.registryUrl(applicationConfig)) } } }
Example 60
Source File: SpecificTestUtil.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
package test import java.io.File import org.apache.avro.io.{DecoderFactory, EncoderFactory} import org.apache.avro.generic.{ GenericDatumReader, GenericRecord} import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase } import org.apache.avro.Schema import org.apache.avro.file.{ DataFileReader, DataFileWriter } import org.specs2.mutable.Specification object SpecificTestUtil extends Specification { def write[T <: SpecificRecordBase](file: File, records: List[T]) = { val userDatumWriter = new SpecificDatumWriter[T] val dataFileWriter = new DataFileWriter[T](userDatumWriter) dataFileWriter.create(records.head.getSchema, file); records.foreach(record => dataFileWriter.append(record)) dataFileWriter.close(); } def read[T <: SpecificRecordBase](file: File, records: List[T]) = { val dummyRecord = new GenericDatumReader[GenericRecord] val schema = new DataFileReader(file, dummyRecord).getSchema val userDatumReader = new SpecificDatumReader[T](schema) val dataFileReader = new DataFileReader[T](file, userDatumReader) // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader. var record: T = null.asInstanceOf[T] var sameRecord: T = null.asInstanceOf[T] val recordIter = records.iterator while (dataFileReader.hasNext) { sameRecord = dataFileReader.next(sameRecord) record = recordIter.next } dataFileReader.close() sameRecord.equals(record) } def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = { val fileName = s"${records.head.getClass.getName}" val fileEnding = "avro" val file = File.createTempFile(fileName, fileEnding) file.deleteOnExit() write(file, records) read(file, records) } def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = { val schema = record.getSchema val writer = new SpecificDatumWriter[T](schema) val out = new java.io.ByteArrayOutputStream() val encoder = EncoderFactory.get().binaryEncoder(out, null) writer.write(record, encoder) encoder.flush val ba = out.toByteArray ba.size must ===(1) ba(0) must ===(0) out.close val reader = new SpecificDatumReader[T](schema) val decoder = DecoderFactory.get().binaryDecoder(ba, null) val decoded = reader.read(record, decoder) decoded must ===(record) } }
Example 61
Source File: KafkaRecordFactory.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.kafka.producer import com.fasterxml.jackson.databind.JsonNode import hydra.avro.util.SchemaWrapper import hydra.core.ingest.RequestParams._ import hydra.core.ingest.{HydraRequest, RequestParams} import hydra.core.protocol.MissingMetadataException import hydra.core.transport.RecordFactory import hydra.kafka.producer.KafkaRecordFactory.RecordKeyExtractor import org.apache.avro.generic.GenericRecord import scala.util.{Failure, Success, Try} def getTopicAndSchemaSubject(request: HydraRequest): Try[(String, String)] = { val subject = request.metadataValue(RequestParams.HYDRA_SCHEMA_PARAM) request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match { case Some(topic) => Success(topic -> subject.getOrElse(topic)) case None => Failure( MissingMetadataException( HYDRA_KAFKA_TOPIC_PARAM, "No kafka topic present in the request." ) ) } } } object KafkaRecordFactory { trait RecordKeyExtractor[K, V] { def extractKeyValue(request: HydraRequest, record: V): Option[K] } object RecordKeyExtractor { implicit object StringRecordKeyExtractor extends RecordKeyExtractor[String, String] { override def extractKeyValue( request: HydraRequest, record: String ): Option[String] = { request .metadataValue(HYDRA_RECORD_KEY_PARAM) .map(key => JsonPathKeys.getKey(key, record)) } } implicit object JsonRecordKeyExtractor extends RecordKeyExtractor[String, JsonNode] { override def extractKeyValue( request: HydraRequest, record: JsonNode ): Option[String] = { request .metadataValue(HYDRA_RECORD_KEY_PARAM) .map(key => JsonPathKeys.getKey(key, record.toString)) } } implicit object SchemaKeyExtractor extends RecordKeyExtractor[String, GenericRecord] { override def extractKeyValue( request: HydraRequest, payload: GenericRecord ): Option[String] = { request .metadataValue(HYDRA_RECORD_KEY_PARAM) .map { key => JsonPathKeys.getKey(key, request.payload) } .orElse { val schema = payload.getSchema val wrapper = SchemaWrapper.from(schema) wrapper .validate() .get //we're throwing the exception here so that the request ends with a 400 wrapper.primaryKeys.map(payload.get) match { case Nil => None case keys => Some(keys.mkString("|")) } } } } } }
Example 62
Source File: IngestionFlowV2.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.ingest.services import java.io.IOException import cats.MonadError import cats.implicits._ import hydra.avro.registry.SchemaRegistry import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException import hydra.avro.util.SchemaWrapper import hydra.core.transport.ValidationStrategy import hydra.kafka.algebras.KafkaClientAlgebra import hydra.kafka.algebras.KafkaClientAlgebra.PublishResponse import hydra.kafka.model.TopicMetadataV2Request.Subject import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import scalacache._ import scalacache.guava._ import scalacache.memoization._ import scala.concurrent.duration._ import scala.util.{Failure, Try} final class IngestionFlowV2[F[_]: MonadError[*[_], Throwable]: Mode]( schemaRegistry: SchemaRegistry[F], kafkaClient: KafkaClientAlgebra[F], schemaRegistryBaseUrl: String) { import IngestionFlowV2._ import hydra.avro.convert.StringToGenericRecord._ implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper] private def getSchema(subject: String): F[Schema] = { schemaRegistry.getLatestSchemaBySubject(subject) .flatMap { maybeSchema => val schemaNotFound = SchemaNotFoundException(subject) MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, subject)) } } private def getSchemaWrapper(subject: Subject, isKey: Boolean): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) { val suffix = if (isKey) "-key" else "-value" getSchema(subject.value + suffix).map { sch => SchemaWrapper.from(sch) } } private def recover[A](subject: Subject, isKey: Boolean): PartialFunction[Throwable, Try[A]] = { val suffix = if (isKey) "-key" else "-value" val location = s"$schemaRegistryBaseUrl/subjects/${subject.value}$suffix/versions/latest/schema" val pf: PartialFunction[Throwable, Try[A]] = { case e: ValidationExtraFieldsError => Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]")) case e: InvalidLogicalTypeError => Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]")) case e: IOException => Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]")) case e => Failure(e) } pf } private def getSchemas(request: V2IngestRequest, topic: Subject): F[(GenericRecord, Option[GenericRecord])] = { val useStrictValidation = request.validationStrategy.getOrElse(ValidationStrategy.Strict) == ValidationStrategy.Strict def getRecord(payload: String, schema: Schema): Try[GenericRecord] = payload.toGenericRecord(schema, useStrictValidation) for { kSchema <- getSchemaWrapper(topic, isKey = true) vSchema <- getSchemaWrapper(topic, isKey = false) k <- MonadError[F, Throwable].fromTry( getRecord(request.keyPayload, kSchema.schema).recoverWith(recover(topic, isKey = true))) v <- MonadError[F, Throwable].fromTry( request.valPayload.traverse(getRecord(_, vSchema.schema)).recoverWith(recover(topic, isKey = false))) } yield (k, v) } def ingest(request: V2IngestRequest, topic: Subject): F[PublishResponse] = { getSchemas(request, topic).flatMap { case (key, value) => kafkaClient.publishMessage((key, value), topic.value).rethrow } } } object IngestionFlowV2 { final case class V2IngestRequest(keyPayload: String, valPayload: Option[String], validationStrategy: Option[ValidationStrategy]) final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message) final case class SchemaNotFoundAugmentedException(schemaNotFoundException: 
SchemaNotFoundException, topic: String) extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic") }
Example 63
Source File: IngestionFlow.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.ingest.services import java.io.IOException import cats.MonadError import cats.implicits._ import com.pluralsight.hydra.avro.JsonToAvroConversionException import hydra.avro.registry.SchemaRegistry import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException import hydra.avro.util.SchemaWrapper import hydra.core.ingest.HydraRequest import hydra.core.ingest.RequestParams.{HYDRA_KAFKA_TOPIC_PARAM, HYDRA_RECORD_KEY_PARAM} import hydra.core.transport.{AckStrategy, ValidationStrategy} import hydra.kafka.algebras.KafkaClientAlgebra import hydra.kafka.producer.AvroRecord import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import scalacache._ import scalacache.guava._ import scalacache.memoization._ import scala.concurrent.duration._ import scala.util.{Failure, Success, Try} final class IngestionFlow[F[_]: MonadError[*[_], Throwable]: Mode]( schemaRegistry: SchemaRegistry[F], kafkaClient: KafkaClientAlgebra[F], schemaRegistryBaseUrl: String ) { import IngestionFlow._ implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper] private def getValueSchema(topicName: String): F[Schema] = { schemaRegistry.getLatestSchemaBySubject(topicName + "-value") .flatMap { maybeSchema => val schemaNotFound = SchemaNotFoundException(topicName) MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, topicName)) } } private def getValueSchemaWrapper(topicName: String): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) { getValueSchema(topicName).map { valueSchema => SchemaWrapper.from(valueSchema) } } def ingest(request: HydraRequest): F[Unit] = { request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match { case Some(topic) => getValueSchemaWrapper(topic).flatMap { schemaWrapper => val useStrictValidation = request.validationStrategy == ValidationStrategy.Strict val payloadTryMaybe: Try[Option[GenericRecord]] = Option(request.payload) match { case Some(p) => convertToAvro(topic, schemaWrapper, useStrictValidation, p).map(avroRecord => Some(avroRecord.payload)) case None => Success(None) } val v1Key = getV1RecordKey(schemaWrapper, payloadTryMaybe, request) MonadError[F, Throwable].fromTry(payloadTryMaybe).flatMap { payloadMaybe => kafkaClient.publishStringKeyMessage((v1Key, payloadMaybe), topic).void } } case None => MonadError[F, Throwable].raiseError(MissingTopicNameException(request)) } } private def getV1RecordKey(schemaWrapper: SchemaWrapper, payloadTryMaybe: Try[Option[GenericRecord]], request: HydraRequest): Option[String] = { val headerV1Key = request.metadata.get(HYDRA_RECORD_KEY_PARAM) val optionString = schemaWrapper.primaryKeys.toList match { case Nil => None case l => l.flatMap(pkName => payloadTryMaybe match { case Success(payloadMaybe) => payloadMaybe.flatMap(p => Try(p.get(pkName)).toOption) case Failure(_) => None }).mkString("|").some } headerV1Key.orElse(optionString) } private def convertToAvro(topic: String, schemaWrapper: SchemaWrapper, useStrictValidation: Boolean, payloadString: String): Try[AvroRecord] = { Try(AvroRecord(topic, schemaWrapper.schema, None, payloadString, AckStrategy.Replicated, useStrictValidation)).recoverWith { case e: JsonToAvroConversionException => val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema" Failure(new AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]")) case e: IOException => val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema" Failure(new 
AvroConversionAugmentedException(s"${e.getMessage} [$location]")) case e => Failure(e) } } } object IngestionFlow { final case class MissingTopicNameException(request: HydraRequest) extends Exception(s"Missing the topic name in request with correlationId ${request.correlationId}") final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message) final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String) extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic") }
Example 64
Source File: StringToGenericRecord.scala From hydra with Apache License 2.0 | 5 votes |
package hydra.avro.convert import java.util.UUID import org.apache.avro.{LogicalTypes, Schema} import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import cats.implicits._ import org.apache.avro.util.Utf8 import scala.util.{Failure, Success, Try} object StringToGenericRecord { final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException( s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy" ) final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException( s"Invalid logical type. Expected $expected but received $received" ) implicit class ConvertToGenericRecord(s: String) { private def isUuidValid(s: String): Boolean = Try(UUID.fromString(s)).isSuccess private def checkLogicalTypes(record: GenericRecord): Try[Unit] = { import collection.JavaConverters._ def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] = avroField match { case g: GenericRecord => g.getSchema.getFields.asScala.toList .traverse(f => checkAll(g.get(f.name), f.schema.some)).void case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) => if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString)) case _ => Success(()) } val fields = record.getSchema.getFields.asScala.toList fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void } private def getAllPayloadFieldNames: Set[String] = { import spray.json._ def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match { case JsObject(f) => f.flatMap { case (k: String, v: JsValue) => loop(v, k.some) ++ Set(extraName.getOrElse("") + k) }.toSet case _ => Set.empty } loop(s.parseJson, None) } private def getAllSchemaFieldNames(schema: Schema): Set[String] = { import Schema.Type._ import collection.JavaConverters._ def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match { case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field => loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name) } case _ => Set.empty } loop(schema, None) } def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try { if (useStrictValidation) { val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema) if (diff.nonEmpty) throw ValidationExtraFieldsError(diff) } val decoderFactory = new DecoderFactory val decoder = decoderFactory.jsonDecoder(schema, s) val reader = new GenericDatumReader[GenericRecord](schema) reader.read(null, decoder) }.flatTap(checkLogicalTypes) } }
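A short sketch of the extension method defined above, assuming the hydra.avro.convert package (and its spray-json/cats dependencies) is on the classpath; the schema and payload are made up for illustration.

import org.apache.avro.SchemaBuilder
import hydra.avro.convert.StringToGenericRecord._

object StringToGenericRecordSketch extends App {
  // Illustrative schema only; any record schema works
  val schema = SchemaBuilder.record("User").fields()
    .requiredString("id").requiredInt("age").endRecord()

  // Strict validation rejects payload fields that are not present in the schema
  val record = """{"id": "u1", "age": 42}""".toGenericRecord(schema, useStrictValidation = true)
  println(record.map(_.get("id")))  // Success(u1) expected
}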
Example 65
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 66
Source File: AvroParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import io.eels.Predicate import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.hadoop.ParquetReader def apply(path: Path, predicate: Option[Predicate], projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = { // The parquet reader can use a projection by setting a projected schema onto a conf object def configuration(): Configuration = { val newconf = new Configuration(conf) projectionSchema.foreach { it => AvroReadSupport.setAvroReadSchema(newconf, it) AvroReadSupport.setRequestedProjection(newconf, it) } //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true") newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) AvroParquetReader.builder[GenericRecord](path) .withCompatibility(false) .withConf(configuration()) .withFilter(filter()) .build() .asInstanceOf[ParquetReader[GenericRecord]] } }
Example 67
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

object AvroParquetWriterFn extends Logging {

  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
}
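A minimal sketch of using the factory above to write one generic record, assuming the eel-sdk classes shown here and their default ParquetWriterConfig are available; the output path and schema are illustrative.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path
import io.eels.component.parquet.avro.AvroParquetWriterFn

object AvroParquetWriterFnSketch extends App {
  // Illustrative schema and path
  val schema = SchemaBuilder.record("person").fields().requiredString("name").endRecord()
  val writer = AvroParquetWriterFn(new Path("people.parquet"), schema)

  val record = new GenericData.Record(schema)
  record.put("name", "clint eastwood")
  writer.write(record)
  writer.close()
}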
Example 68
Source File: AvroParquetRowWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}

class AvroParquetRowWriter(path: Path, avroSchema: Schema)(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val skipCrc = config.getBoolean("eel.parquet.skipCrc")
  logger.info(s"Parquet writer will skipCrc = $skipCrc")

  private val writer = AvroParquetWriterFn(path, avroSchema)

  def write(record: GenericRecord): Unit = {
    writer.write(record)
  }

  def close(): Unit = {
    writer.close()
    if (skipCrc) {
      val crc = new Path("." + path.toString() + ".crc")
      logger.debug(s"Deleting crc $crc")
      if (fs.exists(crc)) fs.delete(crc, false)
    }
  }
}
Example 69
Source File: AvroWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro

import java.io.OutputStream
import java.util.concurrent.atomic.AtomicInteger

import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic
import org.apache.avro.generic.GenericRecord

class AvroWriter(structType: StructType, out: OutputStream) {

  private val schema = AvroSchemaFns.toAvroSchema(structType)
  private val datumWriter = new generic.GenericDatumWriter[GenericRecord](schema)
  private val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
  private val serializer = new RowSerializer(schema)
  private val _records = new AtomicInteger(0)

  dataFileWriter.create(schema, out)

  def write(row: Row): Unit = {
    val record = serializer.serialize(row)
    dataFileWriter.append(record)
    _records.incrementAndGet()
  }

  def records: Int = _records.get()

  def close(): Unit = {
    dataFileWriter.flush()
    dataFileWriter.close()
  }
}
Example 70
Source File: AvroDeserializer.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro

import com.typesafe.config.ConfigFactory
import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.Schema.Field
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8

import scala.collection.JavaConverters._

class AvroDeserializer(useJavaString: Boolean = ConfigFactory.load().getBoolean("eel.avro.java.string")) {

  val config = ConfigFactory.load()
  val deserializeAsNullable = config.getBoolean("eel.avro.deserializeAsNullable")

  var schema: StructType = null
  var fields: Array[Field] = null
  var range: Range = null

  def toScala(value: Any): Any = {
    value match {
      case record: GenericRecord => toValues(record)
      case utf8: Utf8 if useJavaString => value.asInstanceOf[Utf8].toString
      case col: java.util.Collection[Any] => col.asScala.toVector.map(toScala)
      case map: java.util.Map[_, _] => map.asScala.toMap.map { case (k, v) => toScala(k) -> toScala(v) }
      case other => other
    }
  }

  def toValues(record: GenericRecord): Vector[Any] = {
    val vector = Vector.newBuilder[Any]
    for (k <- 0 until record.getSchema.getFields.size) {
      val value = record.get(k)
      vector += toScala(value)
    }
    vector.result
  }

  def toRow(record: GenericRecord): Row = {
    // take the schema from the first record
    if (schema == null) {
      schema = AvroSchemaFns.fromAvroSchema(record.getSchema, deserializeAsNullable)
    }
    Row(schema, toValues(record))
  }
}
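A small sketch of converting a GenericRecord into an eel Row with the deserializer above. It assumes eel's reference configuration (the "eel.avro.*" keys) is on the classpath; the schema and values are illustrative.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import io.eels.component.avro.AvroDeserializer

object AvroDeserializerSketch extends App {
  // Illustrative schema and record
  val schema = SchemaBuilder.record("person").fields()
    .requiredString("name").requiredInt("age").endRecord()
  val record = new GenericData.Record(schema)
  record.put("name", "elton john")
  record.put("age", 70)

  // The first record seen fixes the eel StructType; later records reuse it
  val row = new AvroDeserializer().toRow(record)
  println(row.values)  // Vector(elton john, 70) expected
}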
Example 71
Source File: IndexWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import com.fasterxml.jackson.databind.JsonNode import com.typesafe.config.ConfigFactory import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} import org.apache.avro.generic.GenericRecord import org.apache.log4j.LogManager import org.joda.time.format.ISODateTimeFormat import scala.util.control.NonFatal case class IndexWithKeyFields(uuid: String, lastModified: java.sql.Timestamp, path: String) extends GenericRecord with CsvGenerator { override def put(key: String, v: scala.Any): Unit = ??? override def get(key: String): AnyRef = key match { case "uuid" => uuid case "lastModified" => java.lang.Long.valueOf(lastModified.getTime) case "path" => path } override def put(i: Int, v: scala.Any): Unit = ??? override def get(i: Int): AnyRef = i match { case 0 => uuid case 1 => java.lang.Long.valueOf(lastModified.getTime) case 2 => path case _ => throw new IllegalArgumentException } override def getSchema: Schema = IndexWithSystemFields.schema override def csv: String = (if (uuid == null) "" else uuid) + "," + (if (lastModified == null) "" else ISODateTimeFormat.dateTime.print(lastModified.getTime)) + "," + (if (path == null) "" else path) } object IndexWithKeyFields extends ObjectExtractor[IndexWithKeyFields] { private val logger = LogManager.getLogger(IndexWithSystemFields.getClass) // AVRO-2065 - doesn't allow union over logical type, so we can't make timestamp column nullable. val timestampMilliType: Schema = LogicalTypes.timestampMillis.addToSchema(Schema.create(Schema.Type.LONG)) val schema: Schema = SchemaBuilder .record("IndexWithSystemFields").namespace("cmwell.analytics") .fields .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault .name("lastModified").`type`(timestampMilliType).noDefault .name("path").`type`.unionOf.stringType.and.nullType.endUnion.noDefault .endRecord private val config = ConfigFactory.load val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-uuid-lastModified-path") def includeFields: String = { // Note that 'quad' is not included in this list val fields = "uuid,lastModified,path" .split(",") .map(name => s""""system.$name"""") .mkString(",") s""""_source": [$fields]""" } def extractFromJson(hit: JsonNode): IndexWithKeyFields = { val system = hit.findValue("_source").findValue("system") def extractString(name: String): String = system.findValue(name) match { case x: JsonNode => x.asText case _ => null } // Extracting date values as Long - as a java.sql.Date might be better def extractDate(name: String): java.sql.Timestamp = system.findValue(name) match { case x: JsonNode => try { new java.sql.Timestamp(ISODateTimeFormat.dateTime.parseDateTime(x.asText).getMillis) } catch { case NonFatal(ex) => logger.warn(s"Failed conversion of date value: $x", ex) throw ex } case _ => null } IndexWithKeyFields( uuid = extractString("uuid"), lastModified = extractDate("lastModified"), path = extractString("path")) } }
Example 72
Source File: AvroSEBasicTest.scala From akka-serialization-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.serializer.avro4s import com.github.dnvriend.TestSpec import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 } import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 } import com.sksamuel.avro4s.{ AvroSchema, RecordFormat } import org.apache.avro.Schema import org.apache.avro.file.SeekableByteArrayInput import org.apache.avro.generic.{ GenericDatumReader, GenericRecord } import org.apache.avro.io.DecoderFactory // SE stands for Schema Evolution class AvroSEBasicTest extends TestSpec { @Override def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = { val serveReader = new GenericDatumReader[GenericRecord](schema) serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null)) } val title = "Moby-Dick; or, The Whale" val year = 1851 val editor = "Scala Books" "AvroSEBasicTest" should "deserialize old class with renamed field" in { // in this case, two different serializers can be used val obj = ChangedBookV1(title, year) val serializerV1 = new BookSerializerV1 val bytes: Array[Byte] = serializerV1.toBinary(obj) val serializerV2 = new BookSerializerV2 serializerV2.fromBinary(bytes) should matchPattern { case ChangedBookV2(`title`, `year`) ⇒ } } it should "deserialize old class without new field" in { val obj = ChangedBookV2(title, year) val serializerV2 = new BookSerializerV2 val bytes: Array[Byte] = serializerV2.toBinary(obj) val in = new SeekableByteArrayInput(bytes) val schema2 = AvroSchema[ChangedBookV2] val schema3 = AvroSchema[ChangedBookV3] val gdr = new GenericDatumReader[GenericRecord](schema2, schema3) val binDecoder = DecoderFactory.get().binaryDecoder(in, null) val record: GenericRecord = gdr.read(null, binDecoder) val format = RecordFormat[ChangedBookV3] val r = format.from(record) r should matchPattern { case ChangedBookV3(`title`, `year`, "") ⇒ } } it should "deserialize old class with dropped field" in { val obj = ChangedBookV3(title, year, editor) val serializerV3 = new BookSerializerV3 val bytes: Array[Byte] = serializerV3.toBinary(obj) val in = new SeekableByteArrayInput(bytes) val schema3 = AvroSchema[ChangedBookV3] val schema4 = AvroSchema[ChangedBookV4] val gdr = new GenericDatumReader[GenericRecord](schema3, schema4) val binDecoder = DecoderFactory.get().binaryDecoder(in, null) val record: GenericRecord = gdr.read(null, binDecoder) val format = RecordFormat[ChangedBookV4] val r = format.from(record) r should matchPattern { case ChangedBookV4(`title`, `editor`) ⇒ } } }
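The same writer/reader schema resolution can be shown with plain Avro generic classes, independent of avro4s. This is a self-contained sketch; the Book schemas below are made up for illustration.

import java.io.ByteArrayOutputStream
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object SchemaResolutionSketch extends App {
  // Writer schema: v1 of the record (illustrative)
  val writerSchema = SchemaBuilder.record("Book").fields()
    .requiredString("title").requiredInt("year").endRecord()
  // Reader schema: v2 adds an "editor" field with a default, so v1 bytes still decode
  val readerSchema = SchemaBuilder.record("Book").fields()
    .requiredString("title").requiredInt("year")
    .name("editor").`type`().stringType().stringDefault("").endRecord()

  val record = new GenericData.Record(writerSchema)
  record.put("title", "Moby-Dick; or, The Whale")
  record.put("year", 1851)

  val out = new ByteArrayOutputStream()
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  new GenericDatumWriter[GenericRecord](writerSchema).write(record, encoder)
  encoder.flush()

  val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
  val decoded = new GenericDatumReader[GenericRecord](writerSchema, readerSchema).read(null, decoder)
  println(decoded.get("editor"))  // prints the default value ""
}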
Example 73
Source File: AvroIOTest.scala From ratatool with Apache License 2.0 | 5 votes |
package com.spotify.ratatool.io import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File} import com.spotify.ratatool.Schemas import com.spotify.ratatool.avro.specific.TestRecord import org.apache.avro.generic.GenericRecord import com.spotify.ratatool.scalacheck._ import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class AvroIOTest extends AnyFlatSpec with Matchers { private val genericSchema = Schemas.avroSchema private val genericGen = genericRecordOf(genericSchema) private val genericData = (1 to 100).flatMap(_ => genericGen.sample) private val specificSchema = TestRecord.getClassSchema private val specificGen = specificRecordOf[TestRecord] private val specificData = (1 to 100).flatMap(_ => specificGen.sample) "AvroIO" should "work with generic record and stream" in { val out = new ByteArrayOutputStream() AvroIO.writeToOutputStream(genericData, genericSchema, out) val in = new ByteArrayInputStream(out.toByteArray) val result = AvroIO.readFromInputStream[GenericRecord](in).toList result should equal (genericData) } it should "work with generic record and file" in { val file = File.createTempFile("ratatool-", ".avro") file.deleteOnExit() AvroIO.writeToFile(genericData, genericSchema, file) val result = AvroIO.readFromFile[GenericRecord](file).toList result should equal (genericData) } it should "work with specific record and stream" in { val out = new ByteArrayOutputStream() AvroIO.writeToOutputStream(specificData, specificSchema, out) val in = new ByteArrayInputStream(out.toByteArray) val result = AvroIO.readFromInputStream[TestRecord](in).toList result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_))) } it should "work with specific record and file" in { val file = File.createTempFile("ratatool-", ".avro") file.deleteOnExit() AvroIO.writeToFile(specificData, specificSchema, file) val result = AvroIO.readFromFile[TestRecord](file).toList result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_))) } }
Example 74
Source File: AvroIO.scala From ratatool with Apache License 2.0 | 5 votes |
package com.spotify.ratatool.io import java.io.{File, InputStream, OutputStream} import java.nio.ByteBuffer import java.nio.channels.SeekableByteChannel import com.google.common.io.ByteStreams import org.apache.avro.Schema import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput} import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} import org.apache.avro.io.{DatumReader, DatumWriter} import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter} import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord} import org.apache.beam.sdk.io.FileSystems import org.apache.beam.sdk.io.fs.MatchResult.Metadata import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag def writeToOutputStream[T: ClassTag](data: Iterable[T], schema: Schema, os: OutputStream): Unit = { val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os) data.foreach(fileWriter.append) fileWriter.close() } def getAvroSchemaFromFile(path: String): Schema = { require(FileStorage(path).exists, s"File `$path` does not exist!") val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro")) require(files.nonEmpty, s"File `$path` does not contain avro files") val reader = new GenericDatumReader[GenericRecord]() val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader) dfr.getSchema } private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput { require(meta.isReadSeekEfficient) private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel] override def read(b: Array[Byte], off: Int, len: Int): Int = in.read(ByteBuffer.wrap(b, off, len)) override def tell(): Long = in.position() override def length(): Long = in.size() override def seek(p: Long): Unit = in.position(p) override def close(): Unit = in.close() } }
Example 75
Source File: ParquetSampler.scala From ratatool with Apache License 2.0 | 5 votes |
package com.spotify.ratatool.samplers

import com.spotify.ratatool.io.ParquetIO
import org.apache.avro.generic.GenericRecord
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.mutable.ListBuffer

class ParquetSampler(path: String, protected val seed: Option[Long] = None) extends Sampler[GenericRecord] {

  private val logger: Logger = LoggerFactory.getLogger(classOf[ParquetSampler])

  override def sample(n: Long, head: Boolean): Seq[GenericRecord] = {
    require(n > 0, "n must be > 0")
    require(head, "Parquet can only be used with --head")
    logger.info("Taking a sample of {} from Parquet {}", n, path)
    val result = ListBuffer.empty[GenericRecord]
    val iterator = ParquetIO.readFromFile(path)
    while (result.length < n && iterator.hasNext) {
      result.append(iterator.next())
    }
    result.toList
  }
}
Example 76
Source File: CSVAutoReadersTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.readers import com.salesforce.op.test.PassengerSparkFixtureTest import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner import scala.collection.JavaConverters._ @RunWith(classOf[JUnitRunner]) class CSVAutoReadersTest extends FlatSpec with PassengerSparkFixtureTest { private val expectedSchema = new Schema.Parser().parse(resourceFile(name = "PassengerAuto.avsc")) private val allFields = expectedSchema.getFields.asScala.map(_.name()) private val keyField: String = allFields.head Spec[CSVAutoReader[_]] should "read in data correctly and infer schema" in { val dataReader = DataReaders.Simple.csvAuto[GenericRecord]( path = Some(passengerCsvWithHeaderPath), key = _.get(keyField).toString ) val data = dataReader.readRDD().collect() data.foreach(_ shouldBe a[GenericRecord]) data.length shouldBe 8 val inferredSchema = data.head.getSchema inferredSchema shouldBe expectedSchema } it should "read in data correctly and infer schema based with headers provided" in { val dataReader = DataReaders.Simple.csvAuto[GenericRecord]( path = Some(passengerCsvPath), key = _.get(keyField).toString, headers = allFields ) val data = dataReader.readRDD().collect() data.foreach(_ shouldBe a[GenericRecord]) data.length shouldBe 8 val inferredSchema = data.head.getSchema inferredSchema shouldBe expectedSchema } }
Example 77
Source File: CSVReaders.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.readers import com.salesforce.op.OpParams import com.salesforce.op.utils.io.csv.{CSVInOut, CSVOptions, CSVToAvro} import org.apache.avro.generic.GenericRecord import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, SparkSession} import scala.reflect.ClassTag import scala.reflect.runtime.universe.WeakTypeTag class ConditionalCSVReader[T <: GenericRecord : ClassTag : WeakTypeTag] ( readPath: Option[String], key: T => String, schema: String, options: CSVOptions = CSVDefaults.CSVOptions, timeZone: String = CSVDefaults.TimeZone, val conditionalParams: ConditionalParams[T] ) extends CSVReader[T](readPath = readPath, key = key, schema = schema, options = options, timeZone = timeZone) with ConditionalDataReader[T]
Example 78
Source File: AvroInOutTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.avro import java.io.{File, FileNotFoundException, FileWriter} import java.nio.file.Paths import com.salesforce.op.test.TestSparkContext import com.salesforce.op.utils.io.avro.AvroInOut._ import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class AvroInOutTest extends FlatSpec with TestSparkContext { val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc" val avroFilePath = s"$testDataDir/PassengerDataAll.avro" val avroFileRecordCount = 891 val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration) lazy val avroTemp: String = tempDir + "/avro-inout-test" Spec(AvroInOut.getClass) should "creates RDD from an avro file" in { val res = readPathSeq(avroFilePath, withCount = true, deepCopy = true, persist = false) res shouldBe a[RDD[_]] res.count shouldBe avroFileRecordCount } it should "creates RDD from a sequence of avro files" in { val res = readPathSeq(s"$avroFilePath,$avroFilePath") res.count shouldBe avroFileRecordCount*2 } it should "create RDD from a mixed sequence of valid and invalid avro files" in { val res = readPathSeq(s"badfile/path1,$avroFilePath,badfile/path2,$avroFilePath,badfile/path3") res.count shouldBe avroFileRecordCount*2 } it should "throw an error if passed in avro files are invalid" in { val error = intercept[IllegalArgumentException](readPathSeq("badfile/path1,badfile/path2")) error.getMessage shouldBe "No valid directory found in path 'badfile/path1,badfile/path2'" } it should "creates Some(RDD) from an avro file" in { val res = read(avroFilePath) res.size shouldBe 1 res.get shouldBe an[RDD[_]] res.get.count shouldBe avroFileRecordCount } it should "create None from an invalid avro file" in { val res = read("badfile/path") res shouldBe None } Spec[AvroWriter[_]] should "writeAvro to filesystem" in { val avroData = readPathSeq(avroFilePath).asInstanceOf[RDD[GenericRecord]] val avroSchema = loadFile(avroSchemaPath) val error = intercept[FileNotFoundException](hdfs.listStatus(new Path(avroTemp))) error.getMessage shouldBe s"File $avroTemp does not exist" AvroWriter(avroData).writeAvro(avroTemp, avroSchema) val hdfsFiles = hdfs.listStatus(new Path(avroTemp)) filter (x => x.getPath.getName.contains("part")) val res = readPathSeq((for { x <- hdfsFiles } yield avroTemp + "/" + x.getPath.getName).mkString(",")) res.count shouldBe avroFileRecordCount } it should "checkPathsExist" in { val tmpDir = Paths.get(File.separator, "tmp").toFile val f1 = new File(tmpDir, "avroinouttest") f1.delete() val w = new FileWriter(f1) w.write("just checking") w.close() val f2 = new File(tmpDir, "thisfilecannotexist") f2.delete() val f3 = new File(tmpDir, "this file cannot exist") f3.delete() assume(f1.exists && !f2.exists && !f3.exists) // check for one dir being invalid in the path amongst two selectExistingPaths(s"$f1,$f2") shouldBe f1.toString // check if all dirs in the path are invalid then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f2.toString) } // also, check if all dirs in the path are invalid ( in a different way ) then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f3.toString) } // check for one dir being invalid ( in a different way ) in the path amongst the two dirs in it selectExistingPaths(s"$f1,$f3") shouldBe f1.toString // check for paths order insensitivity 
selectExistingPaths(s"$f3,$f1") shouldBe f1.toString // check for an exception if the path is an empty string intercept[IllegalArgumentException] { selectExistingPaths("") } } }
Example 79
Source File: RichGenericRecordTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.avro import com.salesforce.op.test.{TestCommon, TestSparkContext} import com.salesforce.op.utils.io.avro.AvroInOut import org.apache.avro.generic.GenericRecord import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class RichGenericRecordTest extends FlatSpec with Matchers with TestSparkContext with TestCommon { import com.salesforce.op.utils.avro.RichGenericRecord._ val dataPath = resourceFile(parent = "../test-data", name = s"PassengerData.avro").getPath val passengerData = AvroInOut.read[GenericRecord](dataPath).getOrElse(throw new Exception("Couldn't read data")) val firstRow = passengerData.sortBy(_.get("passengerId").toString.toInt).first Spec[RichGenericRecord] should "get value of Int" in { val id = firstRow.getValue[Int]("passengerId") id shouldBe Some(1) } it should "get value of Double" in { val survived = firstRow.getValue[Double]("survived") survived shouldBe Some(0.0) } it should "get value of Long" in { val height = firstRow.getValue[Long]("height") height shouldBe Some(168L) } it should "get value of String" in { val gender = firstRow.getValue[String]("gender") gender shouldBe Some("Female") } it should "get value of Char" in { val gender = firstRow.getValue[Char]("gender") gender shouldBe Some("Female") } it should "get value of Float" in { val age = firstRow.getValue[Float]("age") age shouldBe Some(32.0) } it should "get value of Short" in { val weight = firstRow.getValue[Short]("weight") weight shouldBe Some(67) } it should "throw error for invalid field" in { val error = intercept[IllegalArgumentException](firstRow.getValue[Short]("invalidField")) error.getMessage shouldBe "requirement failed: invalidField is not found in Avro schema!" } }
Example 80
Source File: RichGenericRecord.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.avro

import org.apache.avro.generic.GenericRecord

object RichGenericRecord {

  private def javaConvert(in: Any): Any = {
    in match {
      case s: java.lang.String => s
      case s: org.apache.avro.util.Utf8 => s.toString
      case i: java.lang.Integer => i.toInt
      case d: java.lang.Double => d.toDouble
      case l: java.lang.Long => l.toLong
      case b: java.lang.Boolean => b
      case f: java.lang.Float => f.toFloat
      case s: java.lang.Short => s.toShort
      case c: java.lang.Character => c.toChar
      case x => throw new NotImplementedError(s"${x.getClass} is not an implemented type")
    }
  }
}
Example 81
Source File: IndexWithCompleteDocument.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import com.fasterxml.jackson.databind.JsonNode import com.typesafe.config.ConfigFactory import org.apache.avro.generic.GenericRecord import org.apache.avro.{Schema, SchemaBuilder} case class IndexWithCompleteDocument(uuid: String, document: String) extends GenericRecord with CsvGenerator { override def put(key: String, v: scala.Any): Unit = ??? override def get(key: String): AnyRef = key match { case "uuid" => uuid case "document" => document case _ => throw new IllegalArgumentException } override def put(i: Int, v: scala.Any): Unit = ??? override def get(i: Int): AnyRef = i match { case 0 => uuid case 1 => document case _ => throw new IllegalArgumentException } override def getSchema: Schema = IndexWithCompleteDocument.schema // Specifically don't implement CsvGenerator.csv since it is guaranteed to be invalid CSV - force use of Parquet. } object IndexWithCompleteDocument extends ObjectExtractor[IndexWithCompleteDocument] { val schema: Schema = SchemaBuilder .record("IndexWithCompleteDocument").namespace("cmwell.analytics") .fields .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault .name("document").`type`.unionOf.stringType.and.nullType.endUnion.noDefault .endRecord private val config = ConfigFactory.load val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-complete-document") def includeFields: String = s""""_source": "*"""" def extractFromJson(hit: JsonNode): IndexWithCompleteDocument = IndexWithCompleteDocument( uuid = hit.findValue("_id").asText, document = hit.findValue("_source").toString) }
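A short sketch of feeding an Elasticsearch hit into the extractor above, assuming the CM-Well analytics classes shown here are on the classpath; the hit JSON is hypothetical and only _id and _source are read.

import com.fasterxml.jackson.databind.ObjectMapper
import cmwell.analytics.data.IndexWithCompleteDocument

object ExtractFromJsonSketch extends App {
  // Hypothetical ES hit, for illustration only
  val hitJson = """{"_id": "uuid-123", "_source": {"system": {"path": "/example"}}}"""
  val hit = new ObjectMapper().readTree(hitJson)

  val extracted = IndexWithCompleteDocument.extractFromJson(hit)
  println(extracted.uuid)      // uuid-123
  println(extracted.document)  // the _source subtree rendered as a JSON string
}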
Example 82
Source File: DataWriterFactory.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import java.io.File import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.util.Shard import org.apache.avro.generic.GenericRecord import org.apache.commons.io.FileUtils import org.apache.parquet.hadoop.metadata.CompressionCodecName import scala.concurrent.ExecutionContextExecutor trait DataWriterFactory[T <: GenericRecord] { def apply(shard: Shard): DataWriter[T] } object DataWriterFactory { private val compressionCodec = CompressionCodecName.SNAPPY def file[T <: GenericRecord with CsvGenerator](format: String, objectExtractor: ObjectExtractor[T], outDirectory: String): Shard => DataWriter[T] = { val extension = s".$format" + (if (format == "parquet") s"${compressionCodec.getExtension}" else "") // Generate a meaningful file name for the target file name based on the source shard index name and shard number. (sourceShard: Shard) => { val outFile: File = Paths.get(outDirectory, s"part-r-${sourceShard.indexName}.${sourceShard.shard}$extension").toFile if (outFile.exists) FileUtils.forceDelete(outFile) new File(outFile.getParent).mkdirs() FileDataWriter[T](format, objectExtractor.schema, outFile.toString, compressionCodec) } } def index[T <: GenericRecord](indexMap: Map[String, String], // source-index -> target-index esEndpoint: String) (implicit system: ActorSystem, executionContext: ExecutionContextExecutor, actorMaterializer: ActorMaterializer ): Shard => DataWriter[T] = { (sourceShard: Shard) => { val targetIndex = indexMap(sourceShard.indexName) new IndexDataWriter[T](indexName = targetIndex, esEndpoint = esEndpoint) } } }
Example 83
Source File: AvroDataOutputStream.scala From avro4s with Apache License 2.0 | 5 votes |
package com.sksamuel.avro4s import java.io.OutputStream import org.apache.avro.Schema import org.apache.avro.file.{CodecFactory, DataFileWriter} import org.apache.avro.generic.{GenericDatumWriter, GenericRecord} case class AvroDataOutputStream[T](os: OutputStream, codec: CodecFactory) (implicit encoder: Encoder[T]) extends AvroOutputStream[T] { val resolved = encoder.resolveEncoder() val (writer, writeFn) = resolved.schema.getType match { case Schema.Type.DOUBLE | Schema.Type.LONG | Schema.Type.BOOLEAN | Schema.Type.STRING | Schema.Type.INT | Schema.Type.FLOAT => val datumWriter = new GenericDatumWriter[T](resolved.schema) val dataFileWriter = new DataFileWriter[T](datumWriter) dataFileWriter.setCodec(codec) dataFileWriter.create(resolved.schema, os) (dataFileWriter, (t: T) => dataFileWriter.append(t)) case _ => val datumWriter = new GenericDatumWriter[GenericRecord](resolved.schema) val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter) dataFileWriter.setCodec(codec) dataFileWriter.create(resolved.schema, os) (dataFileWriter, (t: T) => { val record = resolved.encode(t).asInstanceOf[GenericRecord] dataFileWriter.append(record) }) } override def close(): Unit = { flush() writer.close() } override def write(t: T): Unit = { writeFn(t) } override def flush(): Unit = writer.flush() override def fSync(): Unit = writer.fSync() }
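A minimal sketch of writing a case class through the stream above, assuming this avro4s version derives Encoder instances for case classes implicitly (as the FieldMapper example earlier suggests); the Person type and codec choice are illustrative.

import java.io.ByteArrayOutputStream
import com.sksamuel.avro4s.AvroDataOutputStream
import org.apache.avro.file.CodecFactory

// Illustrative type, not part of avro4s
case class Person(name: String, age: Int)

object AvroDataOutputStreamSketch extends App {
  val out = new ByteArrayOutputStream()
  // null codec = no compression; the implicit Encoder[Person] is assumed to be derived
  val avroOut = AvroDataOutputStream[Person](out, CodecFactory.nullCodec())
  avroOut.write(Person("clint eastwood", 86))
  avroOut.close()

  println(out.toByteArray.length)  // Avro data-file bytes, including the embedded schema
}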
Example 84
Source File: Job.scala From spark-avro-compactor with Apache License 2.0 | 5 votes |
package ie.ianduffy.spark.avro.compactor import ie.ianduffy.spark.avro.compactor.Utils._ import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.avro.mapred.AvroKey import org.apache.avro.mapreduce.AvroKeyOutputFormat import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.NullWritable import org.apache.spark.sql.SparkSession import org.slf4j.LoggerFactory object Job { private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", "")) def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = { val schema: Schema = { val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject) val id: Int = latestSchemaMetadata.getId schemaRegistry.getById(id) } implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration sparkConfig.set("avro.schema.input.key", schema.toString()) sparkConfig.set("avro.schema.output.key", schema.toString()) val inputPath: Path = new Path(jobConfig.input) val outputPath: Path = new Path(jobConfig.output) val fs: FileSystem = inputPath.getFileSystem(sparkConfig) // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException if (jobConfig.overrideOutput) fs.delete(outputPath, true) // from fileSystem prefix with s3 the default is 64MB and can be overwitten by fs.s3.block.size // from fileSystem prefix with s3a the default is 32MB and can be overwitten by setting fs.s3a.block.size val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath) // Where inputPath is of the form s3://some/path val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt) log.debug( s"""outputBlocksize: $outputBlocksize | inputPathSize: $inputPathSize | splitSize: $numPartitions """.stripMargin) val rdd = readHadoopFile(spark, inputPath.toString) rdd.coalesce(numPartitions) .saveAsNewAPIHadoopFile( outputPath.toString, classOf[AvroKey[GenericRecord]], classOf[NullWritable], classOf[AvroKeyOutputFormat[GenericRecord]], sparkConfig ) } }
Example 85
Source File: Utils.scala From spark-avro-compactor with Apache License 2.0 | 5 votes |
package ie.ianduffy.spark.avro.compactor

import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession

object Utils {

  def createSparkSession: SparkSession =
    SparkSession
      .builder
      .appName("avro-compactor")
      .getOrCreate

  def readHadoopFile(spark: SparkSession, path: String)(implicit sparkConfig: Configuration) = {
    spark.sparkContext.newAPIHadoopFile(
      path,
      classOf[AvroKeyInputFormat[GenericRecord]],
      classOf[AvroKey[GenericRecord]],
      classOf[NullWritable],
      sparkConfig
    )
  }
}
Example 86
Source File: AvroToParquetWriter.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // lazy initialization
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")

  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }
    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }

    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }
}
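A minimal sketch of how this writer might be driven, assuming a GenericRecord value named record and hypothetical temp/output locations:

val writer = new AvroToParquetWriter(
  "file:///tmp/etl/work/events-part-0",  // temp file, typically on the local file system
  "hdfs:///data/events/events-part-0"    // final output location
)
writer.write(record)  // the underlying AvroParquetWriter is created lazily on the first write
writer.commit()       // closes the temp file and copies/renames it to the output path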
Example 87
Source File: JsonToParquetPipelineFactory.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.pipeline

import org.apache.avro.generic.GenericRecord
import yamrcraft.etlite.PipelineSettings
import yamrcraft.etlite.transformers.{JsonToAvroTransformer, Message}
import yamrcraft.etlite.writers.{AvroToParquetWriter, TimePartitioningWriter}

class JsonToParquetPipelineFactory extends PipelineFactory[Message[GenericRecord]] {

  def createPipeline(settings: PipelineSettings, jobId: Long, partitionId: Int): Pipeline[Message[GenericRecord]] =
    new Pipeline(
      new JsonToAvroTransformer(settings.transformerConfig),
      new TimePartitioningWriter(
        settings.writerConfig,
        jobId,
        partitionId,
        (tempFile, outputFile) => new AvroToParquetWriter(tempFile, outputFile))
    )
}
Example 88
Source File: JsonToAvroTransformer.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.transformers

import com.typesafe.config.Config
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import play.api.libs.json.Json
import yamrcraft.etlite.utils.ConfigConversions._
import yamrcraft.etlite.utils.{FileUtils, JsonAvroConverter, TimeUtils}
import yamrcraft.etlite.{ErrorType, EtlException}

class JsonToAvroTransformer(config: Config) extends Transformer[Message[GenericRecord]] {

  val converter = new JsonAvroConverter()

  // config settings
  val timestampField = config.getString("timestamp-field")
  val timestampFieldFormat = config.getString("timestamp-field-format")
  val defaultSchemaFileName = config.getString("default-schema-file")

  val (schemaSelectionField, schemas) = {
    config.hasPath("schema-selection") match {
      case true =>
        (Some(config.getString("schema-selection.field")),
          Some(config.getConfig("schema-selection.schemas").asMap.map { case (k, v) => (k, createSchema(v)) }))
      case false =>
        (None, None)
    }
  }

  val defaultSchema: Schema = createSchema(defaultSchemaFileName)

  @throws(classOf[EtlException])
  override def transform(inbound: InboundMessage): Message[GenericRecord] = {
    try {
      val schema = getSchema(inbound.msg)
      val record = converter.convertToGenericDataRecord(inbound.msg, schema)
      Message[GenericRecord](
        record,
        schema.getName,
        extractTimestamp(record)
      )
    } catch {
      case e: EtlException => throw e
      case e: Exception => throw new EtlException(ErrorType.TransformationError, e)
    }
  }

  private def createSchema(path: String): Schema = new Schema.Parser().parse(FileUtils.readContent(path))

  private def getSchema(msg: Array[Byte]): Schema = {
    if (schemaSelectionField.isEmpty) {
      defaultSchema
    } else {
      val msgAsString = new String(msg, "UTF8")
      val msgJson = Json.parse(msgAsString)
      val selectionValue = (msgJson \ schemaSelectionField.get).asOpt[String]
      schemas.get.getOrElse(selectionValue.get, defaultSchema)
    }
  }

  @throws(classOf[EtlException])
  private def extractTimestamp(event: GenericRecord): Long = {
    try {
      (event.get(timestampField): Any) match {
        case ts: Long => ts
        case ts: String => TimeUtils.stringTimeToLong(ts, timestampFieldFormat)
        case _ => throw new RuntimeException("timestamp field is neither a Long nor a String.")
      }
    } catch {
      case e: Exception => throw new EtlException(ErrorType.PartitionTimestampError, e)
    }
  }
}
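The transformer reads its settings from a Typesafe Config object. The keys in the sketch below mirror the ones accessed in the constructor, while the concrete values (field names, schema paths, date format) are placeholders only:

import com.typesafe.config.ConfigFactory

val transformerConfig = ConfigFactory.parseString(
  """
    |timestamp-field = "timestamp"
    |timestamp-field-format = "yyyy-MM-dd'T'HH:mm:ss"
    |default-schema-file = "hdfs:///schemas/event.avsc"
    |schema-selection {
    |  field = "type"
    |  schemas {
    |    click = "hdfs:///schemas/click.avsc"
    |    view = "hdfs:///schemas/view.avsc"
    |  }
    |}
  """.stripMargin)

val transformer = new JsonToAvroTransformer(transformerConfig)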
Example 89
Source File: AvroDecoder.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import java.util.Arrays.copyOfRange

import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}

sealed trait AvroDecoder[T] extends Decoder[T] {

  def props: VerifiableProperties

  protected val schema = new Schema.Parser().parse(props.getString(Avro.SCHEMA))
  protected val skipBytes = props.getInt(Avro.SKIP_BYTES, 0)

  protected val reader = new GenericDatumReader[GenericRecord](schema)
  protected val decoder = Avro.recordDecoder(reader)

  private def skip(bytes: Array[Byte], size: Int): Array[Byte] = {
    val length = bytes.length
    length - size match {
      case remaining if remaining > 0 => copyOfRange(bytes, size, length)
      case _ => new Array[Byte](0)
    }
  }

  def parse(bytes: Array[Byte]): GenericRecord = {
    val data = if (skipBytes == 0) bytes else skip(bytes, skipBytes)
    decoder(data)
  }
}

class AvroRecordDecoder(val props: VerifiableProperties) extends AvroDecoder[GenericRecord] {
  override def fromBytes(bytes: Array[Byte]): GenericRecord = parse(bytes)
}

class AvroMapDecoder(val props: VerifiableProperties) extends AvroDecoder[Map[String, Any]] {
  override def fromBytes(bytes: Array[Byte]): Map[String, Any] = Avro.toMap(parse(bytes))
}

class AvroJsonDecoder(val props: VerifiableProperties) extends AvroDecoder[String] {
  override def fromBytes(bytes: Array[Byte]): String = Avro.toJson(parse(bytes))
}
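A minimal sketch of exercising one of these decoders outside a Kafka consumer, assuming a parsed Schema named schema and raw message bytes in messageBytes; Avro.SCHEMA and Avro.SKIP_BYTES are the property keys read in the trait above, and the skip value of 5 is only an example (e.g. a magic byte plus a 4-byte schema id prefix):

import java.util.Properties
import kafka.utils.VerifiableProperties

val props = new Properties()
props.setProperty(Avro.SCHEMA, schema.toString)
props.setProperty(Avro.SKIP_BYTES, "5")
val decoder = new AvroRecordDecoder(new VerifiableProperties(props))
val record: GenericRecord = decoder.fromBytes(messageBytes)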
Example 90
Source File: AvroTypeSpec.scala From shapeless-datatype with Apache License 2.0 | 5 votes |
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {

  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)

  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
}
Example 91
Source File: AvroType.scala From shapeless-datatype with Apache License 2.0 | 5 votes |
package shapeless.datatype.avro

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import shapeless._

import scala.reflect.runtime.universe._

class AvroType[A] extends Serializable {

  def fromGenericRecord[L <: HList](m: GenericRecord)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L]
  ): Option[A] =
    fromL(Right(m)).map(gen.from)

  def toGenericRecord[L <: HList](a: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    toL: ToAvroRecord[L],
    tt: TypeTag[A]
  ): GenericRecord =
    toL(gen.to(a)).left.get.build(AvroSchema[A])
}

object AvroType {
  def apply[A: TypeTag]: AvroType[A] = new AvroType[A]

  def at[V: TypeTag](schemaType: Schema.Type)(fromFn: Any => V, toFn: V => Any): BaseAvroMappableType[V] = {
    AvroSchema.register(implicitly[TypeTag[V]].tpe, schemaType)
    new BaseAvroMappableType[V] {
      override def from(value: Any): V = fromFn(value)
      override def to(value: V): Any = toFn(value)
    }
  }
}
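A minimal round-trip sketch, assuming a hypothetical case class whose fields are all types the library can map (the spec in the previous example exercises the same two methods):

case class Person(name: String, age: Int)

val avroType = AvroType[Person]
val record: GenericRecord = avroType.toGenericRecord(Person("alice", 30))
val restored: Option[Person] = avroType.fromGenericRecord(record) // Some(Person("alice", 30)) on success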
Example 92
Source File: SpecificTestUtil.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase}
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter}

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }
}
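A minimal usage sketch, assuming a hypothetical avrohugger-generated case class Person that extends SpecificRecordBase (verifyEncodeDecode is omitted because its byte-level assertions only hold for one particular test record):

val records = List(Person("alice", 30), Person("bob", 25))
// writes the records to a temporary Avro data file and asserts the last record reads back unchanged
SpecificTestUtil.verifyWriteAndRead(records)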
Example 93
Source File: SpecificDefaultValuesSpec.scala From sbt-avrohugger with Apache License 2.0 | 5 votes |
import test._

import org.specs2.mutable.Specification

import java.io.File

import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecordBase}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)

      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)

      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Vector(1, 3, 4, 5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }
}