org.apache.avro.Schema Scala Examples

The following examples show how to use org.apache.avro.Schema. They are drawn from open-source Scala projects; the source file, project, and license for each example are listed above its code.
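As a quick orientation, here is a minimal sketch (not taken from any of the projects below) of the two ways a Schema is most often obtained in the examples that follow: parsing a JSON definition with Schema.Parser, and composing one programmatically with SchemaBuilder. The record and field names are illustrative only.

import org.apache.avro.{Schema, SchemaBuilder}

object SchemaQuickStart extends App {
  // Parse a Schema from its JSON definition.
  val parsed: Schema = new Schema.Parser().parse(
    """{"type": "record", "name": "User", "fields": [{"name": "name", "type": "string"}]}"""
  )

  // Build an equivalent Schema programmatically.
  val built: Schema = SchemaBuilder.record("User").fields()
    .name("name").`type`().stringType().noDefault()
    .endRecord()

  println(parsed.getFullName)                        // User
  println(built.getField("name").schema().getType)   // STRING
}

Most examples below use one of these two patterns before handing the resulting Schema to Spark, Kafka, or PFA-related code.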
Example 1
Source File: AvroDataToCatalyst.scala    From spark-schema-registry   with Apache License 2.0
package com.hortonworks.spark.registry.avro

import java.io.ByteArrayInputStream

import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey}
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._


case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int], config: Map[String, Object])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes = Seq(BinaryType)

  @transient private lazy val srDeser: AvroSnapshotDeserializer = {
    val obj = new AvroSnapshotDeserializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version)

  @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText)

  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType

  @transient private lazy val avroDeser = new AvroDeserializer(avroSchema, dataType)

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion))
    val result = row match {
      case r: InternalRow => r.copy()
      case _ => row
    }
    result
  }

  override def simpleString: String = {
    s"from_sr(${child.sql}, ${dataType.simpleString})"
  }

  override def sql: String = {
    s"from_sr(${child.sql}, ${dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input =>
      s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)")
  }

  private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = {
    val srClient = new SchemaRegistryClient(config.asJava)
    version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v)))
      .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName))
  }

} 
Example 2
Source File: StopWordsRemover.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.feature.StopWordsRemover

@AvroNamespace("com.ibm.aardpfark.exec.spark.spark.ml.feature")
case class StopWords(words: Seq[String]) extends WithSchema {
  def schema = AvroSchema[this.type]
}

class PFAStopWordsRemover(override val sparkTransformer: StopWordsRemover) extends PFAModel[StopWords] {
  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val stopWords = sparkTransformer.getStopWords
  private val caseSensitive = sparkTransformer.getCaseSensitive

  private def filterFn = FunctionDef[String, Boolean]("word") { w =>
    Seq(core.not(a.contains(wordsRef, if (caseSensitive) w else s.lower(w))))
  }

  override def inputSchema: Schema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema: Schema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override protected def cell = {
    Cell(StopWords(stopWords))
  }

  private val wordsRef = modelCell.ref("words")

  override def action: PFAExpression = {
    NewRecord(outputSchema, Map(outputCol -> a.filter(inputExpr, filterFn)))
  }

  override def pfa: PFADocument =
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
} 
Example 3
Source File: ControlStructuresSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.DSLSuiteBase
import com.ibm.aardpfark.pfa.document.PFABuilder
import com.ibm.aardpfark.pfa.dsl._
import org.apache.avro.Schema

class ControlStructuresSuite extends DSLSuiteBase {

  test("DSL: If-then statements") {
    val action = If {core.gt(inputExpr, 0.0)} Then "Positive" Else "Negative"

    val pfaDoc = new PFABuilder()
      .withInput[Double]
      .withOutput[String]
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())

    assert("Positive" == engine.action(engine.jsonInput("1.0")))
    assert("Negative" == engine.action(engine.jsonInput("-1.0")))
  }

} 
Example 4
Source File: AttrSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.DSLSuiteBase
import com.ibm.aardpfark.pfa.document.PFABuilder
import org.apache.avro.Schema

class AttrSuite extends DSLSuiteBase {

  test("DSL: Attr") {

    val action = Attr(Attr(inputExpr, "element"), 1)
    val pfaDoc = new PFABuilder()
      .withInput(Schema.createMap(Schema.createArray(Schema.create(Schema.Type.DOUBLE))))
      .withOutput[Double]
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("""{"element": [0.0, 3.0]}"""))

    assert(result == 3.0)
  }
} 
Example 5
Source File: LoopsSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.DSLSuiteBase
import com.ibm.aardpfark.pfa.document.PFABuilder
import com.ibm.aardpfark.pfa.dsl._
import org.apache.avro.Schema

class LoopsSuite extends DSLSuiteBase {

  test("DSL: For loop") {
    val sum = Let("sum", 0.0)

    val foreach = ForEach(StringExpr("element"), inputExpr) {
      e => Seq(Set(sum.ref, core.plus(sum.ref, e)))
    }
    val action = Action(sum, foreach, sum.ref)

    val pfaDoc = new PFABuilder()
      .withInput(Schema.createArray(Schema.create(Schema.Type.DOUBLE)))
      .withOutput[Double]
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[3.0, 4.0, 5.0]"))

    assert(result == 12.0)
  }

} 
Example 6
Source File: PFABuilder.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.document

import scala.collection.mutable.ArrayBuffer

import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.sksamuel.avro4s.SchemaFor
import org.apache.avro.Schema


class PFABuilder {
  import com.ibm.aardpfark.pfa.dsl._

  private var name: Option[String] = None
  private var meta: Map[String, String] = Map()
  private var input: Schema = null
  private var output: Schema = null
  private val cells = collection.mutable.HashMap[String, Cell[_]]()
  private val action = ArrayBuffer[PFAExpression]()
  private val functions = collection.mutable.HashMap[String, FunctionDef]()

  def withInput(schema: Schema): this.type = {
    input = schema
    this
  }

  def withInput[T](implicit ev: SchemaFor[T]): this.type = withInput(ev())

  def withOutput(schema: Schema): this.type = {
    output = schema
    this
  }

  def withName(name: String): this.type = {
    this.name = Some(name)
    this
  }

  def withMetadata(meta: Map[String, String]): this.type = {
    this.meta = meta
    this
  }

  def withOutput[T](implicit ev: SchemaFor[T]): this.type = withOutput(ev())

  def withCell[T <: WithSchema](name: String, cell: Cell[T]): this.type = {
    cells += name -> cell
    this
  }

  def withCell[T <: WithSchema](namedCell: NamedCell[T]): this.type = {
    cells += namedCell.name -> namedCell.cell
    this
  }

  def withFunction(name: String, fn: FunctionDef): this.type = {
    functions += name -> fn
    this
  }

  def withFunction(namedFn: NamedFunctionDef): this.type = {
    functions += namedFn.name -> namedFn.fn
    this
  }

  def withAction(expr: PFAExpression): this.type = {
    expr match {
      case ExprSeq(s) =>
        action ++= s
      case _ =>
        action += expr
    }
    this
  }

  def pfa: PFADocument = {
    PFADocument(name = name,
      metadata = meta,
      input = input,
      output = output,
      action = action,
      cells = cells.toMap,
      fcns = functions.toMap
    )
  }
}

object PFABuilder {
  def apply(): PFABuilder = new PFABuilder()
} 
Example 7
Source File: JSONSerializers.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.document

import scala.util.Try

import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.spark.ml.tree.{TreeNode, Trees}
import org.apache.avro.Schema
import org.json4s.native.JsonMethods.parse
import org.json4s.{CustomSerializer, JValue}


object SchemaSerializer {

  def convert(s: Schema): JValue = {
    import Schema.Type._
    import org.json4s.JsonDSL._
    s.getType match {
      case DOUBLE | FLOAT | INT | LONG | STRING | BOOLEAN | BYTES | NULL  =>
        ("type" -> s.getType.getName)
      case _ =>
        parse(s.toString)
    }
  }
}

class SchemaSerializer extends CustomSerializer[Schema](format => (
  {
    case j: JValue =>
      new Schema.Parser().parse(j.toString)
  },
  {
    case s: Schema =>
      SchemaSerializer.convert(s)
  }
)
)

class PFAExpressionSerializer extends CustomSerializer[PFAExpression](format => (
  {
    case j: JValue =>
      throw new UnsupportedOperationException("cannot deserialize")
  },
  {
    case expr: PFAExpression =>
      expr.json
  }
)
)

class TreeSerializer extends CustomSerializer[TreeNode](format => (
  {
    case j: JValue =>
      throw new UnsupportedOperationException("cannot deserialize")
  },
  {
    case tree: TreeNode =>
      Trees.json(tree)
  }
)
)

class ParamSerializer extends CustomSerializer[Param](format => (
  {
    case j: JValue =>
      throw new UnsupportedOperationException("cannot deserialize")
  },
  {
    case p: Param =>
      import org.json4s.JsonDSL._
      if (p.simpleSchema) {
        (p.name -> p.`type`.getFullName)
      } else {
        val schemaSerializer = new SchemaSerializer().serialize(format)
        (p.name -> schemaSerializer(p.`type`))
      }

  }
)
) 
Example 8
Source File: PFADocument.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.document

import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.utils.Utils
import org.apache.avro.Schema
import org.json4s.native.Serialization
import org.json4s.native.Serialization.{write, writePretty}
import org.json4s.{FieldSerializer, NoTypeHints}

trait ToPFA {
  def pfa: PFADocument
}

trait HasAction {
  protected def action: PFAExpression
}

trait HasModelCell {
  protected def modelCell: NamedCell[_]
}

case class PFADocument(
  name: Option[String] = None,
  version: Option[Long] = Some(1L),
  doc: Option[String] = Some(s"Auto-generated by Aardpfark at ${Utils.getCurrentDate}"),
  metadata: Map[String, String] = Map(),
  // options,
  input: Schema,
  output: Schema,
  // begin: Seq[String] = Seq(),
  // end: Seq[String] = Seq(),
  // method: String = "map",
  action: Seq[PFAExpression],
  cells: Map[String, Cell[_]] = Map(),
  // pools
  fcns: Map[String, FunctionDef] = Map()
  // randseed
  // zero
  // merge
  ) {

  implicit val formats = Serialization.formats(NoTypeHints) +
    new SchemaSerializer +
    new PFAExpressionSerializer +
    new ParamSerializer +
    new FieldSerializer[Cell[_]] +
    new TreeSerializer

  def toJSON(pretty: Boolean = false) = {
    if (pretty) writePretty(this) else write(this)
  }
} 
Example 9
Source File: New.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.document.SchemaSerializer
import com.sksamuel.avro4s.{AvroSchema, SchemaFor, ToSchema}
import org.apache.avro.Schema
import org.json4s.JValue
import org.json4s.JsonAST.JString
import org.json4s.native.JsonMethods.parse

trait New {

  object NewRecord {
    def apply(schema: Schema, init: Map[String, PFAExpression], fullSchema: Boolean = true) =
      NewRecordExpr(schema, init, fullSchema)
  }

  case class NewRecordExpr(schema: Schema, init: Map[String, PFAExpression], fullSchema: Boolean)
    extends PFAExpression {
    import org.json4s.JsonDSL._

    private val s = if (fullSchema) SchemaSerializer.convert(schema) else JString(schema.getFullName)
    override def json: JValue = {
      ("type" -> s) ~ ("new" -> init.mapValues(_.json))
    }
  }

  case class NewArrayExpr(schema: Schema, init: Seq[PFAExpression]) extends PFAExpression {
    import org.json4s.JsonDSL._

    override def json: JValue = {
      ("type" -> parse(schema.toString)) ~ ("new" -> init.map(_.json))
    }
  }

  object NewArray {
    def apply(schema: Schema, init: Seq[PFAExpression]) = NewArrayExpr(schema, init)
    def apply[T](init: Seq[PFAExpression])(implicit s: ToSchema[Seq[T]]) = {
      NewArrayExpr(s(), init)
    }
  }

  case class NewMap(schema: Schema, init: Map[String, PFAExpression]) extends PFAExpression {
    import org.json4s.JsonDSL._

    override def json: JValue = {
      ("type" -> parse(schema.toString)) ~ ("new" -> init.mapValues(_.json))
    }
  }

} 
Example 10
Source File: Casts.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.document.SchemaSerializer
import com.ibm.aardpfark.pfa.dsl.StringExpr
import com.sksamuel.avro4s.{SchemaFor, ToSchema}
import org.apache.avro.Schema
import org.json4s.JValue

trait Casts {

  case class As(schema: Schema, named: String, `do`: PFAExpression)

  object As {
    def apply(schema: Schema, named: String, `do`: (StringExpr) => PFAExpression): As = {
      As(schema, named, `do`(StringExpr(named)))
    }
    def apply[T](named: String, `do`: (StringExpr) => PFAExpression)(implicit s: ToSchema[T]): As = {
      As(s(), named, `do`(StringExpr(named)))
    }
  }
  object Cast {
    def apply(cast: PFAExpression, cases: Seq[As]) = new CastExpr(cast, cases)
    def apply(cast: PFAExpression, case1: As, cases: As*) = new CastExpr(cast, Seq(case1) ++ cases)
  }

  class CastExpr(cast: PFAExpression, cases: Seq[As]) extends PFAExpression {
    import org.json4s.JsonDSL._

    implicit val converter: Schema => JValue = SchemaSerializer.convert
    override def json: JValue = {
      ("cast" -> cast.json) ~
        ("cases" -> cases.map { as =>
          ("as" -> as.schema) ~ ("named" -> as.named) ~ ("do" -> as.`do`.json)
        })
    }
  }

} 
Example 11
Source File: MLPClassifier.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification

import scala.collection.mutable.ArrayBuffer

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression._
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAPredictionModel
import breeze.linalg.{DenseMatrix, DenseVector}
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.Schema

import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.classification")
case class Layer(weights: Array[Array[Double]], bias: Array[Double])
@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.classification")
case class Layers(layers: Seq[Layer]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}

class PFAMultilayerPerceptronClassificationModel(
  override val sparkTransformer: MultilayerPerceptronClassificationModel)
  extends PFAPredictionModel[Layers] {

  private def getLayers = {
    val weights = sparkTransformer.weights.toArray
    val inputLayers = sparkTransformer.layers
    val layers = ArrayBuffer[Layer]()
    var offset = 0
    for (i <- 0 to inputLayers.size - 2) {
      val in = inputLayers(i)
      val out = inputLayers(i + 1)
      val wOffset = out * in
      val wData = weights.slice(offset, offset + wOffset)
      val bData = weights.slice(offset + wOffset, offset + wOffset + out)
      val w = Array.ofDim[Double](out, in)
      new DenseMatrix[Double](out, in, wData).foreachPair { case ((ii, jj), v) => w(ii)(jj) = v }
      val b = new DenseVector[Double](bData).toArray
      layers += Layer(w, b)
      offset += wOffset + out
    }
    layers.toArray
  }

  override protected def cell = Cell(Layers(getLayers))

  private val doubleSigmoid = NamedFunctionDef("doubleSigmoid", FunctionDef[Double, Double](
    "x", m.link.logit("x")
  ))

  override def action: PFAExpression = {
    val forward = model.neural.simpleLayers(inputExpr, modelCell.ref("layers"), doubleSigmoid.ref)
    val softmax = m.link.softmax(forward)
    NewRecord(outputSchema, Map(predictionCol -> a.argmax(softmax)))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withFunction(doubleSigmoid)
      .withAction(action)
      .pfa
  }

} 
Example 12
Source File: Merge.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml

import com.ibm.aardpfark.pfa.document.Cell
import com.ibm.aardpfark.pfa.expression._
import org.apache.avro.{Schema, SchemaBuilder}

import org.apache.spark.ml.PipelineModel


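    // NOTE: the enclosing object and method declaration (which supply the pipeline
    // documents "docs" and the input schema "is" used below) are omitted from this excerpt.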
    val first = docs.head
    val last = docs.last
    var name = "merged"
    var version = 0L
    val inputSchema = is
    val outputSchema = last.output
    var meta: Map[String, String] = Map()
    var cells: Map[String, Cell[_]] = Map()
    var action: PFAExpression = StringExpr("input")
    var fcns: Map[String, FunctionDef] = Map()
    var currentSchema = inputSchema

    docs.zipWithIndex.foreach { case (doc, idx) =>

      val inputParam = Param("input", currentSchema)

      val inputFields = currentSchema.getFields.toSeq
      val newFields = doc.output.getFields.toSeq
      val outputFields = inputFields ++ newFields

      val bldr = SchemaBuilder.record(s"Stage_${idx + 1}_output_schema").fields()
      outputFields.foreach { field =>
        bldr
          .name(field.name())
          .`type`(field.schema())
          .noDefault()
      }

      currentSchema = bldr.endRecord()

      val let = Let(s"Stage_${idx + 1}_action_output", Do(doc.action))
      val inputExprs = inputFields.map { field =>
        field.name -> StringExpr(s"input.${field.name}")
      }
      val newExprs = newFields.map { field =>
        field.name -> StringExpr(s"${let.x}.${field.name}")

      }
      val exprs = inputExprs ++ newExprs
      val stageOutput = NewRecord(currentSchema, exprs.toMap)

      val le = new LetExpr(Seq((let.x, let.`type`, let.expr)))

      val stageActionFn = NamedFunctionDef(s"Stage_${idx + 1}_action", FunctionDef(
        Seq(inputParam), currentSchema, Seq(le, stageOutput)
      ))

      fcns = fcns ++ doc.fcns + (stageActionFn.name -> stageActionFn.fn)
      cells = cells ++ doc.cells
      meta = meta ++ doc.metadata
      action = stageActionFn.call(action)
    }

    first.copy(
      name = Some(name),
      version = Some(version),
      metadata = meta,
      cells = cells,
      fcns = fcns,
      action = action,
      input = inputSchema,
      output = currentSchema
    )

  }
} 
Example 13
Source File: KMeans.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.clustering

import com.ibm.aardpfark.pfa.dsl.StringExpr
import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}

import org.apache.spark.ml.clustering.KMeansModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class Cluster(id: Int, center: Seq[Double])

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class KMeansModelData(clusters: Seq[Cluster]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}

class PFAKMeansModel(override val sparkTransformer: KMeansModel) extends PFAModel[KMeansModelData] {

  private val inputCol = sparkTransformer.getFeaturesCol
  private val outputCol = sparkTransformer.getPredictionCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  override def outputSchema = SchemaBuilder.record(withUid(outputBaseName)).fields()
    .name(outputCol).`type`().intType().noDefault()
    .endRecord()

  override def cell = {
    val clusters = sparkTransformer.clusterCenters.zipWithIndex.map { case (v, i) =>
      Cluster(i, v.toArray)
    }
    Cell(KMeansModelData(clusters))
  }

  override def action: PFAExpression = {
    val closest = model.cluster.closest(inputExpr, modelCell.ref("clusters"))
    NewRecord(outputSchema, Map(outputCol -> Attr(closest, "id")))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
  }

} 
Example 14
Source File: FunctionSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa.expression

import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.DSLSuiteBase
import com.ibm.aardpfark.pfa.document.PFABuilder
import org.apache.avro.Schema

class FunctionSuite extends DSLSuiteBase {

  test("DSL: NamedFunctionDef") {

    val squared = FunctionDef[Int, Int]("x") { x =>
      Seq(core.mult(x, x))
    }
    val namedSquared = NamedFunctionDef("squared", squared)

    val cubed = FunctionDef[Int, Int]("x") {x =>
      Seq(core.mult(x, namedSquared(x)))
    }
    val namedCubed = NamedFunctionDef("cubed", cubed)

    val action = Action(namedSquared(namedCubed(inputExpr)))

    val pfaDoc = new PFABuilder()
      .withInput[Int]
      .withOutput[Int]
      .withAction(action)
      .withFunction(namedSquared)
      .withFunction(namedCubed)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())

    assert(1 == engine.action(engine.jsonInput("1")))
    assert(64 == engine.action(engine.jsonInput("2")))
  }

  test("DSL: FunctionDef anonymous"){
    val squared = FunctionDef[Int, Int]("x") { x =>
      Seq(core.mult(x, x))
    }

    val arraySchema = Schema.createArray(Schema.create(Schema.Type.INT))

    val action = Action(
      a.map(inputExpr, squared)
    )

    val pfaDoc = new PFABuilder()
      .withInput(arraySchema)
      .withOutput(arraySchema)
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())

    assert("[1,4,9]" == engine.jsonOutput(engine.action(engine.jsonInput("[1,2,3]"))))
    assert("[1,4,9]" == engine.jsonOutput(engine.action(engine.jsonInput("[-1,-2,3]"))))
    assert("[9,64,256]" == engine.jsonOutput(engine.action(engine.jsonInput("[3,8,16]"))))
  }

  test("DSL: FunctionDef multiple args with same input type") {
    val fn = NamedFunctionDef("plusAll", FunctionDef[Double, Double]("x", "y", "z") {
      case Seq(x, y, z) =>
        core.plus(core.plus(x, y), z)
    })

    val action = Action(fn.call(inputExpr, core.mult(inputExpr, 2.0), 6.0))

    val pfaDoc = new PFABuilder()
      .withInput[Double]
      .withOutput[Double]
      .withAction(action)
      .withFunction(fn)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())

    assert(12.0 == engine.action(engine.jsonInput("2.0")))
  }

} 
Example 15
Source File: IDF.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression._
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}

import org.apache.spark.ml.feature.IDFModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.feature")
case class IDFData(idf: Seq[Double]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}




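  // NOTE: the enclosing model class declaration and the members referenced below
  // (outputSchema, outputCol, inputExpr, idfRef, multFn, modelCell, sparkTransformer)
  // are omitted from this excerpt.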
  override def action: PFAExpression = {
    NewRecord(outputSchema, Map(outputCol -> a.zipmap(inputExpr, idfRef, multFn.ref)))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withFunction(multFn)
      .withAction(action)
      .pfa
  }

} 
Example 16
Source File: VectorAssembler.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.spark.ml.PFATransformer
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.feature.VectorAssembler
import org.json4s.DefaultFormats

class PFAVectorAssembler(override val sparkTransformer: VectorAssembler) extends PFATransformer {

  import com.ibm.aardpfark.pfa.dsl._
  implicit val formats = DefaultFormats

  private val inputCols = sparkTransformer.getInputCols
  private val outputCol = sparkTransformer.getOutputCol

  type DorSeqD = Either[Double, Seq[Double]]

  override protected def inputSchema: Schema = {
    val builder = SchemaBuilder.record(withUid(inputBaseName)).fields()
    for (inputCol <- inputCols) {
      builder.name(inputCol).`type`()
        .unionOf()
        .doubleType().and()
        .array().items().doubleType()
        .endUnion().noDefault()
    }
    builder.endRecord()
  }

  override protected def outputSchema: Schema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  private val asDouble = As[Double]("x", x => NewArray[Double](x))
  private val asArray = As[Array[Double]]("x", x => x)

  private val castFn = NamedFunctionDef("castToArray",
    FunctionDef[DorSeqD, Seq[Double]]("x") { x =>
      Cast(x, asDouble, asArray)
    }
  )

  override protected def action: PFAExpression = {
    val cols = Let("cols", NewArray[DorSeqD](inputCols.map(c => StringExpr(s"input.$c"))))
    Action(
      cols,
      NewRecord(outputSchema, Map(outputCol -> a.flatten(a.map(cols.ref, castFn.ref))))
    )
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withAction(action)
      .withFunction(castFn)
      .pfa
  }
} 
Example 17
Source File: PCAModel.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}

import org.apache.spark.ml.feature.PCAModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.feature")
case class PCAData(pc: Array[Array[Double]]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}

class PFAPCAModel(override val sparkTransformer: PCAModel) extends PFAModel[PCAData] {
  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  override def outputSchema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  override protected def cell = {
    val pc = sparkTransformer.pc.transpose.rowIter.map(v => v.toArray).toArray
    Cell(PCAData(pc))
  }

  override def action: PFAExpression = {
    val dot = la.dot(modelCell.ref("pc"), inputExpr)
    NewRecord(outputSchema, Map(outputCol -> dot))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
  }

} 
Example 18
Source File: Main.scala    From geotrellis-osm-elevation   with Apache License 2.0
package geotrellis.osme.server

import geotrellis.raster._
import geotrellis.raster.io.geotiff._
import geotrellis.raster.render._
import geotrellis.raster.resample._

import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.file._
import geotrellis.spark.io.avro._
import geotrellis.spark.io.avro.codecs._

import geotrellis.spark.io.index._

import org.apache.spark._
import org.apache.avro.Schema

import com.github.nscala_time.time.Imports._
import akka.actor._
import akka.io.IO
import spray.can.Http
import spray.routing.{HttpService, RequestContext}
import spray.routing.directives.CachingDirectives
import spray.http.MediaTypes
import spray.json._
import spray.json.DefaultJsonProtocol._

import com.typesafe.config.ConfigFactory

import scala.concurrent._
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

object Main {
  def main(args: Array[String]): Unit = {
    val conf =
      new SparkConf()
        .setIfMissing("spark.master", "local[*]")
        .setAppName("Osme Server")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.kryo.registrator", "geotrellis.spark.io.hadoop.KryoRegistrator")

    implicit val sc = new SparkContext(conf)

    implicit val system = akka.actor.ActorSystem("demo-system")

    // create and start our service actor
    val service =
      system.actorOf(Props(classOf[OsmeServiceActor], sc), "osme")

    // start a new HTTP server on port 8088 with our service actor as the handler
    IO(Http) ! Http.Bind(service, "0.0.0.0", 8088)
  }
} 
Example 19
Source File: BasicTest.scala    From kafka-testing   with Apache License 2.0
package com.landoop.kafka.testing

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.kafka.clients.producer.ProducerRecord

class BasicTest extends ClusterTestingCapabilities {

  private val createAvroRecord = {
    val userSchema = "{\"namespace\": \"example.avro\", \"type\": \"record\", " + "\"name\": \"User\"," + "\"fields\": [{\"name\": \"name\", \"type\": \"string\"}]}"
    val parser = new Schema.Parser
    val schema = parser.parse(userSchema)
    val avroRecord = new GenericData.Record(schema)
    avroRecord.put("name", "testUser")
    avroRecord
  }

  "KCluster" should {
    "start up and be able to handle avro records being sent " in {
      val topic = "testAvro" + System.currentTimeMillis()
      val avroRecord = createAvroRecord
      val objects = Array[AnyRef](avroRecord)
      val producerProps = stringAvroProducerProps
      val producer = createProducer[String, Any](producerProps)

      for (o <- objects) {
        val message = new ProducerRecord[String, Any](topic, o)
        producer.send(message)
      }
      val consumerProps = stringAvroConsumerProps()
      val consumer = createStringAvroConsumer(consumerProps)
      val records = consumeStringAvro(consumer, topic, objects.length)
      objects.toSeq shouldBe records
    }

    "write and read avro records" in {
      val topic = "testAvro" + System.currentTimeMillis()
      val avroRecord = createAvroRecord
      val objects = Array[Any](avroRecord, true, 130, 345L, 1.23f, 2.34d, "abc", "def".getBytes)
      val producerProps = stringAvroProducerProps
      val producer = createProducer[String, Any](producerProps)
      for (o <- objects) {
        producer.send(new ProducerRecord[String, Any](topic, o))
      }
      val consumerProps = stringAvroConsumerProps("group" + System.currentTimeMillis())
      val consumer = createStringAvroConsumer(consumerProps)
      val records = consumeStringAvro(consumer, topic, objects.length)
      objects.deep shouldBe records.toArray.deep
    }
  }

} 
Example 20
Source File: AvroSerializer.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.bloomberg.avro

import java.io.ByteArrayOutputStream

import com.datamountaineer.streamreactor.connect.bloomberg.BloombergData
import com.datamountaineer.streamreactor.connect.bloomberg.avro.AvroSchemaGenerator._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory

import scala.collection.JavaConverters._

object AvroSerializer {

  
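    // NOTE: other members of this object, including the definition behind the
    // toAvroRecord call used below, are omitted from this excerpt.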
    private def recursive(record: GenericData.Record, schema: Schema, fieldName: String, value: Any): Unit = {
      value match {
        case _: Boolean => record.put(fieldName, value)
        case _: Int => record.put(fieldName, value)
        case _: Long => record.put(fieldName, value)
        case _: Double => record.put(fieldName, value)
        case _: Char => record.put(fieldName, value)
        case _: Float => record.put(fieldName, value)
        case _: String =>
          record.put(fieldName, value)
        case list: java.util.List[_] =>
          val tmpSchema = schema.getField(fieldName).schema()
          val itemSchema = if (tmpSchema.getType == Schema.Type.UNION) tmpSchema.getTypes.get(1) else tmpSchema
          require(itemSchema.getType == Schema.Type.ARRAY)
          //we might have a record not a primitive
          if (itemSchema.getElementType.getType == Schema.Type.RECORD) {
            val items = new GenericData.Array[GenericData.Record](list.size(), itemSchema)
            list.asScala.foreach { i =>
              //only map is allowed
              val m = i.asInstanceOf[java.util.Map[String, Any]]
              items.add(m.toAvroRecord(itemSchema.getElementType))
            }
            record.put(fieldName, items)
          } else {
            val items = new GenericData.Array[Any](list.size(), itemSchema)
            items.addAll(list)
            record.put(fieldName, items)
          }

        case map: java.util.LinkedHashMap[String @unchecked, _] =>
          //record schema
          val fieldSchema = schema.getField(fieldName).schema()
          val nestedSchema = if (fieldSchema.getType == Schema.Type.UNION) fieldSchema.getTypes.get(1) else fieldSchema
          val nestedRecord = new Record(nestedSchema)
          map.entrySet().asScala.foreach(e =>
            recursive(nestedRecord, nestedSchema, e.getKey, e.getValue))
          record.put(fieldName, nestedRecord)
      }
    }
  }
} 
Example 21
Source File: AvroRecordRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar {
  val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema)

  "AvroRecordRowKeyBuilder" should {
    "extract the values from the avro record and create the key" in {
      val keys = Seq("firstName", "lastName", "age")
      val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys)

      val sinkRecord = mock[SinkRecord]
      val firstName = "Jack"
      val lastName = "Smith"
      val age = 29

      val record = new GenericRecord {

        val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age))

        override def get(key: String): AnyRef = values(key)

        override def put(key: String, v: scala.Any): Unit = sys.error("not supported")

        override def get(i: Int): AnyRef = sys.error("not supported")


        override def put(i: Int, v: scala.Any): Unit = sys.error("not supported")


        override def getSchema: Schema = sys.error("not supported")
      }

      val expectedValue = Bytes.add(
        Array(
          firstName.fromString(),
          rowKeyBuilder.delimBytes,
          lastName.fromString(),
          rowKeyBuilder.delimBytes,
          age.fromInt()))
      rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue
    }
  }
} 
Example 22
Source File: AvroSchemaFieldsExistFnTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase.avro

import com.datamountaineer.streamreactor.connect.hbase.PersonAvroSchema
import org.apache.avro.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


class AvroSchemaFieldsExistFnTest extends AnyWordSpec with Matchers {
  val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema)

  "AvroSchemaFieldsExistFn" should {
    "raise an exception if the field is not present" in {
      intercept[IllegalArgumentException] {
        AvroSchemaFieldsExistFn(schema, Seq("notpresent"))
      }

      intercept[IllegalArgumentException] {
        AvroSchemaFieldsExistFn(schema, Seq(" lastName"))
      }
    }

    "not raise an exception if the fields are present" in {
      AvroSchemaFieldsExistFn(schema, Seq("lastName", "age", "address"))
    }
  }
} 
Example 23
Source File: AvroRecordFieldExtractorMapFnTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase.avro

import java.nio.file.Paths

import org.apache.avro.Schema
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordFieldExtractorMapFnTest extends AnyWordSpec with Matchers {

  val schema: Schema = new Schema.Parser().parse(Paths.get(getClass.getResource("/person.avsc").toURI).toFile)

  "AvroRecordFieldExtractorMapFn" should {
    "raise an exception if the given field does not exist in the schema" in {
      intercept[IllegalArgumentException] {
        AvroRecordFieldExtractorMapFn(schema, Seq("wrongField"))
      }
    }

    "raise an exception if the given field is not a primitive" in {
      intercept[IllegalArgumentException] {
        AvroRecordFieldExtractorMapFn(schema, Seq("address"))
      }
    }

    "create the mappings for all the given fields" in {
      val mappings = AvroRecordFieldExtractorMapFn(schema, Seq("firstName", "age"))

      val fnFirstName = mappings("firstName")
      val firstName = "Beaky"
      fnFirstName(firstName) shouldBe Bytes.toBytes(firstName)

      val fnAge = mappings("age")
      val age = 31
      fnAge(age) shouldBe Bytes.toBytes(age)
      intercept[ClassCastException] {
        fnAge(12.4)
      }
    }
  }
} 
Example 24
Source File: AvroSchemaFieldsExistFn.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase.avro

import org.apache.avro.{AvroRuntimeException, Schema}


object AvroSchemaFieldsExistFn {
  def apply(schema: Schema, fields: Seq[String]): Unit = {
    fields.foreach { field =>
      try {
        if (Option(schema.getField(field)).isEmpty) {
          throw new IllegalArgumentException(s"[$field] is not found in the schema fields")
        }
      }
      catch {
        case avroException: AvroRuntimeException => throw new IllegalArgumentException(s"$field is not found in the schema", avroException)
      }
    }
  }
} 
Example 25
Source File: ToTableRow.scala    From scio   with Apache License 2.0
package com.spotify.scio.extra.bigquery

import com.spotify.scio.extra.bigquery.AvroConverters.AvroConversionException

import java.math.{BigDecimal => JBigDecimal}
import java.nio.ByteBuffer
import java.util

import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, IndexedRecord}
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.BaseEncoding
import org.joda.time.format.DateTimeFormat
import org.joda.time.{DateTime, LocalDate, LocalTime}

import scala.jdk.CollectionConverters._


private[bigquery] trait ToTableRow {
  private lazy val encodingPropName: String = "bigquery.bytes.encoder"
  private lazy val base64Encoding: BaseEncoding = BaseEncoding.base64()
  private lazy val hexEncoding: BaseEncoding = BaseEncoding.base16()

  // YYYY-[M]M-[D]D
  private[this] val localDateFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd").withZoneUTC()

  // YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.DDDDDD]]
  private[this] val localTimeFormatter =
    DateTimeFormat.forPattern("HH:mm:ss.SSSSSS")

  // YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.DDDDDD]][time zone]
  private[this] val timestampFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS")

  private[bigquery] def toTableRowField(fieldValue: Any, field: Schema.Field): Any =
    fieldValue match {
      case x: CharSequence          => x.toString
      case x: Enum[_]               => x.name()
      case x: JBigDecimal           => x.toString
      case x: Number                => x
      case x: Boolean               => x
      case x: GenericFixed          => encodeByteArray(x.bytes(), field.schema())
      case x: ByteBuffer            => encodeByteArray(toByteArray(x), field.schema())
      case x: util.Map[_, _]        => toTableRowFromMap(x.asScala, field)
      case x: java.lang.Iterable[_] => toTableRowFromIterable(x.asScala, field)
      case x: IndexedRecord         => AvroConverters.toTableRow(x)
      case x: LocalDate             => localDateFormatter.print(x)
      case x: LocalTime             => localTimeFormatter.print(x)
      case x: DateTime              => timestampFormatter.print(x)
      case _ =>
        throw AvroConversionException(
          s"ToTableRow conversion failed:" +
            s"could not match ${fieldValue.getClass}"
        )
    }

  private def toTableRowFromIterable(iterable: Iterable[Any], field: Schema.Field): util.List[_] =
    iterable
      .map { item =>
        if (item.isInstanceOf[Iterable[_]] || item.isInstanceOf[Map[_, _]]) {
          throw AvroConversionException(
            s"ToTableRow conversion failed for item $item: " +
              s"iterable and map types not supported"
          )
        }
        toTableRowField(item, field)
      }
      .toList
      .asJava

  private def toTableRowFromMap(map: Iterable[Any], field: Schema.Field): util.List[_] =
    map
      .map {
        case (k, v) =>
          new TableRow()
            .set("key", toTableRowField(k, field))
            .set("value", toTableRowField(v, field))
      }
      .toList
      .asJava

  private def encodeByteArray(bytes: Array[Byte], fieldSchema: Schema): String =
    Option(fieldSchema.getProp(encodingPropName)) match {
      case Some("BASE64") => base64Encoding.encode(bytes)
      case Some("HEX")    => hexEncoding.encode(bytes)
      case Some(encoding) =>
        throw AvroConversionException(s"Unsupported encoding $encoding")
      case None => base64Encoding.encode(bytes)
    }

  private def toByteArray(buffer: ByteBuffer) = {
    val copy = buffer.asReadOnlyBuffer
    val bytes = new Array[Byte](copy.limit)
    copy.rewind
    copy.get(bytes)
    bytes
  }
} 
Example 26
Source File: JsonConverterWithSchemaEvolutionTest.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.converters.source

import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import io.confluent.connect.avro.AvroData
import org.apache.avro.Schema
import org.apache.kafka.connect.data.Struct
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class JsonConverterWithSchemaEvolutionTest extends AnyWordSpec with Matchers {
  val topic = "the_real_topic"
  val sourceTopic = "source_topic"
  val avroData = new AvroData(4)

  "JsonConverter" should {
    "throw IllegalArgumentException if payload is null" in {
      intercept[IllegalArgumentException] {
        val converter = new JsonConverterWithSchemaEvolution
        val record = converter.convert("topic", "somesource", "1000", null)
      }
    }

    "handle a simple json" in {
      val json = JacksonJson.toJson(Car("LaFerrari", "Ferrari", 2015, 963, 0.0001))
      val converter = new JsonConverterWithSchemaEvolution
      val record = converter.convert(topic, sourceTopic, "100", json.getBytes)
      record.keySchema() shouldBe MsgKey.schema
      record.key().asInstanceOf[Struct].getString("topic") shouldBe sourceTopic
      record.key().asInstanceOf[Struct].getString("id") shouldBe "100"

      val schema =
        new Schema.Parser().parse(
          SchemaFor[CarOptional]().toString
            .replace("\"name\":\"CarOptional\"", s"""\"name\":\"$sourceTopic\"""")
            .replace(s""",\"namespace\":\"${getClass.getCanonicalName.dropRight(getClass.getSimpleName.length+1)}\"""", "")
        )
      val format = RecordFormat[CarOptional]
      val carOptional = format.to(CarOptional(Option("LaFerrari"), Option("Ferrari"), Option(2015), Option(963), Option(0.0001)))

      record.valueSchema() shouldBe avroData.toConnectSchema(schema)

      record.value() shouldBe avroData.toConnectData(schema, carOptional).value()
      record.sourcePartition() shouldBe null
      record.sourceOffset() shouldBe Collections.singletonMap(JsonConverterWithSchemaEvolution.ConfigKey, avroData.fromConnectSchema(avroData.toConnectSchema(schema)).toString())
    }
  }
}


case class Car(name: String,
               manufacturer: String,
               model: Long,
               bhp: Long,
               price: Double)


case class CarOptional(name: Option[String],
                       manufacturer: Option[String],
                       model: Option[Long],
                       bhp: Option[Long],
                       price: Option[Double]) 
Example 27
Source File: JsonSimpleConverterTest.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.converters.source

import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import io.confluent.connect.avro.AvroData
import org.apache.avro.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class JsonSimpleConverterTest extends AnyWordSpec with Matchers {
  val topic = "the_real_topic"
  val sourceTopic = "source_topic"
  val avroData = new AvroData(4)

  "JsonSimpleConverter" should {
    "convert from json to the struct" in {
      val car = Car("LaFerrari", "Ferrari", 2015, 963, 0.0001)
      val json = JacksonJson.toJson(car)
      val converter = new JsonSimpleConverter
      val record = converter.convert(topic, sourceTopic, "100", json.getBytes)
      record.keySchema() shouldBe MsgKey.schema
      record.key() shouldBe MsgKey.getStruct(sourceTopic, "100")

      val schema = new Schema.Parser().parse(
        SchemaFor[Car]().toString
          .replace("\"name\":\"Car\"", s"""\"name\":\"$sourceTopic\"""")
          .replace(s"""\"namespace\":\"${getClass.getCanonicalName.dropRight(getClass.getSimpleName.length+1)}\",""", "")
      )
      val format = RecordFormat[Car]
      val avro = format.to(car)

      record.valueSchema() shouldBe avroData.toConnectSchema(schema)

      record.value() shouldBe avroData.toConnectData(schema, avro).value()
      record.sourcePartition() shouldBe Collections.singletonMap(Converter.TopicKey, sourceTopic)
      record.sourceOffset() shouldBe null
    }
  }
} 
Example 28
Source File: AvroSerializer.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.serialization

import java.io.{ByteArrayOutputStream, InputStream, OutputStream}

import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroSerializer {
  def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit = write(apply(t), schemaFor())

  def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = {
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val encoder = EncoderFactory.get().binaryEncoder(os, null)

    writer.write(record, encoder)
    encoder.flush()
    os.flush()
  }

  def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] = getBytes(recordFormat.to(t), schemaFor())

  def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = {
    implicit val output = new ByteArrayOutputStream()
    write(record, schema)
    output.toByteArray
  }

  def read(is: InputStream, schema: Schema): GenericRecord = {
    val reader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().binaryDecoder(is, null)
    reader.read(null, decoder)
  }

  def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T = recordFormat.from(read(is, schemaFor()))

  def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t)
} 
Example 29
Source File: AvroSerde.scala    From event-sourcing-kafka-streams   with MIT License
package org.amitayh.invoices.common.serde

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import java.time.Instant
import java.util
import java.util.UUID

import com.sksamuel.avro4s._
import org.amitayh.invoices.common.domain._
import org.amitayh.invoices.common.serde.UuidConverters.{fromByteBuffer, toByteBuffer}
import org.apache.avro.Schema
import org.apache.avro.Schema.Field
import org.apache.kafka.common.serialization.{Deserializer, Serde, Serializer}

object AvroSerde {
  implicit val instantToSchema: ToSchema[Instant] = new ToSchema[Instant] {
    override val schema: Schema = Schema.create(Schema.Type.STRING)
  }

  implicit val instantToValue: ToValue[Instant] = new ToValue[Instant] {
    override def apply(value: Instant): String = value.toString
  }

  implicit val instantFromValue: FromValue[Instant] = new FromValue[Instant] {
    override def apply(value: Any, field: Field): Instant =
      Instant.parse(value.toString)
  }

  implicit val uuidToSchema: ToSchema[UUID] = new ToSchema[UUID] {
    override val schema: Schema = Schema.create(Schema.Type.BYTES)
  }

  implicit val uuidToValue: ToValue[UUID] = new ToValue[UUID] {
    override def apply(value: UUID): ByteBuffer = toByteBuffer(value)
  }

  implicit val uuidFromValue: FromValue[UUID] = new FromValue[UUID] {
    override def apply(value: Any, field: Field): UUID =
      fromByteBuffer(value.asInstanceOf[ByteBuffer])
  }

  val CommandSerde: Serde[Command] = serdeFor[Command]

  val CommandResultSerde: Serde[CommandResult] = serdeFor[CommandResult]

  val SnapshotSerde: Serde[InvoiceSnapshot] = serdeFor[InvoiceSnapshot]

  val EventSerde: Serde[Event] = serdeFor[Event]

  def toBytes[T: SchemaFor: ToRecord](data: T): Array[Byte] = {
    val baos = new ByteArrayOutputStream
    val output = AvroOutputStream.binary[T](baos)
    output.write(data)
    output.close()
    baos.toByteArray
  }

  def fromBytes[T: SchemaFor: FromRecord](data: Array[Byte]): T = {
    val input = AvroInputStream.binary[T](data)
    input.iterator.next()
  }

  private def serdeFor[T: SchemaFor: ToRecord: FromRecord]: Serde[T] = new Serde[T] {
    override val serializer: Serializer[T] = new Serializer[T] {
      override def serialize(topic: String, data: T): Array[Byte] = toBytes(data)
      override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()
      override def close(): Unit = ()
    }
    override val deserializer: Deserializer[T] = new Deserializer[T] {
      override def deserialize(topic: String, data: Array[Byte]): T = fromBytes(data)
      override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()
      override def close(): Unit = ()
    }
    override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()
    override def close(): Unit = ()
  }
} 
Example 30
Source File: AvroCoders.scala    From scio   with Apache License 2.0
package com.spotify.scio.coders.instances

import java.io.{InputStream, OutputStream}

import com.spotify.scio.coders.{AvroCoderMacros, Coder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.{SpecificData, SpecificFixed}
import org.apache.beam.sdk.coders.Coder.NonDeterministicException
import org.apache.beam.sdk.coders.{AtomicCoder, AvroCoder, StringUtf8Coder}
import org.apache.beam.sdk.util.common.ElementByteSizeObserver

import scala.reflect.{classTag, ClassTag}

final private class SlowGenericRecordCoder extends AtomicCoder[GenericRecord] {
  // TODO: can we find something more efficient than String ?
  private[this] val sc = StringUtf8Coder.of()

  override def encode(value: GenericRecord, os: OutputStream): Unit = {
    val schema = value.getSchema
    val coder = AvroCoder.of(schema)
    sc.encode(schema.toString, os)
    coder.encode(value, os)
  }

  override def decode(is: InputStream): GenericRecord = {
    val schemaStr = sc.decode(is)
    val schema = new Schema.Parser().parse(schemaStr)
    val coder = AvroCoder.of(schema)
    coder.decode(is)
  }

  // delegate methods for determinism and equality checks
  override def verifyDeterministic(): Unit =
    throw new NonDeterministicException(
      this,
      "Coder[GenericRecord] without schema is non-deterministic"
    )
  override def consistentWithEquals(): Boolean = false
  override def structuralValue(value: GenericRecord): AnyRef =
    AvroCoder.of(value.getSchema).structuralValue(value)

  // delegate methods for byte size estimation
  override def isRegisterByteSizeObserverCheap(value: GenericRecord): Boolean =
    AvroCoder.of(value.getSchema).isRegisterByteSizeObserverCheap(value)
  override def registerByteSizeObserver(
    value: GenericRecord,
    observer: ElementByteSizeObserver
  ): Unit =
    AvroCoder.of(value.getSchema).registerByteSizeObserver(value, observer)
}


trait AvroCoders { // enclosing trait (declaration omitted in the excerpt above; assumed from the scio source)
  // TODO: Use a coder that does not serialize the schema
  def avroGenericRecordCoder(schema: Schema): Coder[GenericRecord] =
    Coder.beam(AvroCoder.of(schema))

  // XXX: similar to GenericAvroSerializer
  def avroGenericRecordCoder: Coder[GenericRecord] =
    Coder.beam(new SlowGenericRecordCoder)

  import org.apache.avro.specific.SpecificRecordBase
  implicit def genAvro[T <: SpecificRecordBase]: Coder[T] =
    macro AvroCoderMacros.staticInvokeCoder[T]

  implicit def avroSpecificFixedCoder[T <: SpecificFixed: ClassTag]: Coder[T] =
    SpecificFixedCoder[T]
} 
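A short usage sketch (the record schema is made up): when the schema is known up front, prefer the schema-aware coder over the slow generic one, since it does not ship the schema with every element.

import com.spotify.scio.coders.Coder
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericRecord

val userSchema: Schema = SchemaBuilder
  .record("User").fields()
  .requiredString("name")
  .endRecord()

// Schema-aware coder: avoids serializing the schema with every element
implicit val genericRecordCoder: Coder[GenericRecord] =
  Coder.avroGenericRecordCoder(userSchema)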
Example 31
Source File: AvroSerializer.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders.instances.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import com.twitter.chill.KSerializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import org.apache.beam.sdk.coders.AvroCoder

import scala.collection.mutable.{Map => MMap}
import scala.util.Try

private[coders] class GenericAvroSerializer extends KSerializer[GenericRecord] {
  private lazy val cache: MMap[String, AvroCoder[GenericRecord]] = MMap()

  private def getCoder(schemaStr: String): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(new Schema.Parser().parse(schemaStr)))
  private def getCoder(schemaStr: String, schema: Schema): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(schema))

  override def write(kryo: Kryo, out: Output, obj: GenericRecord): Unit = {
    val schemaStr = obj.getSchema.toString
    val coder = this.getCoder(schemaStr, obj.getSchema)
    // write schema before every record in case it's not in reader serializer's cache
    out.writeString(schemaStr)
    coder.encode(obj, out)
  }

  override def read(kryo: Kryo, in: Input, cls: Class[GenericRecord]): GenericRecord = {
    val coder = this.getCoder(in.readString())
    coder.decode(in)
  }
}

private[coders] class SpecificAvroSerializer[T <: SpecificRecordBase] extends KSerializer[T] {
  private lazy val cache: MMap[Class[T], AvroCoder[T]] = MMap()

  private def getCoder(cls: Class[T]): AvroCoder[T] =
    cache.getOrElseUpdate(
      cls,
      Try(cls.getConstructor().newInstance().getSchema)
        .map(AvroCoder.of(cls, _))
        .getOrElse(AvroCoder.of(cls))
    )

  override def write(kser: Kryo, out: Output, obj: T): Unit =
    this.getCoder(obj.getClass.asInstanceOf[Class[T]]).encode(obj, out)

  override def read(kser: Kryo, in: Input, cls: Class[T]): T =
    this.getCoder(cls).decode(in)
} 
Example 32
Source File: StorageUtil.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.bigquery

import com.google.api.services.bigquery.model.{TableFieldSchema, TableSchema}
import com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions
import org.apache.avro.Schema
import org.apache.avro.Schema.Type

import scala.jdk.CollectionConverters._


object StorageUtil {
  def tableReadOptions(
    selectedFields: List[String] = Nil,
    rowRestriction: Option[String] = None
  ): TableReadOptions =
    TableReadOptions
      .newBuilder()
      .addAllSelectedFields(selectedFields.asJava)
      .setRowRestriction(rowRestriction.getOrElse(""))
      .build()

  // https://cloud.google.com/bigquery/docs/reference/storage/
  def toTableSchema(avroSchema: Schema): TableSchema = {
    val fields = getFieldSchemas(avroSchema)

    new TableSchema().setFields(fields.asJava)
  }

  private def getFieldSchemas(avroSchema: Schema): List[TableFieldSchema] =
    avroSchema.getFields.asScala.map(toTableFieldSchema).toList

  private def toTableFieldSchema(field: Schema.Field): TableFieldSchema = {
    val schema = field.schema
    val (mode, tpe) = schema.getType match {
      case Type.UNION =>
        val types = schema.getTypes
        assert(types.size == 2 && types.get(0).getType == Type.NULL)
        ("NULLABLE", types.get(1))
      case Type.ARRAY =>
        ("REPEATED", schema.getElementType)
      case _ =>
        ("REQUIRED", schema)
    }
    val tableField = new TableFieldSchema().setName(field.name).setMode(mode)
    setRawType(tableField, tpe)
    tableField
  }

  private def setRawType(tableField: TableFieldSchema, schema: Schema): Unit = {
    val tpe = schema.getType match {
      case Type.BOOLEAN => "BOOLEAN"
      case Type.LONG =>
        schema.getLogicalType match {
          case null                                 => "INT64"
          case t if t.getName == "timestamp-micros" => "TIMESTAMP"
          case t if t.getName == "time-micros"      => "TIME"
          case t =>
            throw new IllegalStateException(s"Unsupported logical type: $t")
        }
      case Type.DOUBLE => "FLOAT64"
      case Type.BYTES =>
        schema.getLogicalType match {
          case null => "BYTES"
          case t if t.getName == "decimal" =>
            assert(schema.getObjectProp("precision").asInstanceOf[Int] == 38)
            assert(schema.getObjectProp("scale").asInstanceOf[Int] == 9)
            "NUMERIC"
          case t =>
            throw new IllegalStateException(s"Unsupported logical type: $t")
        }
      case Type.INT =>
        schema.getLogicalType match {
          case t if t.getName == "date" => "DATE"
          case t =>
            throw new IllegalStateException(s"Unsupported logical type: $t")
        }
      case Type.STRING =>
        // FIXME: schema.getLogicalType == null in this case, BigQuery service side bug?
        if (schema.getProp("logicalType") == "datetime") {
          "DATETIME"
        } else {
          schema.getLogicalType match {
            case null                          => "STRING"
            case t if t.getName == "datetime"  => "DATETIME"
            case t if t.getName == "geography" => "GEOGRAPHY"
            case t =>
              throw new IllegalStateException(s"Unsupported logical type: $t")
          }
        }
      case Type.RECORD =>
        tableField.setFields(getFieldSchemas(schema).asJava)
        "RECORD"
      case t =>
        throw new IllegalStateException(s"Unsupported type: $t")
    }
    tableField.setType(tpe)
    ()
  }
} 
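A small sketch of the conversion above (the schema is hypothetical; the mapping follows the rules in setRawType):

import com.spotify.scio.bigquery.StorageUtil
import org.apache.avro.SchemaBuilder

val avroSchema = SchemaBuilder
  .record("Event").fields()
  .requiredString("id")        // -> REQUIRED STRING
  .optionalDouble("value")     // union ["null", "double"] -> NULLABLE FLOAT64
  .endRecord()

val tableSchema = StorageUtil.toTableSchema(avroSchema)

// Restrict a BigQuery Storage API read to a single column
val readOptions = StorageUtil.tableReadOptions(selectedFields = List("id"))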
Example 33
Source File: Cache.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.bigquery.client

import java.io.File

import com.google.api.services.bigquery.model.{TableReference, TableSchema}
import com.spotify.scio.bigquery.BigQueryUtil
import org.apache.beam.sdk.io.gcp.{bigquery => bq}
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Charsets
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.Files

import scala.util.Try
import org.apache.avro.Schema

private[client] object Cache {
  sealed trait Show[T] {
    def show(t: T): String
  }

  object Show {
    @inline final def apply[T](implicit t: Show[T]): Show[T] = t

    implicit val showTableSchema: Show[TableSchema] = new Show[TableSchema] {
      override def show(t: TableSchema): String = t.toPrettyString()
    }

    implicit val showTableRef: Show[TableReference] = new Show[TableReference] {
      override def show(table: TableReference): String =
        bq.BigQueryHelpers.toTableSpec(table)
    }

    implicit val showAvroSchema: Show[Schema] = new Show[Schema] {
      override def show(t: Schema): String = t.toString()
    }
  }

  sealed trait Read[T] {
    def read(s: String): Option[T]
  }

  object Read {
    @inline final def apply[T](implicit t: Read[T]): Read[T] = t

    implicit val readTableSchema: Read[TableSchema] = new Read[TableSchema] {
      override def read(s: String): Option[TableSchema] =
        Try(BigQueryUtil.parseSchema(s)).toOption
    }

    implicit val readTableRef: Read[TableReference] = new Read[TableReference] {
      override def read(table: String): Option[TableReference] =
        Try(bq.BigQueryHelpers.parseTableSpec(table)).toOption
    }

    implicit val readAvroSchema: Read[Schema] = new Read[Schema] {
      override def read(s: String): Option[Schema] =
        Try {
          new Schema.Parser().parse(s)
        }.toOption
    }
  }

  private[this] def isCacheEnabled: Boolean = BigQueryConfig.isCacheEnabled

  def getOrElse[T: Read: Show](key: String, f: String => File)(method: => T): T =
    if (isCacheEnabled) {
      get(key, f) match {
        case Some(schema) => schema
        case None =>
          val schema = method
          set(key, schema, f)
          schema
      }
    } else {
      method
    }

  def set[T: Show](key: String, t: T, f: String => File): Unit =
    Files
      .asCharSink(f(key), Charsets.UTF_8)
      .write(Show[T].show(t))

  def get[T: Read](key: String, f: String => File): Option[T] =
    Try(scala.io.Source.fromFile(f(key)).mkString).toOption.flatMap(Read[T].read)

  val SchemaCache: String => File = key => cacheFile(key, ".schema.json")

  val TableCache: String => File = key => cacheFile(key, ".table.txt")

  private[this] def cacheFile(key: String, suffix: String): File = {
    val cacheDir = BigQueryConfig.cacheDirectory
    val filename = Hashing.murmur3_128().hashString(key, Charsets.UTF_8).toString + suffix
    val cacheFile = cacheDir.resolve(filename).toFile()
    Files.createParentDirs(cacheFile)
    cacheFile
  }
} 
Example 34
Source File: GroupByBenchmark.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.jmh

import com.spotify.scio.{ScioContext, ScioExecutionContext}
import com.spotify.scio.avro._
import com.spotify.scio.coders._
import org.apache.beam.sdk.coders.{KvCoder, Coder => BCoder}
import org.apache.beam.sdk.values.KV
import org.apache.beam.sdk.transforms.GroupByKey
import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory}
import java.util.concurrent.TimeUnit

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.openjdk.jmh.annotations._

import scala.jdk.CollectionConverters._
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
class GroupByBenchmark {
  val schema =
    """
      {
        "type": "record",
        "name": "Event",
        "namespace": "smbjoin",
        "fields": [
          {
            "name": "id",
            "type": "string"
          },
          {
            "name": "value",
            "type": "double"
          }
        ]
      }
    """

  val avroSchema =
    new Schema.Parser().parse(schema)

  private def runWithContext[T](fn: ScioContext => T): ScioExecutionContext = {
    val opts = PipelineOptionsFactory.as(classOf[PipelineOptions])
    val sc = ScioContext(opts)
    fn(sc)
    sc.run()
  }

  val source = "src/test/resources/events-10000-0.avro"
  implicit val coderGenericRecord: Coder[GenericRecord] =
    Coder.avroGenericRecordCoder(avroSchema)

  val charCoder = CoderMaterializer.beamWithDefault(Coder[Char])
  val doubleCoder = CoderMaterializer.beamWithDefault(Coder[Double])
  val kvCoder: BCoder[KV[Char, Double]] = KvCoder.of(charCoder, doubleCoder)

  @Benchmark
  def testScioGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.avroFile(source, schema = avroSchema)
        .map(rec => (rec.get("id").toString.head, rec.get("value").asInstanceOf[Double]))
        .groupByKey
    }

  @Benchmark
  def testBeamGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.wrap {
        sc.avroFile(source, schema = avroSchema)
          .map { rec =>
            KV.of(rec.get("id").toString.head, rec.get("value").asInstanceOf[Double])
          }
          .internal
          .setCoder(kvCoder)
          .apply(GroupByKey.create[Char, Double])
      }.map(kv => (kv.getKey, kv.getValue.asScala))
    }
} 
Example 35
Source File: AvroConverters.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery

import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord

import scala.jdk.CollectionConverters._

object AvroConverters extends ToTableRow with ToTableSchema {

  @experimental
  def toTableRow[T <: IndexedRecord](record: T): TableRow = {
    val row = new TableRow

    record.getSchema.getFields.asScala.foreach { field =>
      Option(record.get(field.pos)).foreach { fieldValue =>
        row.set(field.name, toTableRowField(fieldValue, field))
      }
    }

    row
  }

  
  @experimental
  def toTableSchema(avroSchema: Schema): TableSchema = {
    val fields = getFieldSchemas(avroSchema)

    new TableSchema().setFields(fields.asJava)
  }

  final case class AvroConversionException(
    private val message: String,
    private val cause: Throwable = null
  ) extends Exception(message, cause)
} 
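A usage sketch (the record is hypothetical; GenericData.Record implements IndexedRecord, so both converters apply):

import com.spotify.scio.extra.bigquery.AvroConverters
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData

val schema = SchemaBuilder
  .record("User").namespace("example.avro").fields()
  .requiredString("name")
  .endRecord()

val user = new GenericData.Record(schema)
user.put("name", "Alyssa")

val tableSchema = AvroConverters.toTableSchema(schema) // BigQuery TableSchema
val tableRow = AvroConverters.toTableRow(user)         // BigQuery TableRow with "name" -> "Alyssa"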
Example 36
Source File: SCollectionSyntax.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery.syntax

import com.google.api.services.bigquery.model.TableReference
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.BigQueryTable.WriteParam
import com.spotify.scio.bigquery.{BigQueryTable, Table, TableRow}
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.util.ScioUtil
import com.spotify.scio.values.SCollection
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{CreateDisposition, WriteDisposition}

import scala.reflect.ClassTag

trait SCollectionSyntax {
  implicit def toAvroToBigQuerySCollection[T <: IndexedRecord: ClassTag](
    data: SCollection[T]
  ): AvroToBigQuerySCollectionOps[T] = new AvroToBigQuerySCollectionOps[T](data)
}

final class AvroToBigQuerySCollectionOps[T <: IndexedRecord: ClassTag](
  private val self: SCollection[T]
) extends Serializable {
  import com.spotify.scio.extra.bigquery.AvroConverters._

  
  @experimental
  def saveAvroAsBigQuery(
    table: TableReference,
    avroSchema: Schema = null,
    writeDisposition: WriteDisposition = null,
    createDisposition: CreateDisposition = null,
    tableDescription: String = null
  ): ClosedTap[TableRow] = {
    val schema: Schema = Option(avroSchema)
      .getOrElse {
        val cls = ScioUtil.classOf[T]
        if (classOf[IndexedRecord] isAssignableFrom cls) {
          cls.getMethod("getClassSchema").invoke(null).asInstanceOf[Schema]
        } else {
          throw AvroConversionException("Could not invoke $SCHEMA on provided Avro type")
        }
      }

    val params =
      WriteParam(toTableSchema(schema), writeDisposition, createDisposition, tableDescription)
    self
      .map(toTableRow(_))
      .write(BigQueryTable(Table.Ref(table)))(params)
  }
} 
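A sketch of the syntax in use, assuming `users` is an existing SCollection of a hypothetical Avro-generated SpecificRecord class `UserRecord` and that the package object re-exports the implicit conversion above:

// Assumed to bring the implicit AvroToBigQuerySCollectionOps conversion into scope:
import com.spotify.scio.extra.bigquery._
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{CreateDisposition, WriteDisposition}

// users: SCollection[UserRecord]
users.saveAvroAsBigQuery(
  BigQueryHelpers.parseTableSpec("my-project:my_dataset.users"),
  writeDisposition = WriteDisposition.WRITE_TRUNCATE,
  createDisposition = CreateDisposition.CREATE_IF_NEEDED
)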
Example 37
Source File: Registry.scala    From tamer   with MIT License 5 votes vote down vote up
package tamer
package registry

import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient
import log.effect.LogWriter
import log.effect.zio.ZioLogWriter.log4sFromName
import org.apache.avro.{Schema, SchemaValidatorBuilder}
import zio.{RIO, Task}

import scala.jdk.CollectionConverters._

trait Registry extends Serializable {
  val registry: Registry.Service[Any]
}

object Registry {
  trait Service[R] {
    def getOrRegisterId(subject: String, schema: Schema): RIO[R, Int]
    def verifySchema(id: Int, schema: Schema): RIO[R, Unit]
  }

  object > extends Service[Registry] {
    override final def getOrRegisterId(subject: String, schema: Schema): RIO[Registry, Int] = RIO.accessM(_.registry.getOrRegisterId(subject, schema))
    override final def verifySchema(id: Int, schema: Schema): RIO[Registry, Unit]           = RIO.accessM(_.registry.verifySchema(id, schema))
  }

  trait Live extends Registry {
    val client: SchemaRegistryClient
    override final val registry: Service[Any] = new Service[Any] {
      private[this] final val logTask: Task[LogWriter[Task]] = log4sFromName.provide("tamer.Registry.Live")
      private[this] final val strategy                       = new SchemaValidatorBuilder().canReadStrategy().validateLatest()
      private[this] final def validate(toValidate: Schema, writerSchema: Schema): Task[Unit] =
        Task(strategy.validate(toValidate, List(writerSchema).asJava))

      override final def getOrRegisterId(subject: String, schema: Schema): Task[Int] =
        for {
          log <- logTask
          id <-
            Task(client.getId(subject, schema)).tap(id => log.debug(s"retrieved existing writer schema id: $id")) <>
              Task(client.register(subject, schema)).tap(id => log.info(s"registered with id $id new subject $subject writer schema $schema"))
        } yield id
      override final def verifySchema(id: Int, schema: Schema): Task[Unit] =
        for {
          log          <- logTask
          writerSchema <- Task(client.getById(id)).tap(_ => log.debug(s"retrieved writer schema id: $id"))
          _            <- validate(schema, writerSchema).tapError(t => log.error(s"schema supplied cannot read payload: ${t.getLocalizedMessage}"))
        } yield ()
    }
  }
} 
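A hedged usage sketch of the module above, assuming a ZIO 1.x runtime, a Confluent CachedSchemaRegistryClient, and a made-up subject and schema (tamer itself wires this differently):

import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import org.apache.avro.SchemaBuilder
import zio.Runtime

val live: Registry = new Registry.Live {
  override val client: SchemaRegistryClient =
    new CachedSchemaRegistryClient("http://localhost:8081", 100)
}

val eventSchema = SchemaBuilder.record("Event").fields().requiredString("id").endRecord()

// Register (or look up) the writer schema id for the subject
val schemaId: Int = Runtime.default.unsafeRun(
  Registry.>.getOrRegisterId("events-value", eventSchema).provide(live)
)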
Example 38
Source File: AvroUtils.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.avro

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

import scala.jdk.CollectionConverters._

object AvroUtils {
  private def f(name: String, tpe: Schema.Type) =
    new Schema.Field(
      name,
      Schema.createUnion(List(Schema.create(Schema.Type.NULL), Schema.create(tpe)).asJava),
      null: String,
      null: AnyRef
    )

  private def fArr(name: String, tpe: Schema.Type) =
    new Schema.Field(name, Schema.createArray(Schema.create(tpe)), null: String, null: AnyRef)

  val schema = Schema.createRecord("GenericTestRecord", null, null, false)
  schema.setFields(
    List(
      f("int_field", Schema.Type.INT),
      f("long_field", Schema.Type.LONG),
      f("float_field", Schema.Type.FLOAT),
      f("double_field", Schema.Type.DOUBLE),
      f("boolean_field", Schema.Type.BOOLEAN),
      f("string_field", Schema.Type.STRING),
      fArr("array_field", Schema.Type.STRING)
    ).asJava
  )

  def newGenericRecord(i: Int): GenericRecord = {
    val r = new GenericData.Record(schema)
    r.put("int_field", 1 * i)
    r.put("long_field", 1L * i)
    r.put("float_field", 1f * i)
    r.put("double_field", 1.0 * i)
    r.put("boolean_field", true)
    r.put("string_field", "hello")
    r.put("array_field", List[CharSequence]("a", "b", "c").asJava)
    r
  }

  def newSpecificRecord(i: Int): TestRecord =
    new TestRecord(
      i,
      i.toLong,
      i.toFloat,
      i.toDouble,
      true,
      "hello",
      List[CharSequence]("a", "b", "c").asJava
    )
} 
Example 39
Source File: Sedes.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.io.ByteArrayInputStream

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io._
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._

trait Sedes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

class DoubleSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toLong(bytes, start)
  }
} 
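The trait is easy to extend; below is a sketch (not part of shc) of an Avro-backed Sedes that serializes GenericRecords against a fixed schema:

import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

class AvroSedes(schema: Schema) extends Sedes {
  override def serialize(value: Any): Array[Byte] = {
    val out = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    // Encode the record with Avro's binary encoding
    new GenericDatumWriter[GenericRecord](schema).write(value.asInstanceOf[GenericRecord], encoder)
    encoder.flush()
    out.toByteArray
  }

  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    val decoder = DecoderFactory.get().binaryDecoder(bytes, start, end - start, null)
    new GenericDatumReader[GenericRecord](schema).read(null, decoder)
  }
}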
Example 40
Source File: BasicTest.scala    From ksql-streams   with Apache License 2.0 5 votes vote down vote up
package com.landoop.kstreams.sql.cluster

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.kafka.clients.producer.ProducerRecord

class BasicTest extends ClusterTestingCapabilities {

  private def createAvroRecord = {
    val userSchema = "{\"namespace\": \"example.avro\", \"type\": \"record\", " + "\"name\": \"User\"," + "\"fields\": [{\"name\": \"name\", \"type\": \"string\"}]}"
    val parser = new Schema.Parser
    val schema = parser.parse(userSchema)
    val avroRecord = new GenericData.Record(schema)
    avroRecord.put("name", "testUser")
    avroRecord
  }

  "KCluster" should {
    "start up and be able to handle avro records being sent " in {
      val topic = "testAvro"
      val avroRecord = createAvroRecord
      val objects = Array[AnyRef](avroRecord)
      val producerProps = stringAvroProducerProps
      val producer = createProducer(producerProps)

      for (o <- objects) {
        val message = new ProducerRecord[String, Any](topic, o)
        producer.send(message)
      }
      val consumerProps = stringAvroConsumerProps()
      val consumer = createStringAvroConsumer(consumerProps)
      val records = consumeStringAvro(consumer, topic, objects.length)
      objects.toSeq shouldBe records
    }

    "handle the avro new producer" in {
      val topic = "testAvro"
      val avroRecord = createAvroRecord
      val objects = Array[Any](avroRecord, true, 130, 345L, 1.23f, 2.34d, "abc", "def".getBytes)
      val producerProps = stringAvroProducerProps
      val producer = createProducer(producerProps)
      for (o <- objects) {
        producer.send(new ProducerRecord[String, Any](topic, o))
      }
      val consumerProps = stringAvroConsumerProps()
      val consumer = createStringAvroConsumer(consumerProps)
      val records = consumeStringAvro(consumer, topic, objects.length)
      objects.deep shouldBe records.toArray.deep
    }
  }

} 
Example 41
Source File: JsonToAvroConverter.scala    From ksql-streams   with Apache License 2.0 5 votes vote down vote up
package com.landoop.kstreams.sql.transform

import java.util

import com.fasterxml.jackson.databind.node._
import com.sksamuel.avro4s.ScaleAndPrecision
import io.confluent.kafka.serializers
import io.confluent.kafka.serializers.NonRecordContainer
import org.apache.avro.generic.GenericContainer
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.{LogicalTypes, Schema}

class JsonToAvroConverter(namespace: String, avroStringTypeIsString: Boolean = false) {

  import org.json4s._
  import org.json4s.native.JsonMethods._

  def convert(name: String, str: String)
             (implicit schema: Option[Schema], sp: ScaleAndPrecision): GenericContainer = convert(name, parse(str))

  def convert(name: String, value: JValue)
             (implicit aggregatedSchema: Option[Schema], sp: ScaleAndPrecision): GenericContainer = {
    value match {
      case JArray(arr) =>
        val values = new java.util.ArrayList[AnyRef]()
        val prevSchema = aggregatedSchema.map(_.getField(name)).map(_.schema)
        val result = convert(name, arr.head)(prevSchema, sp)
        result match {
          case n: NonRecordContainer => values.add(n.getValue)
          case _ => values.add(result)
        }
        arr.tail.foreach { v =>
          convert(name, v)(prevSchema, sp) match {
            case n: NonRecordContainer => values.add(n.getValue)
            case other => values.add(other)
          }
        }

        new NonRecordContainer(Schema.createArray(result.getSchema), values)
      case JBool(b) =>
        new NonRecordContainer(Schema.create(Schema.Type.BOOLEAN), b)
      case JDecimal(d) =>
        val schema = Schema.create(Schema.Type.BYTES)
        val decimal = LogicalTypes.decimal(sp.precision, sp.scale)
        decimal.addToSchema(schema)

        new serializers.NonRecordContainer(schema, d.bigDecimal.unscaledValue().toByteArray)
      case JDouble(d) =>
        new serializers.NonRecordContainer(Schema.create(Schema.Type.DOUBLE), d)
      case JInt(i) =>
        new serializers.NonRecordContainer(Schema.create(Schema.Type.LONG), i.toLong)
      case JLong(l) =>
        new serializers.NonRecordContainer(Schema.create(Schema.Type.LONG), l)
      case JNothing =>
        new NonRecordContainer(Schema.create(Schema.Type.NULL), null)
      case JNull =>
        val schema = Schema.createUnion(java.util.Arrays.asList(Schema.create(Schema.Type.NULL), createStringSchema))
        new serializers.NonRecordContainer(schema, null)
      case JString(s) =>
        val schema = createStringSchema
        new serializers.NonRecordContainer(schema, s)
      case JObject(values) =>
        val schema = Schema.createRecord(name, "", namespace, false)
        val fields = new util.ArrayList[Schema.Field]()
        val default: AnyRef = null
        val fieldsMap = values.map { case (n, v) =>
          val prevSchema = aggregatedSchema.map(_.getField(n)).map(_.schema())
          val result = convert(n, v)(prevSchema, sp)

          //schema.setFields(java.util.Arrays.asList()))
          fields.add(new Schema.Field(n, result.getSchema, "", default))
          n -> result
        }.toMap

        import scala.collection.JavaConversions._
        aggregatedSchema
          .foreach { schema =>
            schema.getFields
              .withFilter(f => !fieldsMap.contains(f.name()))
              .foreach { f =>
                fields.add(new Schema.Field(f.name(), f.schema(), "", default))
              }
          }

        schema.setFields(fields)
        val record = new Record(schema)
        fieldsMap.foreach {
          case (field, v: NonRecordContainer) => record.put(field, v.getValue)
          case (field, v: GenericContainer) => record.put(field, v)
        }

        record
    }
  }

  private def createStringSchema = {
    val schema = Schema.create(Schema.Type.STRING)
    if (avroStringTypeIsString) schema.addProp("avro.java.string", new TextNode("String"))
    schema
  }
} 
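A small usage sketch (assumptions: avro4s 1.x ScaleAndPrecision, no pre-existing schema, and a made-up JSON payload):

import com.sksamuel.avro4s.ScaleAndPrecision
import org.apache.avro.Schema

val converter = new JsonToAvroConverter(namespace = "example.avro")

implicit val scaleAndPrecision: ScaleAndPrecision = ScaleAndPrecision(scale = 2, precision = 10)
implicit val priorSchema: Option[Schema] = None

val container = converter.convert("User", """{"name":"Alyssa","age":7}""")
// container is a GenericData.Record; "age" is inferred as an Avro long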
Example 42
Source File: StdAvroModelFactoryTest.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.factory.avro

import com.eharmony.aloha.audit.impl.avro.Score
import com.eharmony.aloha.factory.ModelFactory
import com.eharmony.aloha.io.vfs.Vfs1
import com.eharmony.aloha.models.Model
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.commons.io.IOUtils
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.BlockJUnit4ClassRunner

import scala.util.Try


@RunWith(classOf[BlockJUnit4ClassRunner])
class StdAvroModelFactoryTest { // enclosing test class (its @Test methods are omitted in this excerpt)
  private[this] def record = {
    val r = new GenericData.Record(TheSchema)
    r.put("req_str_1", "smart handsome stubborn")
    r
  }
}

object StdAvroModelFactoryTest {
  private lazy val TheSchema = {
    val is = getClass.getClassLoader.getResourceAsStream(SchemaUrlResource)
    try new Schema.Parser().parse(is) finally IOUtils.closeQuietly(is)
  }

  private val ExpectedResult = 7d

  private val SchemaUrlResource = "avro/class7.avpr"

  private val SchemaUrl = s"res:$SchemaUrlResource"

  private val SchemaFile = new java.io.File(getClass.getClassLoader.getResource(SchemaUrlResource).getFile)

  private val SchemaVfs1FileObject = org.apache.commons.vfs.VFS.getManager.resolveFile(SchemaUrl)

  private val SchemaVfs2FileObject = org.apache.commons.vfs2.VFS.getManager.resolveFile(SchemaUrl)

  private val Imports = Seq("com.eharmony.aloha.feature.BasicFunctions._", "scala.math._")

  private val ReturnType = "Double"

  private val ModelJson =
    """
      |{
      |  "modelType": "Regression",
      |  "modelId": { "id": 0, "name": "" },
      |  "features" : {
      |    "my_attributes": "${req_str_1}.split(\"\\\\W+\").map(v => (s\"=$v\", 1.0))"
      |  },
      |  "weights": {
      |    "my_attributes=handsome": 1,
      |    "my_attributes=smart": 2,
      |    "my_attributes=stubborn": 4
      |  }
      |}
    """.stripMargin
} 
Example 43
Source File: TestAvroClass.scala    From embedded-kafka-schema-registry   with MIT License 5 votes vote down vote up
package net.manub.embeddedkafka.schemaregistry

import org.apache.avro.specific.SpecificRecordBase
import org.apache.avro.{AvroRuntimeException, Schema}

case class TestAvroClass(var name: String) extends SpecificRecordBase {
  def this() = this("")

  override def get(i: Int): AnyRef =
    i match {
      case 0 => name
      case _ => throw new AvroRuntimeException("Bad index")
    }

  override def put(i: Int, v: scala.Any): Unit =
    i match {
      case 0 =>
        name = v match {
          case (utf8: org.apache.avro.util.Utf8) => utf8.toString
          case _                                 => v.asInstanceOf[String]
        }
      case _ => throw new AvroRuntimeException("Bad index")
    }

  override def getSchema: Schema = TestAvroClass.avroSchema
}

object TestAvroClass {
  val avroSchema =
    (new Schema.Parser)
      .parse("""
               |{"namespace": "net.manub.embeddedkafka.schemaregistry",
               | "type": "record",
               | "name": "TestAvroClass",
               | "fields": [
               |   {"name": "name", "type": "string"}
               | ]
               |}
              """.stripMargin)
} 
Example 45
Source File: AvroCodecsSpecification.scala    From kafka-scala-api   with Apache License 2.0 5 votes vote down vote up
package com.example.avro

import org.scalatest._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

class GenericAvroCodecsSpecification extends WordSpec with Matchers {
  val testSchema = new Schema.Parser().parse("""{
                                                   "type":"record",
                                                   "name":"FiscalRecord",
                                                   "namespace":"avro",
                                                   "fields":[
                                                      {
                                                         "name":"calendarDate",
                                                         "type":"string"
                                                      },
                                                      {
                                                         "name":"fiscalWeek",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      },
                                                      {
                                                         "name":"fiscalYear",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      }
                                                   ]
                                                }""")

  "Generic Avro codec" should {

    "Round trip generic record using Generic Injection" in {
      implicit val genericInjection = GenericAvroCodecs[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Binary Injection" in {
      implicit val genericBinaryInjection = GenericAvroCodecs.toBinary[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Json Injection" in {
      implicit val genericJsonInjection = GenericAvroCodecs.toJson[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val jsonString = Injection[GenericRecord, String](testRecord)
      val attempt = Injection.invert[GenericRecord, String](jsonString)
      assert(attempt.get == testRecord)
    }
  }

  def buildGenericAvroRecord(i: (String, Int, Int)): GenericRecord = {

    val fiscalRecord = new GenericData.Record(testSchema)
    fiscalRecord.put("calendarDate", i._1)
    fiscalRecord.put("fiscalWeek", i._2)
    fiscalRecord.put("fiscalYear", i._3)
    fiscalRecord
  }
} 
Example 46
Source File: AvroRecord.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase.examples

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.execution.datasources.hbase.types.{AvroSerde, SchemaConverters}

object AvroRecord {
  def main(args: Array[String]) {
    //Test avro to schema converterBasic setup
    val schemaString =
      """{"namespace": "example.avro",
        |   "type": "record", "name": "User",
        |    "fields": [ {"name": "name", "type": "string"},
        |      {"name": "favorite_number",  "type": ["int", "null"]},
        |        {"name": "favorite_color", "type": ["string", "null"]} ] }""".stripMargin

    val avroSchema: Schema = {
      val p = new Schema.Parser
      p.parse(schemaString)
    }
    val user1 = new GenericData.Record(avroSchema)
    user1.put("name", "Alyssa")
    user1.put("favorite_number", 256)

    val user2 = new GenericData.Record(avroSchema)
    user2.put("name", "Ben")
    user2.put("favorite_number", 7)
    user2.put("favorite_color", "red")

    val sqlUser1 = SchemaConverters.createConverterToSQL(avroSchema)(user1)
    println(sqlUser1)
    val schema = SchemaConverters.toSqlType(avroSchema)
    println(s"\nSqlschema: $schema")
    val avroUser1 = SchemaConverters.createConverterToAvro(schema.dataType, "avro", "example.avro")(sqlUser1)
    val avroByte = AvroSerde.serialize(avroUser1, avroSchema)
    val avroUser11 = AvroSerde.deserialize(avroByte, avroSchema)
    println(s"$avroUser1")
  }
} 
Example 47
Source File: LinearAlgebraLibrarySuite.scala    From aardpfark   with Apache License 2.0 5 votes vote down vote up
package com.ibm.aardpfark.pfa.functions

import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.document.PFABuilder
import org.apache.avro.Schema

class LinearAlgebraLibrarySuite extends FunctionLibrarySuite {

  test("Linear algebra add") {
    val action = la.add(inputExpr, NewArray[Double](Seq(-1.0, 1.0, 4.0)))

    val pfaDoc = new PFABuilder()
      .withInput(doubleArraySchema)
      .withOutput(doubleArraySchema)
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[1.0, 10.0, -3.0]"))
    assert(engine.jsonOutput(result) == "[0.0,11.0,1.0]")
  }

  test("Linear algebra dot - matrix / matrix") {
    val action = la.dot(inputExpr, inputExpr)

    val pfaDoc = new PFABuilder()
      .withInput(Schema.createArray(doubleArraySchema))
      .withOutput(Schema.createArray(doubleArraySchema))
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[[0.0, 1.0], [2.0, 1.0]]"))
    assert(engine.jsonOutput(result) == "[[2.0,1.0],[2.0,3.0]]")
  }

  test("Linear algebra dot - matrix / vector") {
    val action = la.dot(inputExpr, NewArray[Double](Seq(-1.0, 1.0)))

    val pfaDoc = new PFABuilder()
      .withInput(Schema.createArray(doubleArraySchema))
      .withOutput(doubleArraySchema)
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[[0.0, 1.0], [2.0, 1.0]]"))
    assert(engine.jsonOutput(result) == "[1.0,-1.0]")
  }

  test("Linear algebra scale") {
    val action = la.scale(inputExpr, 0.5)

    val pfaDoc = new PFABuilder()
      .withInput(Schema.createArray(doubleArraySchema))
      .withOutput(Schema.createArray(doubleArraySchema))
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[[0.0, 1.0], [2.0, 1.0]]"))
    assert(engine.jsonOutput(result) == "[[0.0,0.5],[1.0,0.5]]")
  }

  test("Linear algebra sub") {
    val action = la.sub(inputExpr, NewArray[Double](Seq(-1.0, 1.0, 4.0)))

    val pfaDoc = new PFABuilder()
      .withInput(doubleArraySchema)
      .withOutput(doubleArraySchema)
      .withAction(action)
      .pfa

    val engine = getPFAEngine(pfaDoc.toJSON())
    val result = engine.action(engine.jsonInput("[1.0, 10.0, -3.0]"))
    assert(engine.jsonOutput(result) == "[2.0,9.0,-7.0]")
  }

} 
Example 48
Source File: GenericAvroSerializerSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
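The test above relies on Spark's built-in Avro/Kryo support; a configuration sketch (the schema is hypothetical):

import org.apache.avro.SchemaBuilder
import org.apache.spark.SparkConf

val schema = SchemaBuilder
  .record("testRecord").fields()
  .requiredString("data")
  .endRecord()

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Pre-registering schemas lets Spark ship a small fingerprint instead of the full schema
  .registerAvroSchemas(schema)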
Example 49
Source File: Schemas.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool

import com.google.api.client.json.JsonObjectParser
import com.google.api.client.json.jackson2.JacksonFactory
import com.google.api.services.bigquery.model.TableSchema
import com.google.common.base.Charsets
import org.apache.avro.Schema

object Schemas {

  val avroSchema: Schema =
    new Schema.Parser().parse(this.getClass.getResourceAsStream("/schema.avsc"))
  val simpleAvroSchema: Schema =
    new Schema.Parser().parse(this.getClass.getResourceAsStream("/SimpleRecord.avsc"))
  val evolvedSimpleAvroSchema: Schema =
    new Schema.Parser().parse(this.getClass.getResourceAsStream("/EvolvedSimpleRecord.avsc"))

  val simpleAvroByteFieldSchema: Schema =
    new Schema.Parser().parse(this.getClass.getResourceAsStream("/SimpleByteFieldRecord.avsc"))

  val tableSchema: TableSchema = new JsonObjectParser(new JacksonFactory)
    .parseAndClose(
      this.getClass.getResourceAsStream("/schema.json"),
      Charsets.UTF_8,
      classOf[TableSchema])

} 
Example 50
Source File: AvroIO.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.io

import java.io.{File, InputStream, OutputStream}
import java.nio.ByteBuffer
import java.nio.channels.SeekableByteChannel

import com.google.common.io.ByteStreams
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DatumReader, DatumWriter}
import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.io.fs.MatchResult.Metadata

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag


object AvroIO { // enclosing object (other members, e.g. createDatumWriter, are omitted in this excerpt)
  def writeToOutputStream[T: ClassTag](data: Iterable[T],
                                       schema: Schema,
                                       os: OutputStream): Unit = {
    val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os)
    data.foreach(fileWriter.append)
    fileWriter.close()
  }

  def getAvroSchemaFromFile(path: String): Schema = {
    require(FileStorage(path).exists, s"File `$path` does not exist!")
    val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro"))
    require(files.nonEmpty, s"File `$path` does not contain avro files")
    val reader = new GenericDatumReader[GenericRecord]()
    val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader)
    dfr.getSchema
  }

  private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput {
    require(meta.isReadSeekEfficient)
    private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel]
    override def read(b: Array[Byte], off: Int, len: Int): Int =
      in.read(ByteBuffer.wrap(b, off, len))
    override def tell(): Long = in.position()
    override def length(): Long = in.size()
    override def seek(p: Long): Unit = in.position(p)
    override def close(): Unit = in.close()
  }

} 
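A sketch of writing a few GenericRecords through the helper above (file path and schema are made up):

import java.io.FileOutputStream

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}

val schema = SchemaBuilder.record("User").fields().requiredString("name").endRecord()

val records: Seq[GenericRecord] = (1 to 3).map { i =>
  val r = new GenericData.Record(schema)
  r.put("name", s"user-$i")
  r
}

val os = new FileOutputStream("/tmp/users.avro")
AvroIO.writeToOutputStream(records, schema, os)
// AvroIO.getAvroSchemaFromFile can then recover the writer schema from the produced file(s)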
Example 51
Source File: GenericAvroSerializerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") { // schema compression and decompression
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") { // record serialization and deserialization
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }
  // uses the schema fingerprint to decrease message size
  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") { // caches previously seen schemas
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 52
Source File: AvroFieldTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.cli.gen

import com.salesforce.op.cli.gen.AvroField._
import com.salesforce.op.test.TestCommon
import org.apache.avro.Schema
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec}

import scala.collection.JavaConverters._
import scala.language.postfixOps


@RunWith(classOf[JUnitRunner])
class AvroFieldTest extends FlatSpec with TestCommon with Assertions {

  Spec[AvroField] should "do from" in {
    val types = List(
      Schema.Type.STRING,
      //  Schema.Type.BYTES, // somehow this avro type is not covered (yet)
      Schema.Type.INT,
      Schema.Type.LONG,
      Schema.Type.FLOAT,
      Schema.Type.DOUBLE,
      Schema.Type.BOOLEAN
    )
    val simpleSchemas = types map Schema.create

    val unions = List(
      Schema.createUnion((Schema.Type.NULL::Schema.Type.INT::Nil) map Schema.create asJava),
      Schema.createUnion((Schema.Type.INT::Schema.Type.NULL::Nil) map Schema.create asJava)
    )

    val enum = Schema.createEnum("Aliens", "undocumented", "outer",
      List("Edgar_the_Bug", "Boris_the_Animal", "Laura_Vasquez") asJava)

    val allSchemas = (enum::unions)++simpleSchemas // NULL does not work

    val fields = allSchemas.zipWithIndex map {
      case (s, i) => new Schema.Field("x" + i, s, "Who", null: Object)
    }

    val expected = List(
      AEnum(fields(0), isNullable = false),
      AInt(fields(1), isNullable = true),
      AInt(fields(2), isNullable = true),
      AString(fields(3), isNullable = false),
      AInt(fields(4), isNullable = false),
      ALong(fields(5), isNullable = false),
      AFloat(fields(6), isNullable = false),
      ADouble(fields(7), isNullable = false),
      ABoolean(fields(8), isNullable = false)
    )

    an[IllegalArgumentException] should be thrownBy {
      val nullSchema = Schema.create(Schema.Type.NULL)
      val nullField = new Schema.Field("xxx", null, "Nobody", null: Object)
      AvroField from nullField
    }

    fields.size shouldBe expected.size

    for {
      (field, expected) <- fields zip expected
    } {
      val actual = AvroField from field
      actual shouldBe expected
    }
  }

} 
Example 53
Source File: CSVAutoReadersTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.readers

import com.salesforce.op.test.PassengerSparkFixtureTest
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

import scala.collection.JavaConverters._


@RunWith(classOf[JUnitRunner])
class CSVAutoReadersTest extends FlatSpec with PassengerSparkFixtureTest {

  private val expectedSchema = new Schema.Parser().parse(resourceFile(name = "PassengerAuto.avsc"))
  private val allFields = expectedSchema.getFields.asScala.map(_.name())
  private val keyField: String = allFields.head

  Spec[CSVAutoReader[_]] should "read in data correctly and infer schema" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvWithHeaderPath),
      key = _.get(keyField).toString
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema
  }

  it should "read in data correctly and infer schema based with headers provided" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvPath),
      key = _.get(keyField).toString,
      headers = allFields
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema

  }

} 
Example 54
Source File: HttpSchemaRegistrySpec.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.kafka

import io.amient.affinity.avro.HttpSchemaRegistry
import io.amient.affinity.avro.HttpSchemaRegistry.HttpAvroConf
import io.amient.affinity.avro.record.AvroRecord
import io.amient.affinity.avro.record.AvroSerde.AvroConf
import org.apache.avro.Schema
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.JavaConverters._

object SimpleEnum extends Enumeration {
  type SimpleEnum = Value
  val A, B, C = Value
}

case class SimpleKey(val id: Int) extends AvroRecord {
  override def hashCode(): Int = id.hashCode()
}

case class SimpleRecord(val id: SimpleKey = SimpleKey(0), val side: SimpleEnum.Value = SimpleEnum.A, val seq: Seq[SimpleKey] = Seq()) extends AvroRecord{
  override def hashCode(): Int = id.hashCode()
}

case class CompositeRecord(
                   val items: Seq[SimpleRecord] = Seq(),
                   val index: Map[String, SimpleRecord] = Map(),
                   val setOfPrimitives: Set[Long] = Set() ) extends AvroRecord


class HttpSchemaRegistrySpec extends FlatSpec with Matchers with EmbeddedConfluentRegistry {

  override def numPartitions = 1

  behavior of "HttpSchemaRegistry"

  val serde = new HttpSchemaRegistry(HttpAvroConf(Map(
    HttpAvroConf(AvroConf).HttpSchemaRegistryUrl.path -> registryUrl
  ).asJava))

  serde.register[SimpleKey]
  serde.register[SimpleRecord]
  val v1schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Record\",\"namespace\":\"io.amient.affinity.kafka\",\"fields\":[{\"name\":\"items\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"SimpleRecord\",\"fields\":[{\"name\":\"id\",\"type\":{\"type\":\"record\",\"name\":\"SimpleKey\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]},\"default\":{\"id\":0}},{\"name\":\"side\",\"type\":{\"type\":\"enum\",\"name\":\"SimpleEnum\",\"symbols\":[\"A\",\"B\",\"C\"]},\"default\":\"A\"},{\"name\":\"seq\",\"type\":{\"type\":\"array\",\"items\":\"SimpleKey\"},\"default\":[]}]}},\"default\":[]},{\"name\":\"removed\",\"type\":\"int\",\"default\":0}]}")
  serde.register[CompositeRecord](v1schema)

  it should "allow compatible version of previously registered schema" in {
    serde.register[CompositeRecord] should be(4)
  }

  it should "reject incompatible schema registration" in {

    val thrown = intercept[RuntimeException]{
      val v3schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Record\",\"namespace\":\"io.amient.affinity.kafka\",\"fields\":[{\"name\":\"data\",\"type\":\"string\"}]}")
      serde.register[CompositeRecord](v3schema)
    }
    thrown.getMessage should include("incompatible")

  }

  it should "register topic subject when fqn subject is already registered" in {
    val data = SimpleRecord()
    //fqn should be already registered
    serde.getRuntimeSchema(classOf[SimpleRecord].getName) should be((2, data.getSchema))
    //now simulate what KafkaAvroSerde would do
    val (schemaId, objSchema) = serde.from(data, "topic-simple")
    schemaId should be(2)
    objSchema should be(data.getSchema)
    //and check the additional subject was registered with the same schema
    serde.register("topic-simple", data.getSchema) should be(2)
  }
} 
Example 55
Source File: AvroSchemaSpec.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.avro

import io.amient.affinity.avro.record.AvroRecord
import org.apache.avro.Schema
import org.scalatest.{FlatSpec, Matchers}

object Status extends Enumeration {
  type Status = Value
  val OK, FAILED = Value
}

case class Referenced(
       A: Status.Value, B: Status.Value, C: Map[String, Status.Value], D: List[Status.Value], E: Option[Status.Value]) extends AvroRecord

class AvroSchemaSpec extends FlatSpec with Matchers {

  "AvroRecord" should "not fail when referencing the same type in a single schema" in {
    val schemaJson = AvroRecord.inferSchema[Referenced].toString(false)
    println(schemaJson)
    new Schema.Parser().parse(schemaJson)
  }

  "AvroRecord" should "1" in {
    new Schema.Parser().parse(
      """{"type":"record","namespace":"com.trustelevate.vpc.domain","name":"Parent","fields":[{"name":"pid","type":"long"},{"name":"registered","default":false,"type":"boolean"},{"name":"consents","default":{},"type":{"type":"map","values":{"type":"record","name":"Consent","fields":[{"name":"username","type":"string"},{"name":"contact","type":{"type":"record","name":"CredentialKey","fields":[{"name":"kind","type":{"type":"enum","name":"CredentialType","symbols":["FIRST_NAME","LAST_NAME","EMAIL","DOB","ADDRESS","PARENT","PHONE"]}},{"name":"value","type":"string"}]}},{"name":"service","type":"string"},{"name":"consentAge","default":0,"type":"int"},{"name":"status","default":"PENDING","type":{"type":"enum","name":"ConsentStatus","symbols":["NOT_REQUIRED","PENDING","APPROVED","REJECTED"]}},{"name":"requestedUTC","type":"long"},{"name":"updatedUTC","default":-1,"type":"long"},{"name":"child","default":0,"type":"long"},{"name":"verification","default":"UNKNOWN","type":{"type":"enum","name":"VerificationStatus","symbols":["UNKNOWN","CONFIRMED","VERIFIED","FAILED"]}},{"name":"requestToken","default":"","type":"string"}]}}},{"name":"children","default":{},"type":{"type":"map","values":{"type":"record","name":"Child","fields":[{"name":"pii","type":{"type":"array","items":"CredentialKey"}},{"name":"verification","type":"VerificationStatus"},{"name":"verificationTimestamp","default":-1,"type":"long"}]}}},{"name":"password","default":null,"type":["null","string"]},{"name":"action","default":"NONE","type":{"type":"enum","name":"UserAction","symbols":["CREATE_PASSWORD","RESET_PASSWORD","NONE"]}}]}"""
    )
  }

} 
Example 56
Source File: ZookeeperSchemaRegistrySpec.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.avro

import io.amient.affinity.avro.ZookeeperSchemaRegistry.ZkAvroConf
import io.amient.affinity.avro.record.AvroSerde
import io.amient.affinity.avro.record.AvroSerde.AvroConf
import io.amient.affinity.kafka.EmbeddedZooKeeper
import org.apache.avro.{Schema, SchemaValidationException}
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.JavaConverters._

class ZookeeperSchemaRegistrySpec extends FlatSpec with Matchers with EmbeddedZooKeeper {

  behavior of "ZkAvroRegistry"

  val v1schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Record_Current\",\"namespace\":\"io.amient.affinity.avro\",\"fields\":[{\"name\":\"items\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"SimpleRecord\",\"fields\":[{\"name\":\"id\",\"type\":{\"type\":\"record\",\"name\":\"SimpleKey\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]},\"default\":{\"id\":0}},{\"name\":\"side\",\"type\":{\"type\":\"enum\",\"name\":\"SimpleEnum\",\"symbols\":[\"A\",\"B\",\"C\"]},\"default\":\"A\"},{\"name\":\"seq\",\"type\":{\"type\":\"array\",\"items\":\"SimpleKey\"},\"default\":[]}]}},\"default\":[]},{\"name\":\"removed\",\"type\":\"int\",\"default\":0}]}")
  val v3schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Record_Current\",\"namespace\":\"io.amient.affinity.avro\",\"fields\":[{\"name\":\"items\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"SimpleRecord\",\"fields\":[{\"name\":\"id\",\"type\":{\"type\":\"record\",\"name\":\"SimpleKey\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]},\"default\":{\"id\":0}},{\"name\":\"side\",\"type\":{\"type\":\"enum\",\"name\":\"SimpleEnum\",\"symbols\":[\"A\",\"B\",\"C\"]},\"default\":\"A\"},{\"name\":\"seq\",\"type\":{\"type\":\"array\",\"items\":\"SimpleKey\"},\"default\":[]}]}},\"default\":[]},{\"name\":\"index\",\"type\":{\"type\":\"map\",\"values\":\"SimpleRecord\"},\"default\":{}}]}")

  val conf = AvroConf(Map(
    AvroConf.Class.path -> classOf[ZookeeperSchemaRegistry].getName,
    ZkAvroConf(AvroConf).ZooKeeper.Connect.path -> zkConnect
  ).asJava)

  val serde = AvroSerde.create(conf)
  serde.register[SimpleKey]
  serde.register[SimpleRecord]
  val backwardSchemaId = serde.register[Record_Current](v1schema)
  val currentSchemaId = serde.register[Record_Current]
  val forwardSchemaId = serde.register[Record_Current](v3schema)

  it should "work in a backward-compatibility scenario" in {
    val oldValue = Record_V1(Seq(SimpleRecord(SimpleKey(1), SimpleEnum.C)), 10)
    val oldBytes = serde.write(oldValue, v1schema, backwardSchemaId)
    oldBytes.mkString(",") should be("0,0,0,0,2,2,2,4,0,0,20")
    val upgraded = serde.fromBytes(oldBytes)
    upgraded should be(Record_Current(Seq(SimpleRecord(SimpleKey(1), SimpleEnum.C)), Map()))
  }

  it should "work in a forward-compatibility scenario" in {
    val forwardValue = Record_V3(Seq(SimpleRecord(SimpleKey(1), SimpleEnum.A)), Map("X" -> SimpleRecord(SimpleKey(1), SimpleEnum.A)))
    val forwardBytes = serde.write(forwardValue, v3schema, forwardSchemaId)
    val downgraded = serde.fromBytes(forwardBytes)
    downgraded should be(Record_Current(Seq(SimpleRecord(SimpleKey(1), SimpleEnum.A)), Map("X" -> SimpleRecord(SimpleKey(1), SimpleEnum.A)), Set()))
  }

  it should "reject incompatible schema registration" in {
    val v4schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Record\",\"namespace\":\"io.amient.affinity.avro\",\"fields\":[{\"name\":\"data\",\"type\":\"string\"}]}")
    an[SchemaValidationException] should be thrownBy {
      serde.register[Record_Current](v4schema)
    }
  }

} 
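The registries in the surrounding examples delegate their compatibility checks to Avro's own schema-validation API. As a rough, self-contained sketch of that underlying mechanism (the two record schemas below are made up and are not part of the affinity project):

import org.apache.avro.{Schema, SchemaValidationException, SchemaValidatorBuilder}

import scala.collection.JavaConverters._

object SchemaValidatorSketch extends App {
  val v1 = new Schema.Parser().parse(
    """{"type":"record","name":"Rec","fields":[{"name":"id","type":"int"}]}""")
  val v2 = new Schema.Parser().parse(
    """{"type":"record","name":"Rec","fields":[{"name":"id","type":"string"}]}""")

  // require that a newly registered schema can read data written with every existing version
  val validator = new SchemaValidatorBuilder().canReadStrategy().validateAll()

  try {
    validator.validate(v2, List(v1).asJava) // int -> string is not a permitted promotion, so this throws
    println("compatible")
  } catch {
    case e: SchemaValidationException => println(s"rejected as incompatible: ${e.getMessage}")
  }
}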
Example 57
Source File: LocalSchemaRegistry.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.avro

import java.nio.file.{Files, Path}

import com.typesafe.config.Config
import io.amient.affinity.avro.LocalSchemaRegistry.LocalAvroConf
import io.amient.affinity.avro.record.AvroSerde
import io.amient.affinity.avro.record.AvroSerde.AvroConf
import io.amient.affinity.core.config.CfgStruct
import org.apache.avro.Schema

import scala.collection.JavaConverters._
import scala.io.Source


object LocalSchemaRegistry {

  object LocalAvroConf extends LocalAvroConf {
    override def apply(config: Config) = new LocalAvroConf().apply(config)
  }

  class LocalAvroConf extends CfgStruct[LocalAvroConf](classOf[AvroConf]) {
    val DataPath = filepath("schema.registry.path", true).doc("local file path under which schemas will be stored")
  }

}


class LocalSchemaRegistry(dataPath: Path) extends AvroSerde with AvroSchemaRegistry {

  def this(_conf: AvroConf) = this(LocalAvroConf(_conf).DataPath())

  def checkDataPath(): Unit = {
    require(dataPath != null, s"${LocalAvroConf.DataPath.path} is not defined")
    if (!Files.exists(dataPath)) Files.createDirectories(dataPath)
  }

  override def close() = ()

  
  override protected def registerSchema(subject: String, schema: Schema): Int = hypersynchronized {
    checkDataPath()
    val s = dataPath.resolve(s"$subject.dat")
    val versions: Map[Schema, Int] = if (Files.exists(s)) {
      Source.fromFile(s.toFile).mkString.split(",").toList.map(_.toInt).map {
        case id => getSchema(id) -> id
      }.toMap
    } else {
      Map.empty
    }
    versions.get(schema).getOrElse {
      validator.validate(schema, versions.map(_._1).asJava)
      val id = (0 until Int.MaxValue).find(i => !Files.exists(dataPath.resolve(s"$i.avsc"))).max
      val schemaPath = dataPath.resolve(s"$id.avsc")
      Files.createFile(schemaPath)
      Files.write(schemaPath, schema.toString(true).getBytes("UTF-8"))
      val updatedVersions = versions + (schema -> id)
      Files.write(s, updatedVersions.values.mkString(",").getBytes("UTF-8"))
      id
    }
  }

  private def hypersynchronized[X](func: => X) = synchronized {

    checkDataPath()
    val file = dataPath.resolve(".lock").toFile

    def getLock(countDown: Int = 30): Unit = {
      if (!file.createNewFile()) if (countDown > 0) {
        Thread.sleep(1000)
        getLock(countDown - 1)
      } else throw new java.nio.file.FileAlreadyExistsException("atomic createNewFile failed")
    }

    getLock()
    try {
      func
    } finally {
      file.delete()
    }
  }

} 
Example 58
Source File: MemorySchemaRegistry.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.avro

import java.util.concurrent.ConcurrentHashMap

import com.typesafe.config.Config
import io.amient.affinity.avro.MemorySchemaRegistry.MemAvroConf
import io.amient.affinity.avro.record.AvroSerde
import io.amient.affinity.avro.record.AvroSerde.AvroConf
import io.amient.affinity.core.config.CfgStruct
import org.apache.avro.{Schema, SchemaValidator}

import scala.collection.JavaConverters._

object MemorySchemaRegistry {

  object MemAvroConf extends MemAvroConf {
    override def apply(config: Config) = new MemAvroConf().apply(config)
  }
  class MemAvroConf extends CfgStruct[MemAvroConf](classOf[AvroConf]) {
    val ID = integer("schema.registry.id", false)
      .doc("multiple instances with the same id will share the schemas registered by any of them")
  }

  val multiverse = new ConcurrentHashMap[Int, Universe]()

  def createUniverse(reuse: Option[Int] = None): Universe = synchronized {
    reuse match {
      case Some(id) if multiverse.containsKey(id) => multiverse.get(id)
      case Some(id) =>
        val universe = new Universe(id)
        multiverse.asScala += id -> universe
        universe
      case None =>
        val id = (if (multiverse.isEmpty) 1 else multiverse.asScala.keys.max + 1)
        val universe = new Universe(id)
        multiverse.asScala += id -> universe
        universe
    }
  }

  class Universe(val id: Int) {
    val schemas = new ConcurrentHashMap[Int, Schema]()
    val subjects = new ConcurrentHashMap[String, List[Int]]()

    def getOrRegister(schema: Schema): Int = synchronized {
      schemas.asScala.find(_._2 == schema) match {
        case None =>
          val newId = schemas.size
          schemas.put(newId, schema)
          newId
        case Some((id, _)) => id
      }
    }

    def updateSubject(subject: String, schemaId: Int, validator: SchemaValidator): Unit = synchronized {
      val existing = Option(subjects.get(subject)).getOrElse(List())
      validator.validate(schemas.get(schemaId), existing.map(id => schemas.get(id)).asJava)
      if (!existing.contains(schemaId)) {
        subjects.put(subject, (existing :+ schemaId))
      }
    }
  }

}

class MemorySchemaRegistry(universe: MemorySchemaRegistry.Universe) extends AvroSerde with AvroSchemaRegistry {

  def this(conf: MemAvroConf) = this(MemorySchemaRegistry.createUniverse(if (conf.ID.isDefined) Some(conf.ID()) else None))

  def this(_conf: AvroConf) = this(MemAvroConf.apply(_conf))

  def this() = this(new MemAvroConf())

  //this is for stable tests
  register[Null]("null")
  register[Boolean]("boolean")
  register[Int]("int")
  register[Long]("long")
  register[Float]("float")
  register[Double]("double")
  register[String]("string")
  register[Array[Byte]]("bytes")

  
  override protected def registerSchema(subject: String, schema: Schema): Int = {
    val id = universe.getOrRegister(schema)
    universe.updateSubject(subject, id, validator)
    id
  }

  override def close() = ()
} 
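A minimal usage sketch of the in-memory registry above, mirroring the register calls from the specs earlier in this page; SimpleNote is a hypothetical case class and is not part of the affinity sources:

import io.amient.affinity.avro.MemorySchemaRegistry
import io.amient.affinity.avro.record.AvroRecord

case class SimpleNote(id: Int, text: String) extends AvroRecord

object MemoryRegistrySketch extends App {
  val serde = new MemorySchemaRegistry()
  val fqnId = serde.register[SimpleNote]             // register under the case class's fully qualified name
  val topicId = serde.register[SimpleNote]("notes")  // and under an explicit topic subject, as in the specs above
  println(s"fqn subject id: $fqnId, topic subject id: $topicId")
}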
Example 59
Source File: ZookeeperSchemaRegistry.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.avro

import com.typesafe.config.Config
import io.amient.affinity.avro.ZookeeperSchemaRegistry.ZkAvroConf
import io.amient.affinity.avro.record.AvroSerde
import io.amient.affinity.avro.record.AvroSerde.AvroConf
import io.amient.affinity.core.config.CfgStruct
import io.amient.affinity.core.util.{ZkClients, ZkConf}
import org.I0Itec.zkclient.ZkClient
import org.I0Itec.zkclient.exception.ZkNodeExistsException
import org.apache.avro.Schema
import org.apache.zookeeper.CreateMode

import scala.collection.JavaConverters._

object ZookeeperSchemaRegistry {

  object ZkAvroConf extends ZkAvroConf {
    override def apply(config: Config) = new ZkAvroConf().apply(config)
  }

  class ZkAvroConf extends CfgStruct[ZkAvroConf](classOf[AvroConf]) {
    val ZooKeeper = struct("schema.registry.zookeeper", new ZkConf, true)
    val ZkRoot = string("schema.registry.zookeeper.root", "/affinity-schema-registry")
      .doc("znode under which schemas will be stored")
  }
  
}


class ZookeeperSchemaRegistry(zkRoot: String, zk: ZkClient) extends AvroSerde with AvroSchemaRegistry {

  def this(conf: ZkAvroConf) = this(conf.ZkRoot(), {
    val zk = ZkClients.get(conf.ZooKeeper)
    val zkRoot = conf.ZkRoot()
    if (!zk.exists(zkRoot)) zk.createPersistent(zkRoot)
    val zkSchemas = s"$zkRoot/schemas"
    if (!zk.exists(zkSchemas)) zk.createPersistent(zkSchemas)
    val zkSubjects = s"$zkRoot/subjects"
    if (!zk.exists(zkSubjects)) zk.createPersistent(zkSubjects)
    zk
  })

  def this(_conf: AvroConf) = this {
    new ZkAvroConf().apply(_conf)
  }

  override def close(): Unit = ZkClients.close(zk)

  
  override protected def registerSchema(subject: String, schema: Schema): Int = hypersynchronized {
    val zkSubject = s"$zkRoot/subjects/$subject"
    val zkSchemas = s"$zkRoot/schemas"
    val versions: Map[Schema, Int] =
      if (!zk.exists(zkSubject)) Map.empty else {
        zk.readData[String](zkSubject) match {
          case some => some.split(",").toList.map(_.toInt).map {
            case id => getSchema(id) -> id
          }.toMap
        }
      }
    versions.get(schema).getOrElse {
      validator.validate(schema, versions.map(_._1).asJava)
      val schemaPath = zk.create(s"$zkSchemas/", schema.toString(true), CreateMode.PERSISTENT_SEQUENTIAL)
      val id = schemaPath.substring(zkSchemas.length + 1).toInt
      val updatedVersions = versions.map(_._2).toList :+ id
      if (zk.exists(zkSubject)) {
        zk.writeData(zkSubject, updatedVersions.mkString(","))
      } else {
        zk.create(zkSubject, updatedVersions.mkString(","), CreateMode.PERSISTENT)
      }
      id
    }
  }


  private def hypersynchronized[X](f: => X): X = synchronized {
    val lockPath = zkRoot + "/lock"
    var acquired = 0
    do {
      try {
        zk.createEphemeral(lockPath)
        acquired = 1
      } catch {
        case _: ZkNodeExistsException =>
          acquired -= 1
          if (acquired < -100) {
            throw new IllegalStateException("Could not acquire zk registry lock")
          } else {
            Thread.sleep(500)
          }
      }
    } while (acquired != 1)
    try f finally zk.delete(lockPath)
  }

} 
Example 60
Source File: JsonToAvroConverter.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.json

import java.util

import com.sksamuel.avro4s._
import org.apache.avro.Schema


class JsonToAvroConverter(namespace: String,
                          avroStringTypeIsString: Boolean = false,
                          jsonFieldMapper: FieldMapper = DefaultFieldMapper) {

  import org.json4s._
  import org.json4s.native.JsonMethods._

  import scala.collection.JavaConverters._

  def convert(name: String, str: String): Schema = {
    convert(name, parse(str).transformField {
      case JField(n, v) =>
        val newName = toCamelCase(n, jsonFieldMapper)
        (newName, v)
    })
  }

  def convert(name: String, value: JValue): Schema = value match {
    case JArray(list) if list.isEmpty => Schema.create(Schema.Type.NULL)
    case JArray(list) => Schema.createArray(convert(name, list.head))
    case JBool(_) => Schema.create(Schema.Type.BOOLEAN)
    case JDecimal(_) => Schema.create(Schema.Type.DOUBLE)
    case JDouble(_) => Schema.create(Schema.Type.DOUBLE)
    case JInt(_) => Schema.create(Schema.Type.LONG)
    case JLong(_) => Schema.create(Schema.Type.LONG)
    case JNothing => Schema.create(Schema.Type.NULL)
    case JNull => Schema.createUnion(util.Arrays.asList(Schema.create(Schema.Type.NULL), createStringSchema))
    case JString(_) => createStringSchema
    case JSet(value) => Schema.createArray(convert(name, value.head))
    case JObject(values) =>
      val record = Schema.createRecord(name, null, namespace, false)
      val doc: String = null
      val default: AnyRef = null
      val fields = values.map { case (k, v) => new Schema.Field(k, convert(k, v), doc, default) }
      record.setFields(fields.asJava)
      record
  }

  private def createStringSchema = {
    val schema = Schema.create(Schema.Type.STRING)
    if (avroStringTypeIsString) schema.addProp("avro.java.string", "String")
    schema
  }

  private def toCamelCase(s: String, from: FieldMapper): String = {
    def fromDelimited(sep: String, s: String): String = {
      val head :: tail = s.split(sep).toList
      head ++ tail.foldLeft("")((acc, word) => acc ++ word.capitalize)

    }

    def decapitalize(s: String): String = {
      if (s.nonEmpty) s.head.toLower.toString + s.tail else s
    }

    from match {
      case DefaultFieldMapper => s
      case PascalCase => decapitalize(s)
      case SnakeCase => fromDelimited("_", s)
      case LispCase => fromDelimited("-", s)
    }
  }

} 
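A minimal sketch of how the converter above might be used to infer an Avro schema from a JSON sample; the namespace and the payload are made up for illustration:

import com.sksamuel.avro4s.json.JsonToAvroConverter

object JsonToAvroSketch extends App {
  val converter = new JsonToAvroConverter(namespace = "com.example")
  val schema = converter.convert("Person", """{"name": "Jane", "age": 42, "tags": ["scala", "avro"]}""")
  // prints a record named Person with a string field, a long field and an array-of-string field
  println(schema.toString(true))
}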
Example 61
Source File: IndexWithCompleteDocument.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.generic.GenericRecord
import org.apache.avro.{Schema, SchemaBuilder}

case class IndexWithCompleteDocument(uuid: String, document: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "document" => document
    case _ => throw new IllegalArgumentException
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => document
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithCompleteDocument.schema

  // Specifically don't implement CsvGenerator.csv since it is guaranteed to be invalid CSV - force use of Parquet.
}

object IndexWithCompleteDocument extends ObjectExtractor[IndexWithCompleteDocument] {

  val schema: Schema = SchemaBuilder
    .record("IndexWithCompleteDocument").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("document").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-complete-document")

  def includeFields: String = s""""_source": "*""""

  def extractFromJson(hit: JsonNode): IndexWithCompleteDocument =
    IndexWithCompleteDocument(
      uuid = hit.findValue("_id").asText,
      document = hit.findValue("_source").toString)
} 
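A minimal sketch (values made up) showing that the case class above doubles as a GenericRecord, so it can be handed directly to Avro or Parquet writers that expect one:

import cmwell.analytics.data.IndexWithCompleteDocument

object IndexRecordSketch extends App {
  val doc = IndexWithCompleteDocument(uuid = "0123abcd", document = """{"system":{}}""")
  println(doc.getSchema.getFullName) // cmwell.analytics.IndexWithCompleteDocument
  println(doc.get("uuid"))           // 0123abcd
  println(doc.get(1))                // the raw document string
}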
Example 62
Source File: IndexWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder}
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.LogManager
import org.joda.time.format.ISODateTimeFormat

import scala.util.control.NonFatal


case class IndexWithKeyFields(uuid: String,
                              lastModified: java.sql.Timestamp,
                              path: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "lastModified" => java.lang.Long.valueOf(lastModified.getTime)
    case "path" => path
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => java.lang.Long.valueOf(lastModified.getTime)
    case 2 => path
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithSystemFields.schema

  override def csv: String =
    (if (uuid == null) "" else uuid) + "," +
      (if (lastModified == null) "" else ISODateTimeFormat.dateTime.print(lastModified.getTime)) + "," +
      (if (path == null) "" else path)
}

object IndexWithKeyFields extends ObjectExtractor[IndexWithKeyFields] {

  private val logger = LogManager.getLogger(IndexWithSystemFields.getClass)

  // AVRO-2065 - doesn't allow union over logical type, so we can't make timestamp column nullable.
  val timestampMilliType: Schema = LogicalTypes.timestampMillis.addToSchema(Schema.create(Schema.Type.LONG))

  val schema: Schema = SchemaBuilder
    .record("IndexWithSystemFields").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("lastModified").`type`(timestampMilliType).noDefault
    .name("path").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-uuid-lastModified-path")

  def includeFields: String = {
    // Note that 'quad' is not included in this list
    val fields = "uuid,lastModified,path"
      .split(",")
      .map(name => s""""system.$name"""")
      .mkString(",")

    s""""_source": [$fields]"""
  }

  def extractFromJson(hit: JsonNode): IndexWithKeyFields = {

    val system = hit.findValue("_source").findValue("system")

    def extractString(name: String): String = system.findValue(name) match {
      case x: JsonNode => x.asText
      case _ => null
    }

    // Extract date values as java.sql.Timestamp - a java.sql.Date might be better
    def extractDate(name: String): java.sql.Timestamp = system.findValue(name) match {
      case x: JsonNode =>
        try {
          new java.sql.Timestamp(ISODateTimeFormat.dateTime.parseDateTime(x.asText).getMillis)
        }
        catch {
          case NonFatal(ex) =>
            logger.warn(s"Failed conversion of date value: $x", ex)
            throw ex
        }
      case _ => null
    }

    IndexWithKeyFields(
      uuid = extractString("uuid"),
      lastModified = extractDate("lastModified"),
      path = extractString("path"))
  }
} 
Example 63
Source File: Job.scala    From spark-avro-compactor   with Apache License 2.0 5 votes vote down vote up
package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)

    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for file systems with the s3 prefix the default block size is 64MB and can be overridden by fs.s3.block.size
    // for file systems with the s3a prefix the default block size is 32MB and can be overridden by setting fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // Where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | splitSize: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
} 
Example 64
Source File: AvroToParquetWriter.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // lazy initialization
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")
  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }

    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }
    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }

} 
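A rough sketch of driving the writer above with a hand-built GenericRecord; the schema literal and the local file paths are made up for illustration:

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import yamrcraft.etlite.writers.AvroToParquetWriter

object ParquetWriterSketch extends App {
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Event","fields":[{"name":"id","type":"string"}]}""")

  val record = new GenericData.Record(schema)
  record.put("id", "evt-1")

  val writer = new AvroToParquetWriter("file:///tmp/event-temp", "file:///tmp/event-out")
  writer.write(record) // lazily creates the underlying AvroParquetWriter on the first write
  writer.commit()      // closes the writer and moves the temp file into place
}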
Example 65
Source File: JsonToAvroTransformer.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.transformers

import com.typesafe.config.Config
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import play.api.libs.json.Json
import yamrcraft.etlite.utils.ConfigConversions._
import yamrcraft.etlite.utils.{FileUtils, JsonAvroConverter, TimeUtils}
import yamrcraft.etlite.{ErrorType, EtlException}

class JsonToAvroTransformer(config: Config) extends Transformer[Message[GenericRecord]] {

  val converter = new JsonAvroConverter()

  // config settings
  val timestampField = config.getString("timestamp-field")
  val timestampFieldFormat = config.getString("timestamp-field-format")
  val defaultSchemaFileName = config.getString("default-schema-file")
  val (schemaSelectionField, schemas) = {
    if (config.hasPath("schema-selection")) {
      (Some(config.getString("schema-selection.field")),
        Some(config.getConfig("schema-selection.schemas").asMap.map { case (k, v) => (k, createSchema(v)) }))
    } else {
      (None, None)
    }
  }

  val defaultSchema: Schema = createSchema(defaultSchemaFileName)

  @throws(classOf[EtlException])
  override def transform(inbound: InboundMessage): Message[GenericRecord] = {

    try {
      val schema = getSchema(inbound.msg)
      val record = converter.convertToGenericDataRecord(inbound.msg, schema)

      Message[GenericRecord](
        record,
        schema.getName,
        extractTimestamp(record)
      )

    } catch {
      case e: EtlException => throw e
      case e: Exception => throw new EtlException(ErrorType.TransformationError, e)
    }
  }

  private def createSchema(path: String): Schema = new Schema.Parser().parse(FileUtils.readContent(path))

  private def getSchema(msg: Array[Byte]): Schema = {
    if (schemaSelectionField.isEmpty) {
      defaultSchema
    } else {
      val msgAsString = new String(msg, "UTF8")
      val msgJson = Json.parse(msgAsString)
      val selectionValue = (msgJson \ schemaSelectionField.get).asOpt[String]
      schemas.get.getOrElse(selectionValue.get, defaultSchema)
    }
  }

  @throws(classOf[EtlException])
  private def extractTimestamp(event: GenericRecord): Long = {
    try {
      (event.get(timestampField): Any) match {
        case ts: Long => ts.asInstanceOf[Long]
        case ts: String => TimeUtils.stringTimeToLong(ts, timestampFieldFormat)
        case _ => throw new RuntimeException("timestamp field is not of either Long or String types.")
      }
    } catch {
      case e: Exception => throw new EtlException(ErrorType.PartitionTimestampError, e)
    }
  }
} 
Example 66
Source File: AvroSchemaHelper.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import org.apache.avro.Schema
import org.apache.avro.Schema.Type
import org.apache.avro.Schema.Type._

import scala.collection.JavaConverters._

object AvroSchemaHelper {

  def resolveNullableType(avroType: Schema, nullable: Boolean): Schema = {
    if (nullable && avroType.getType != NULL) {
      // avro uses union to represent nullable type.
      val fields = avroType.getTypes.asScala
      assert(fields.length == 2)
      val actualType = fields.filter(_.getType != Type.NULL)
      assert(actualType.length == 1)
      actualType.head
    } else {
      avroType
    }
  }
} 
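A minimal sketch of the helper above: unwrapping the non-null branch of a ["null","string"] union built with SchemaBuilder:

import com.memsql.spark.AvroSchemaHelper
import org.apache.avro.{Schema, SchemaBuilder}

object ResolveNullableSketch extends App {
  val nullableString: Schema = SchemaBuilder.unionOf().nullType().and().stringType().endUnion()
  val resolved = AvroSchemaHelper.resolveNullableType(nullableString, nullable = true)
  println(resolved.getType) // STRING
}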
Example 67
Source File: AvroDecoder.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import java.util.Arrays.copyOfRange

import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}


sealed trait AvroDecoder[T] extends Decoder[T] {

  def props: VerifiableProperties

  protected val schema = new Schema.Parser().parse(props.getString(Avro.SCHEMA))
  protected val skipBytes = props.getInt(Avro.SKIP_BYTES, 0)

  protected val reader = new GenericDatumReader[GenericRecord](schema)
  protected val decoder = Avro.recordDecoder(reader)

  private def skip(bytes: Array[Byte], size: Int): Array[Byte] = {
    val length = bytes.length
    length - size match {
      case remaining if remaining > 0 => copyOfRange(bytes, size, length)
      case _ => new Array[Byte](0)
    }
  }

  def parse(bytes: Array[Byte]): GenericRecord = {
    val data = if (skipBytes == 0) bytes else skip(bytes, skipBytes)
    decoder(data)
  }
}

class AvroRecordDecoder(val props: VerifiableProperties) extends AvroDecoder[GenericRecord] {
  override def fromBytes(bytes: Array[Byte]): GenericRecord = parse(bytes)
}

class AvroMapDecoder(val props: VerifiableProperties) extends AvroDecoder[Map[String, Any]] {
  override def fromBytes(bytes: Array[Byte]): Map[String, Any] = Avro.toMap(parse(bytes))
}

class AvroJsonDecoder(val props: VerifiableProperties) extends AvroDecoder[String] {
  override def fromBytes(bytes: Array[Byte]): String = Avro.toJson(parse(bytes))
} 
Example 68
Source File: AvroTypeSpec.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {
  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)
  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
} 
Example 69
Source File: AvroType.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import shapeless._

import scala.reflect.runtime.universe._

class AvroType[A] extends Serializable {
  def fromGenericRecord[L <: HList](
    m: GenericRecord
  )(implicit gen: LabelledGeneric.Aux[A, L], fromL: FromAvroRecord[L]): Option[A] =
    fromL(Right(m)).map(gen.from)
  def toGenericRecord[L <: HList](
    a: A
  )(implicit gen: LabelledGeneric.Aux[A, L], toL: ToAvroRecord[L], tt: TypeTag[A]): GenericRecord =
    toL(gen.to(a)).left.get.build(AvroSchema[A])
}

object AvroType {
  def apply[A: TypeTag]: AvroType[A] = new AvroType[A]

  def at[V: TypeTag](
    schemaType: Schema.Type
  )(fromFn: Any => V, toFn: V => Any): BaseAvroMappableType[V] = {
    AvroSchema.register(implicitly[TypeTag[V]].tpe, schemaType)
    new BaseAvroMappableType[V] {
      override def from(value: Any): V = fromFn(value)
      override def to(value: V): Any = toFn(value)
    }
  }
} 
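A minimal round-trip sketch of the AvroType defined above, mirroring the usage in the AvroTypeSpec example; the User case class is made up:

import shapeless.datatype.avro.AvroType

case class User(name: String, age: Int)

object AvroTypeSketch extends App {
  val at = AvroType[User]
  val record = at.toGenericRecord(User("jane", 30))
  val restored = at.fromGenericRecord(record)
  println(restored) // Some(User(jane,30))
}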
Example 70
Source File: AvroSchema.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import org.apache.avro.Schema.Field
import org.apache.avro.{JsonProperties, Schema}

import scala.collection.JavaConverters._
import scala.reflect.runtime.universe._

object AvroSchema {
  private def isField(s: Symbol): Boolean =
    s.isPublic && s.isMethod && !s.isSynthetic && !s.isConstructor

  private def isCaseClass(tpe: Type): Boolean =
    !tpe.toString.startsWith("scala.") &&
      List(typeOf[Product], typeOf[Serializable], typeOf[Equals])
        .forall(b => tpe.baseClasses.contains(b.typeSymbol))

  private def toSchema(tpe: Type): (Schema, Any) = tpe match {
    case t if t =:= typeOf[Boolean]     => (Schema.create(Schema.Type.BOOLEAN), null)
    case t if t =:= typeOf[Int]         => (Schema.create(Schema.Type.INT), null)
    case t if t =:= typeOf[Long]        => (Schema.create(Schema.Type.LONG), null)
    case t if t =:= typeOf[Float]       => (Schema.create(Schema.Type.FLOAT), null)
    case t if t =:= typeOf[Double]      => (Schema.create(Schema.Type.DOUBLE), null)
    case t if t =:= typeOf[String]      => (Schema.create(Schema.Type.STRING), null)
    case t if t =:= typeOf[Array[Byte]] => (Schema.create(Schema.Type.BYTES), null)

    case t if t.erasure =:= typeOf[Option[_]].erasure =>
      val s = toSchema(t.typeArgs.head)._1
      (Schema.createUnion(Schema.create(Schema.Type.NULL), s), JsonProperties.NULL_VALUE)
    case t if t.erasure <:< typeOf[Traversable[_]].erasure || t.erasure <:< typeOf[Array[_]] =>
      val s = toSchema(t.typeArgs.head)._1
      (Schema.createArray(s), java.util.Collections.emptyList())

    case t if isCaseClass(t) =>
      val fields: List[Field] = t.decls.filter(isField).map(toField).toList
      val name = t.typeSymbol.name.toString
      val pkg = t.typeSymbol.owner.fullName
      (Schema.createRecord(name, null, pkg, false, fields.asJava), null)

    case t if customTypes.contains(t.toString) => (Schema.create(customTypes(t.toString)), null)
  }

  private def toField(s: Symbol): Field = {
    val name = s.name.toString
    val tpe = s.asMethod.returnType
    val (schema, default) = toSchema(tpe)
    new Field(name, schema, null, default)
  }

  private val customTypes = scala.collection.mutable.Map[String, Schema.Type]()
  private val cachedSchemas = scala.collection.concurrent.TrieMap.empty[TypeTag[_], Schema]

  private[avro] def register(tpe: Type, schemaType: Schema.Type): Unit =
    customTypes += tpe.toString -> schemaType

  def apply[T: TypeTag]: Schema = {
    val tt = implicitly[TypeTag[T]]
    cachedSchemas.getOrElseUpdate(tt, toSchema(tt.tpe)._1)
  }
} 
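A minimal sketch of deriving a Schema with the object above for a hypothetical case class, including an Option field which becomes a ["null", ...] union:

import shapeless.datatype.avro.AvroSchema

case class LogLine(host: String, status: Int, message: Option[String])

object AvroSchemaSketch extends App {
  val schema = AvroSchema[LogLine]
  // a record named LogLine with a string field, an int field and a ["null","string"] field
  println(schema.toString(true))
}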
Example 71
Source File: EnumSchemaCompatibilityTest.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.schema

import com.sksamuel.avro4s.{AvroName, AvroSchema, ScalaEnumSchemaFor, SchemaFor}
import org.apache.avro.{Schema, SchemaCompatibility}
import org.apache.avro.SchemaCompatibility.SchemaCompatibilityType
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


class EnumSchemaCompatibilityTest extends AnyWordSpec with Matchers {

  @AvroName("Colours")
  object Colours1 extends Enumeration {
    val Red, Amber, Green = Value
  }

  @AvroName("Colours")
  object Colours2 extends Enumeration {
    val Red, Amber, Green, Orange = Value
  }

  "An enum schema that does not contain a default enum value" should {

    val schemaVersion1: Schema = AvroSchema[Colours1.Value]
    val schemaVersion2: Schema = AvroSchema[Colours2.Value]

    "not be backwards compatible when a new enum value is added" in {

      val compatibilityType = SchemaCompatibility.checkReaderWriterCompatibility(
        schemaVersion1,
        schemaVersion2
      ).getType

      compatibilityType shouldEqual SchemaCompatibilityType.INCOMPATIBLE
    }

    "be forwards compatible even when a new enum value is added" in {

      val compatibilityType = SchemaCompatibility.checkReaderWriterCompatibility(
        schemaVersion2,
        schemaVersion1
      ).getType

      compatibilityType shouldEqual SchemaCompatibilityType.COMPATIBLE
    }
  }

  "an enum schema that contains a default enum value" should {

    // define the enum schemas with a default value
    implicit val schemaForColour1: SchemaFor[Colours1.Value] = ScalaEnumSchemaFor[Colours1.Value](Colours1.Amber)
    implicit val schemaForColour2: SchemaFor[Colours2.Value] = ScalaEnumSchemaFor[Colours2.Value](Colours2.Amber)

    val schemaVersion1: Schema = AvroSchema[Colours1.Value]
    val schemaVersion2: Schema = AvroSchema[Colours2.Value]

    "be backwards compatible when a new enum value is added" in {

      val compatibilityType = SchemaCompatibility.checkReaderWriterCompatibility(
        schemaVersion1,
        schemaVersion2
      ).getType

      compatibilityType shouldEqual SchemaCompatibilityType.COMPATIBLE
    }

    "be forwards compatible when a new enum value is added" in {

      val compatibilityType = SchemaCompatibility.checkReaderWriterCompatibility(
        schemaVersion2,
        schemaVersion1
      ).getType

      compatibilityType shouldEqual SchemaCompatibilityType.COMPATIBLE
    }
  }
} 
Example 72
Source File: twitter_schema.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.miguno.avro.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.miguno.avro.{twitter_schema => Jtwitter_schema}


final case class twitter_schema(username: String, tweet: String, timestamp: Long) extends AvroSerializeable {
  type J = Jtwitter_schema
  override def toAvro: Jtwitter_schema = {
    new Jtwitter_schema(username, tweet, timestamp)
  }
}

object twitter_schema {
  implicit def reader = new AvroReader[twitter_schema] {
    override type J = Jtwitter_schema
  }
  implicit val metadata: AvroMetadata[twitter_schema, Jtwitter_schema] = new AvroMetadata[twitter_schema, Jtwitter_schema] {
    override val avroClass: Class[Jtwitter_schema] = classOf[Jtwitter_schema]
    override val schema: Schema = Jtwitter_schema.getClassSchema()
    override val fromAvro: (Jtwitter_schema) => twitter_schema = {
      (j: Jtwitter_schema) => twitter_schema(j.getUsername.toString, j.getTweet.toString, j.getTimestamp.toLong)
    }
  }
} 
Example 73
Source File: Serdes.scala    From tamer   with MIT License 5 votes vote down vote up
package tamer

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer

import com.sksamuel.avro4s._
import org.apache.avro.Schema
import tamer.registry._
import zio.{RIO, Task}
import zio.kafka.client.serde.{Deserializer, Serializer}

sealed trait Serde[A] extends Any {
  def isKey: Boolean
  def schema: Schema
  def deserializer: Deserializer[Registry with Topic, A]
  def serializer: Serializer[Registry with Topic, A]
  final def serde: ZSerde[Registry with Topic, A] = ZSerde(deserializer)(serializer)
}

object Serde {
  private[this] final val Magic: Byte = 0x0
  private[this] final val intByteSize = 4

  final def apply[A <: Product: Decoder: Encoder: SchemaFor](isKey: Boolean = false) =
    new RecordSerde[A](isKey, SchemaFor[A].schema(DefaultFieldMapper))

  final class RecordSerde[A: Decoder: Encoder](override final val isKey: Boolean, override final val schema: Schema) extends Serde[A] {
    private[this] def subject(topic: String): String = s"$topic-${if (isKey) "key" else "value"}"
    override final val deserializer: Deserializer[Registry with Topic, A] = Deserializer.byteArray.mapM { ba =>
      val buffer = ByteBuffer.wrap(ba)
      if (buffer.get() != Magic) RIO.fail(SerializationError("Unknown magic byte!"))
      else {
        val id = buffer.getInt()
        for {
          env <- RIO.environment[Registry]
          _   <- env.registry.verifySchema(id, schema)
          res <- RIO.fromTry {
            val length  = buffer.limit() - 1 - intByteSize
            val payload = new Array[Byte](length)
            buffer.get(payload, 0, length)
            AvroInputStream.binary[A].from(payload).build(schema).tryIterator.next
          }
        } yield res
      }
    }
    override final val serializer: Serializer[Registry with Topic, A] = Serializer.byteArray.contramapM { a =>
      for {
        env <- RIO.environment[Registry with Topic]
        id  <- env.registry.getOrRegisterId(subject(env.topic), schema)
        arr <- Task {
          val baos = new ByteArrayOutputStream
          baos.write(Magic.toInt)
          baos.write(ByteBuffer.allocate(intByteSize).putInt(id).array())
          val ser = AvroOutputStream.binary[A].to(baos).build(schema)
          ser.write(a)
          ser.close()
          baos.toByteArray
        }
      } yield arr
    }
  }
} 
Example 74
Source File: AvroUtil.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.streamlets.avro

import scala.util.{ Failure, Success, Try }

import scala.reflect.ClassTag
import scala.reflect._
import org.apache.avro.specific.SpecificRecordBase
import org.apache.avro.Schema
import cloudflow.streamlets._

object AvroUtil {
  val Format = "avro"

  def makeSchema[T <: SpecificRecordBase: ClassTag]: Schema =
    Try(classTag[T].runtimeClass.getDeclaredMethod("SCHEMA$")) match {
      case Success(schema) ⇒ schema.invoke(null).asInstanceOf[Schema]
      case Failure(_) ⇒ {
        Try(classTag[T].runtimeClass.getDeclaredField("SCHEMA$")) match {
          case Success(schema) ⇒ schema.get(null).asInstanceOf[Schema]
          case Failure(ex)     ⇒ throw new RuntimeException(s"Error fetching avro schema for class ${classTag[T].runtimeClass}", ex)
        }
      }
    }
  def fingerprintSha256(schema: Schema): String = {
    import java.util.Base64

    import org.apache.avro.SchemaNormalization._

    Base64
      .getEncoder()
      .encodeToString(parsingFingerprint("SHA-256", schema))
  }

  def createSchemaDefinition(schema: Schema) = SchemaDefinition(
    name = schema.getFullName,
    schema = schema.toString(false),
    fingerprint = fingerprintSha256(schema),
    format = Format
  )
} 
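A minimal sketch of the fingerprint helpers above applied to an arbitrary parsed schema; makeSchema[T] itself needs an Avro-generated SpecificRecordBase class, which is omitted here, and the schema literal is made up:

import cloudflow.streamlets.avro.AvroUtil
import org.apache.avro.Schema

object AvroUtilSketch extends App {
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Metric","namespace":"com.example","fields":[{"name":"value","type":"double"}]}""")

  println(AvroUtil.fingerprintSha256(schema))           // base64-encoded SHA-256 parsing fingerprint
  println(AvroUtil.createSchemaDefinition(schema).name) // com.example.Metric
}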
Example 75
Source File: AvroCodec.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.streamlets.avro

import scala.util.{ Failure, Try }

import com.twitter.bijection.Injection
import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecordBase
import com.twitter.bijection.avro.SpecificAvroCodecs

import cloudflow.streamlets._

class AvroCodec[T <: SpecificRecordBase](avroSchema: Schema) extends Codec[T] {

  val recordInjection: Injection[T, Array[Byte]] = SpecificAvroCodecs.toBinary(avroSchema)
  val avroSerde                                  = new AvroSerde(recordInjection)

  def encode(value: T): Array[Byte] = avroSerde.encode(value)
  def decode(bytes: Array[Byte]): T = avroSerde.decode(bytes)
  def schema: Schema                = avroSchema
}

private[avro] class AvroSerde[T <: SpecificRecordBase](injection: Injection[T, Array[Byte]]) extends Serializable {
  val inverted: Array[Byte] ⇒ Try[T] = injection.invert _

  def encode(value: T): Array[Byte] = injection(value)

  // TODO fix up the exception, maybe pass through the input
  def decode(bytes: Array[Byte]): T =
    Try(inverted(bytes).get).recoverWith {
      case t ⇒
        Failure(DecodeException("Could not decode.", t))
    }.get
} 
Example 76
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 77
Source File: GenericAvroSerializerSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 78
Source File: SchemaRegistryImpl.scala    From kafka4s   with Apache License 2.0 5 votes vote down vote up
package com.banno.kafka.schemaregistry

import scala.collection.compat._
import org.apache.avro.Schema
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import cats.effect.Sync
import scala.jdk.CollectionConverters._

case class SchemaRegistryImpl[F[_]](c: SchemaRegistryClient)(implicit F: Sync[F])
    extends SchemaRegistryApi[F] {
  import SchemaRegistryApi._

  def getAllSubjects: F[Iterable[String]] = F.delay(c.getAllSubjects().asScala)
  def getById(id: Int): F[Schema] = F.delay(c.getById(id))
  def getBySubjectAndId(subject: String, id: Int): F[Schema] =
    F.delay(c.getBySubjectAndId(subject, id))
  def getCompatibility(subject: String): F[SchemaRegistryApi.CompatibilityLevel] =
    F.delay(CompatibilityLevel.unsafeFromString(c.getCompatibility(subject)))
  def getLatestSchemaMetadata(subject: String): F[SchemaMetadata] =
    F.delay(c.getLatestSchemaMetadata(subject))
  def getSchemaMetadata(subject: String, version: Int): F[SchemaMetadata] =
    F.delay(c.getSchemaMetadata(subject, version))
  def getVersion(subject: String, schema: Schema): F[Int] = F.delay(c.getVersion(subject, schema))
  def register(subject: String, schema: Schema): F[Int] = F.delay(c.register(subject, schema))
  def testCompatibility(subject: String, schema: Schema): F[Boolean] =
    F.delay(c.testCompatibility(subject, schema))
  def updateCompatibility(subject: String, compatibility: CompatibilityLevel): F[String] =
    F.delay(c.updateCompatibility(subject, compatibility.asString))
} 
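A rough sketch of exercising the wrapper above without a live registry, assuming cats-effect 2 style IO and Confluent's MockSchemaRegistryClient are on the classpath; the schema literal and the subject name are made up:

import cats.effect.IO
import com.banno.kafka.schemaregistry.SchemaRegistryImpl
import io.confluent.kafka.schemaregistry.client.MockSchemaRegistryClient
import org.apache.avro.Schema

object SchemaRegistrySketch extends App {
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Customer","fields":[{"name":"id","type":"long"}]}""")

  val registry = SchemaRegistryImpl[IO](new MockSchemaRegistryClient())

  val program = for {
    id       <- registry.register("customers-value", schema)
    subjects <- registry.getAllSubjects
  } yield (id, subjects)

  println(program.unsafeRunSync()) // e.g. (1, List(customers-value)) with the mock client
}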
Example 79
Source File: SchemaRegistryOps.scala    From kafka4s   with Apache License 2.0 5 votes vote down vote up
package com.banno.kafka.schemaregistry

import scala.collection.compat._
import org.apache.avro.Schema
import com.sksamuel.avro4s.{DefaultFieldMapper, SchemaFor}
import cats.FlatMap
import cats.implicits._

case class SchemaRegistryOps[F[_]](registry: SchemaRegistryApi[F]) {

  def keySubject(topic: String): String = topic + "-key"
  def valueSubject(topic: String): String = topic + "-value"

  def register[A](subject: String)(implicit SF: SchemaFor[A]): F[Int] =
    registry.register(subject, SF.schema(DefaultFieldMapper))

  def registerKey[K: SchemaFor](topic: String): F[Int] =
    register[K](keySubject(topic))

  def registerValue[V: SchemaFor](topic: String): F[Int] =
    register[V](valueSubject(topic))

  def register[K: SchemaFor, V: SchemaFor](topic: String)(implicit F: FlatMap[F]): F[(Int, Int)] =
    for {
      k <- registerKey[K](topic)
      v <- registerValue[V](topic)
    } yield (k, v)

  def isCompatible(subject: String, schema: Schema): F[Boolean] =
    registry.testCompatibility(subject, schema)

  def isCompatible[A](subject: String)(implicit SF: SchemaFor[A]): F[Boolean] =
    isCompatible(subject, SF.schema(DefaultFieldMapper))

  def isKeyCompatible[K: SchemaFor](topic: String): F[Boolean] =
    isCompatible[K](keySubject(topic))

  def isValueCompatible[V: SchemaFor](topic: String): F[Boolean] =
    isCompatible[V](valueSubject(topic))

  def isCompatible[K: SchemaFor, V: SchemaFor](
      topic: String
  )(implicit F: FlatMap[F]): F[(Boolean, Boolean)] =
    for {
      k <- isKeyCompatible[K](topic)
      v <- isValueCompatible[V](topic)
    } yield (k, v)

} 
Example 80
Source File: GenericAvroSerializerSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 81
Source File: package.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package benchmarks

import benchmarks.record.AttributeValue
import benchmarks.record.AttributeValue.{Empty, Invalid, Valid}
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData

import scala.collection.JavaConverters._
import scala.reflect.runtime.universe.{TypeTag, typeOf}

package object handrolled_codecs {

  final class AttributeValueCodec[T: Encoder: Decoder](val schemaForValid: SchemaFor[Valid[T]])
      extends Codec[AttributeValue[T]] { codec =>

    def schemaFor: SchemaFor[AttributeValue[T]] = {
      implicit val sfv: SchemaFor[Valid[T]] = schemaForValid
      SchemaFor[AttributeValue[T]]
    }

    val validEncoder = Encoder[Valid[T]].withSchema(schemaForValid)
    val emptyEncoder = Encoder[Empty]
    val invalidEncoder = Encoder[Invalid]

    def encode(t: AttributeValue[T]): AnyRef = t match {
      case v: Valid[T] => validEncoder.encode(v)
      case e: Empty    => emptyEncoder.encode(e)
      case i: Invalid  => invalidEncoder.encode(i)
    }

    val validDecoder = Decoder[Valid[T]].withSchema(schemaForValid)
    val emptyDecoder = Decoder[Empty]
    val invalidDecoder = Decoder[Invalid]

    val validSn: String = validDecoder.schema.getFullName
    val emptySn: String = emptyDecoder.schema.getFullName
    val invalidSn: String = invalidDecoder.schema.getFullName

    def decode(value: Any): AttributeValue[T] = {
      val schema = value match {
        case r: GenericData.Record => r.getSchema
        case i: ImmutableRecord    => i.schema
      }
      schema.getFullName match {
        case `validSn`   => validDecoder.decode(value)
        case `emptySn`   => emptyDecoder.decode(value)
        case `invalidSn` => invalidDecoder.decode(value)
      }
    }
  }

  def buildSchemaForValid[T: SchemaFor: TypeTag]: SchemaFor[Valid[T]] = {
    val sf = SchemaFor[Valid[T]]
    val name: String = typeOf[T].typeSymbol.name.toString
    val s = sf.schema
    val fields = s.getFields.asScala.map(f => new Schema.Field(f.name, f.schema, f.doc, f.defaultVal)).asJava
    SchemaFor(Schema.createRecord(s"Valid$name", s.getDoc, s.getNamespace, s.isError, fields), sf.fieldMapper)
  }

  object AttributeValueCodec {
    def apply[T: Encoder: Decoder: SchemaFor: TypeTag]: AttributeValueCodec[T] = {
      implicit val schemaForValid: SchemaFor[Valid[T]] = buildSchemaForValid
      new AttributeValueCodec[T](schemaForValid)
    }
  }
} 
Example 82
Source File: RecursiveSchemaTest.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.schema

import com.sksamuel.avro4s.Recursive.{Branch, MutRec1}
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class RecursiveSchemaTest extends AnyWordSpec with Matchers {

  "SchemaFor" should {
    "support recursive types with sealed traits" in {
      AvroSchema[Recursive.Tree[Int]] shouldBe expectedSchema("/recursive_tree.json")
    }

    "support mutually recursive types" in {
      AvroSchema[MutRec1] shouldBe expectedSchema("/mutually_recursive.json")
    }

    "support recursive types with lists" in {
      AvroSchema[Recursive.ListTree[Int]] shouldBe expectedSchema("/recursive_list.json")
    }

    "support recursive types with maps" in {
      AvroSchema[Recursive.MapTree[Int]] shouldBe expectedSchema("/recursive_map.json")
    }

    "support recursive types with option" in {
      AvroSchema[Recursive.OptionTree[Int]] shouldBe expectedSchema("/recursive_option.json")
    }

    "support recursive types with either" in {
      AvroSchema[Recursive.EitherTree[Int]] shouldBe expectedSchema("/recursive_either.json")
    }

    "support recursive types with shapeless coproduct" in {
      AvroSchema[Recursive.CoproductTree[Int]] shouldBe expectedSchema("/recursive_coproduct.json")
    }

    "support recursive types with tuples and value types" in {
      AvroSchema[Recursive.TVTree[Int]] shouldBe expectedSchema("/recursive_tuple_value_type.json")
    }

    "support custom definitions" in {
      import scala.collection.JavaConverters._
      implicit def sf: SchemaFor[Recursive.Branch[Int]] =
        new ResolvableSchemaFor[Recursive.Branch[Int]] {
          val tree = SchemaFor[Recursive.Tree[Int]]
          def schemaFor(env: DefinitionEnvironment[SchemaFor], update: SchemaUpdate): SchemaFor[Branch[Int]] =
            env.get[Recursive.Branch[Int]].getOrElse {

              val record: SchemaFor[Recursive.Branch[Int]] =
                SchemaFor(Schema.createRecord("CustomBranch", "custom schema", "custom", false))
              val nextEnv = env.updated(record)
              val treeSchema = tree.resolveSchemaFor(nextEnv, update).schema
              val fields = Seq(new Schema.Field("left", treeSchema), new Schema.Field("right", treeSchema))
              record.schema.setFields(fields.asJava)
              record
            }
        }

      val schema = sf.resolveSchemaFor().schema

      schema shouldBe expectedSchema("/recursive_custom.json")
    }
  }

  def expectedSchema(name: String) =
    new org.apache.avro.Schema.Parser().parse(getClass.getResourceAsStream(name))

} 
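As a quick illustration of the derivation exercised by the test above, avro4s can derive a schema for a hand-rolled recursive ADT directly; the IntTree type below is an assumption for illustration, not the Recursive fixture used in the test.

import com.sksamuel.avro4s.AvroSchema
import org.apache.avro.Schema

sealed trait IntTree
case class Leaf(value: Int) extends IntTree
case class Node(left: IntTree, right: IntTree) extends IntTree

// The derived schema is a union of the Leaf and Node records, with Node
// referring back to that union recursively.
val treeSchema: Schema = AvroSchema[IntTree]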
Example 83
Source File: AvroRecordFieldExtractorMapFn.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase.avro

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.avro.Schema
import org.apache.avro.Schema.Type

import scala.collection.JavaConverters._


object AvroRecordFieldExtractorMapFn {

  def apply(schema: Schema, fields: Seq[String]): Map[String, (Any) => Array[Byte]] = {
    fields.map { fn =>
      val f = schema.getField(fn)
      if (f == null) {
        throw new IllegalArgumentException(s"$fn does not exist in the given schema.")
      }
      fn -> getFunc(f.schema())
    }.toMap
  }

  private def getFunc(schema: Schema): (Any) => Array[Byte] = {
    val `type` = schema.getType.getName

    `type`.toUpperCase() match {
      case "BOOLEAN" => (v: Any) => if (v == null) null else v.fromBoolean()
      case "BYTES" => (v: Any) => if (v == null) null else v.asInstanceOf[Array[Byte]]
      case "DOUBLE" => (v: Any) => if (v == null) null else v.fromDouble()
      case "FLOAT" => (v: Any) => if (v == null) null else v.fromFloat()
      case "INT" => (v: Any) => if (v == null) null else v.fromInt()
      case "LONG" => (v: Any) => if (v == null) null else v.fromLong()
      case "STRING" => (v: Any) => if (v == null) null else v.fromString()
      case "UNION" =>
        schema.getTypes.asScala.collectFirst {
          case s if s.getType != Type.NULL => getFunc(s)
        }.getOrElse(throw new IllegalArgumentException(s"$schema is not supported."))
      case _ =>
        throw new IllegalArgumentException(s"${schema.getType.name()} is not supported")
    }
  }
} 
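A hypothetical usage sketch for the extractor factory above; the Customer schema and field values are made up, and the byte conversions come from the project's BytesHelper.

import org.apache.avro.SchemaBuilder
import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn

val customerSchema = SchemaBuilder.record("Customer").fields()
  .requiredString("id")
  .requiredInt("age")
  .endRecord()

// One (Any) => Array[Byte] extractor per requested field.
val extractors = AvroRecordFieldExtractorMapFn(customerSchema, Seq("id", "age"))
val idBytes: Array[Byte] = extractors("id")("customer-1")
val ageBytes: Array[Byte] = extractors("age")(42)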
Example 84
Source File: BasicEncoderTest.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BasicEncoderTest extends AnyWordSpec with Matchers {

  "Encoder" should {
    "encode strings as UTF8" in {
      case class Foo(s: String)
      val schema = AvroSchema[Foo]
      val record = Encoder[Foo].encode(Foo("hello"))
      record shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
    "encode strings as GenericFixed and pad bytes when schema is fixed" in {
      case class Foo(s: String)

      val fixedSchema = SchemaFor[String](Schema.createFixed("FixedString", null, null, 7))
      implicit val fixedStringEncoder: Encoder[String] = Encoder.StringEncoder.withSchema(fixedSchema)

      val record = Encoder[Foo].encode(Foo("hello")).asInstanceOf[GenericRecord]
      record.get("s").asInstanceOf[GenericFixed].bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
      // the fixed should have the right size
      record.get("s").asInstanceOf[GenericFixed].bytes().length shouldBe 7
    }
    "encode longs" in {
      case class Foo(l: Long)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123456L)) shouldBe ImmutableRecord(schema, Vector(java.lang.Long.valueOf(123456L)))
    }
    "encode doubles" in {
      case class Foo(d: Double)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435)) shouldBe ImmutableRecord(schema, Vector(java.lang.Double.valueOf(123.435D)))
    }
    "encode booleans" in {
      case class Foo(d: Boolean)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(true)) shouldBe ImmutableRecord(schema, Vector(java.lang.Boolean.valueOf(true)))
    }
    "encode floats" in {
      case class Foo(d: Float)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435F)) shouldBe ImmutableRecord(schema, Vector(java.lang.Float.valueOf(123.435F)))
    }
    "encode ints" in {
      case class Foo(i: Int)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123)) shouldBe ImmutableRecord(schema, Vector(java.lang.Integer.valueOf(123)))
    }
    "support uppercase packages" in {
      val schema = AvroSchema[ClassInUppercasePackage]
      val t = com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage("hello")
      schema.getFullName shouldBe "com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage"
      Encoder[ClassInUppercasePackage].encode(t) shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
  }
} 
Example 85
Source File: Github284.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.github

import com.sksamuel.avro4s.{Record, RecordFormat}
import org.apache.avro.specific.SpecificRecordBase
import org.apache.avro.{AvroRuntimeException, Schema}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

case class Street(var name: String) extends SpecificRecordBase {
  def this() = this("")

  override def get(i: Int): AnyRef = i match {
    case 0 => name
    case _ => throw new AvroRuntimeException("Bad index")
  }

  override def put(index: Int, value: scala.Any): Unit =
    index match {
      case 0 =>
        name = value.asInstanceOf[String]
      case _ =>
        throw new AvroRuntimeException("Bad index")
    }

  override def getSchema: Schema = Street.SCHEMA$
}

object Street {
  val SCHEMA$ =
    (new Schema.Parser).parse("""
                                |{
                                | "type": "record",
                                | "namespace": "com.sksamuel.avro4s.github",
                                | "name": "Street",
                                | "fields": [
                                |     {"name": "name", "type": "string"}
                                | ]
                                |}
                              """.stripMargin)
}

final class Github284 extends AnyWordSpec with Matchers {
  "SchemaFor" should {
    "convert case class to a Record and convert it back to original case class" in {

      val street: Street = Street(name = "street name")

      val streetAsRecord: Record = RecordFormat[Street].to(street)

      val decodedStreet: Street = RecordFormat[Street].from(streetAsRecord)

      streetAsRecord shouldBe a [Record]

      decodedStreet shouldBe a [Street]

      decodedStreet shouldBe street
    }
  }
} 
Example 86
Source File: CustomDefaults.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import magnolia.{SealedTrait, Subtype}
import org.json4s.native.JsonMethods.parse
import org.json4s.native.Serialization.write
import org.apache.avro.Schema
import org.apache.avro.Schema.Type
import org.json4s.DefaultFormats

import scala.collection.JavaConverters._

sealed trait CustomDefault
case class CustomUnionDefault(className: String, values: java.util.Map[String, Any]) extends CustomDefault
case class CustomUnionWithEnumDefault(parentName: String, default: String, value: String) extends CustomDefault
case class CustomEnumDefault(value: String) extends CustomDefault

object CustomDefaults {

  implicit val formats = DefaultFormats

  def customScalaEnumDefault(value: Any) = CustomEnumDefault(value.toString)

  def customDefault(p: Product, schema: Schema): CustomDefault =
    if(isEnum(p, schema.getType))
      CustomEnumDefault(trimmedClassName(p))
    else {
      if(isUnionOfEnum(schema)) {
        val enumType = schema.getTypes.asScala.filter(_.getType == Schema.Type.ENUM).head
        CustomUnionWithEnumDefault(enumType.getName, trimmedClassName(p), p.toString)
      } else
        CustomUnionDefault(trimmedClassName(p), parse(write(p)).extract[Map[String, Any]].map {
          case (name, b: BigInt) if b.isValidInt => name -> b.intValue
          case (name, b: BigInt) if b.isValidLong => name -> b.longValue
          case (name, z) if schema.getType == Type.UNION => name ->
            schema.getTypes.asScala.find(_.getName == trimmedClassName(p)).map(_.getField(name).schema())
              .map(DefaultResolver(z, _)).getOrElse(z)
          case (name, z) => name -> DefaultResolver(z, schema.getField(name).schema())

        }.asJava)
    }

  def isUnionOfEnum(schema: Schema) = schema.getType == Schema.Type.UNION && schema.getTypes.asScala.map(_.getType).contains(Schema.Type.ENUM)

  def sealedTraitEnumDefaultValue[T](ctx: SealedTrait[SchemaFor, T]) = {
    val defaultExtractor = new AnnotationExtractors(ctx.annotations)
    defaultExtractor.enumDefault.flatMap { default =>
      ctx.subtypes.flatMap { st: Subtype[SchemaFor, T] =>
        if(st.typeName.short == default.toString)
          Option(st.typeName.short)
        else
          None
      }.headOption
    }
  }

  def isScalaEnumeration(value: Any) = value.getClass.getCanonicalName == "scala.Enumeration.Val"

  private def isEnum(product: Product, schemaType: Schema.Type) =
    product.productArity == 0 && schemaType == Schema.Type.ENUM

  private def trimmedClassName(p: Product) = trimDollar(p.getClass.getSimpleName)

  private def trimDollar(s: String) = if(s.endsWith("$")) s.dropRight(1) else s
} 
Example 87
Source File: AvroSchemaMerge.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import org.apache.avro.{JsonProperties, Schema}
import org.apache.avro.Schema.Field

object AvroSchemaMerge {

  import scala.collection.JavaConverters._

  def apply(name: String, namespace: String, schemas: List[Schema]): Schema = {
    require(schemas.forall(_.getType == Schema.Type.RECORD), "Can only merge records")

    val doc = schemas.flatMap(x => Option(x.getDoc)).mkString("; ")

    val fields = schemas.flatMap(_.getFields.asScala).groupBy(_.name).map { case (name, fields) =>

      val doc = fields.flatMap(x => Option(x.doc)).mkString("; ")
      val default = fields.find(_.defaultVal != null).map(_.defaultVal).orNull

      // if we have two schemas with the same type, then just keep the first one
      val union = {
        val schemas = fields
          .map(_.schema)
          .flatMap(schema => schema.getType match {
            case Schema.Type.UNION => schema.getTypes.asScala
            case _ => Seq(schema)
          })
          .filter(_.getType != Schema.Type.NULL)
          .groupBy(_.getType)
          .map(_._2.head)
          .toList
          .sortBy(_.getName)

        // if default value was not specified or equal to JsonProperties.NULL_VALUE then null schema should be the first in union
        Schema.createUnion({
          if (default == null || default == JsonProperties.NULL_VALUE) {
            (Schema.create(Schema.Type.NULL) :: schemas).asJava
          } else {
            (schemas :+ Schema.create(Schema.Type.NULL)).asJava
          }
        })
      }

      new Field(name, union, if (doc.isEmpty) null else doc, default)
    }

    val schema = Schema.createRecord(name, if (doc.isEmpty) null else doc, namespace, false)
    schema.setFields(fields.toList.asJava)
    schema
  }
} 
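A short usage sketch for the merge above; the two input schemas are built here with SchemaBuilder purely for illustration.

import org.apache.avro.{Schema, SchemaBuilder}
import com.sksamuel.avro4s.AvroSchemaMerge

val a: Schema = SchemaBuilder.record("A").namespace("example").fields()
  .requiredString("id").endRecord()
val b: Schema = SchemaBuilder.record("B").namespace("example").fields()
  .requiredString("id").optionalInt("age").endRecord()

// "Merged" contains the union of the fields of A and B ("id" and "age"),
// each wrapped in a nullable union as per the merge rules above.
val merged: Schema = AvroSchemaMerge("Merged", "example", List(a, b))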
Example 88
Source File: DefaultResolver.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.nio.ByteBuffer
import java.util.UUID

import org.apache.avro.LogicalTypes.Decimal
import org.apache.avro.generic.{GenericEnumSymbol, GenericFixed}
import org.apache.avro.util.Utf8
import org.apache.avro.{Conversions, Schema}
import CustomDefaults._
import scala.collection.JavaConverters._


object DefaultResolver {

  def apply(value: Any, schema: Schema): AnyRef = value match {
    case Some(x) => apply(x, schema)
    case u: Utf8 => u.toString
    case uuid: UUID => uuid.toString
    case enum: GenericEnumSymbol[_] => enum.toString
    case fixed: GenericFixed => fixed.bytes()
    case bd: BigDecimal => bd.toString()
    case byteBuffer: ByteBuffer if schema.getLogicalType.isInstanceOf[Decimal] =>
      val decimalConversion = new Conversions.DecimalConversion
      val bd = decimalConversion.fromBytes(byteBuffer, schema, schema.getLogicalType)
      java.lang.Double.valueOf(bd.doubleValue)
    case byteBuffer: ByteBuffer => byteBuffer.array()
    case x: scala.Long => java.lang.Long.valueOf(x)
    case x: scala.Boolean => java.lang.Boolean.valueOf(x)
    case x: scala.Int => java.lang.Integer.valueOf(x)
    case x: scala.Double => java.lang.Double.valueOf(x)
    case x: scala.Float => java.lang.Float.valueOf(x)
    case x: Map[_,_] => x.asJava
    case x: Seq[_] => x.asJava
    case shapeless.Inl(x) => apply(x, schema)
    case p: Product => customDefault(p, schema)
    case v if isScalaEnumeration(v) => customScalaEnumDefault(value)
    case _ =>
      value.asInstanceOf[AnyRef]
  }

} 
Example 89
Source File: AvroDataOutputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.OutputStream

import org.apache.avro.Schema
import org.apache.avro.file.{CodecFactory, DataFileWriter}
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}


case class AvroDataOutputStream[T](os: OutputStream,
                                   codec: CodecFactory)
                                  (implicit encoder: Encoder[T]) extends AvroOutputStream[T] {

  val resolved = encoder.resolveEncoder()

  val (writer, writeFn) = resolved.schema.getType match {
    case Schema.Type.DOUBLE | Schema.Type.LONG | Schema.Type.BOOLEAN | Schema.Type.STRING | Schema.Type.INT | Schema.Type.FLOAT =>
      val datumWriter = new GenericDatumWriter[T](resolved.schema)
      val dataFileWriter = new DataFileWriter[T](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => dataFileWriter.append(t))
    case _ =>
      val datumWriter = new GenericDatumWriter[GenericRecord](resolved.schema)
      val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => {
        val record = resolved.encode(t).asInstanceOf[GenericRecord]
        dataFileWriter.append(record)
      })
  }

  override def close(): Unit = {
    flush()
    writer.close()
  }

  override def write(t: T): Unit = {
    writeFn(t)
  }

  override def flush(): Unit = writer.flush()
  override def fSync(): Unit = writer.fSync()
} 
Example 90
Source File: DefaultAwareDatumReader.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import org.apache.avro.generic.GenericDatumReader
import org.apache.avro.io.ResolvingDecoder
import org.apache.avro.{AvroTypeException, Schema}

class DefaultAwareDatumReader[T](writer: Schema, reader: Schema)
  extends GenericDatumReader[T](writer, reader, new DefaultAwareGenericData) {
  override def readField(r: scala.Any,
                         f: Schema.Field,
                         oldDatum: scala.Any,
                         in: ResolvingDecoder,
                         state: scala.Any): Unit = {
    try {
      super.readField(r, f, oldDatum, in, state)
    } catch {
      case t: AvroTypeException =>
        if (f.defaultVal == null) throw t else getData.setField(r, f.name, f.pos, f.defaultVal)
    }
  }
}

object DefaultAwareDatumReader {
  def apply[T](writerSchema: Schema): DefaultAwareDatumReader[T] = new DefaultAwareDatumReader[T](writerSchema, writerSchema)
} 
Example 91
Source File: AvroDataInputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.InputStream

import org.apache.avro.Schema
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.io.DatumReader

import scala.util.Try

class AvroDataInputStream[T](in: InputStream,
                             writerSchema: Option[Schema])
                            (implicit decoder: Decoder[T]) extends AvroInputStream[T] {

  val resolved = decoder.resolveDecoder()

  // if no reader or writer schema is specified, then we create a reader that uses what's present in the files
  private val datumReader = writerSchema match {
    case Some(writer) => GenericData.get.createDatumReader(writer, resolved.schema)
    case None => GenericData.get.createDatumReader(null, resolved.schema)
  }

  private val dataFileReader = new DataFileStream[GenericRecord](in, datumReader.asInstanceOf[DatumReader[GenericRecord]])

  override def iterator: Iterator[T] = new Iterator[T] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): T = {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def tryIterator: Iterator[Try[T]] = new Iterator[Try[T]] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): Try[T] = Try {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def close(): Unit = in.close()
} 
Example 92
Source File: GenericSerde.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.kafka

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{AvroFormat, AvroInputStream, AvroOutputStream, AvroSchema, BinaryFormat, DataFormat, Decoder, Encoder, JsonFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.kafka.common.serialization.{Deserializer, Serde, Serializer}


class GenericSerde[T >: Null : SchemaFor : Encoder : Decoder](avroFormat: AvroFormat = BinaryFormat) extends Serde[T]
  with Deserializer[T]
  with Serializer[T]
  with Serializable {

  val schema: Schema = AvroSchema[T]

  override def serializer(): Serializer[T] = this

  override def deserializer(): Deserializer[T] = this

  override def deserialize(topic: String, data: Array[Byte]): T = {
    if (data == null) null else {

      val avroInputStream = avroFormat match {
        case BinaryFormat => AvroInputStream.binary[T]
        case JsonFormat => AvroInputStream.json[T]
        case DataFormat => AvroInputStream.data[T]
      }

      val input = avroInputStream.from(data).build(schema)
      val result = input.iterator.next()
      input.close()
      result
    }
  }

  override def close(): Unit = ()

  override def configure(configs: java.util.Map[String, _], isKey: Boolean): Unit = ()

  override def serialize(topic: String, data: T): Array[Byte] = {
    val baos = new ByteArrayOutputStream()

    val avroOutputStream = avroFormat match {
      case BinaryFormat => AvroOutputStream.binary[T]
      case JsonFormat => AvroOutputStream.json[T]
      case DataFormat => AvroOutputStream.data[T]
    }

    val output = avroOutputStream.to(baos).build()
    output.write(data)
    output.close()
    baos.toByteArray
  }
} 
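A usage sketch round-tripping a value through the serde above; the Click case class and the topic name are assumptions for illustration.

import com.sksamuel.avro4s.kafka.GenericSerde

case class Click(url: String, count: Long)

val serde = new GenericSerde[Click]() // binary format by default
val bytes = serde.serializer().serialize("clicks", Click("/home", 1L))
val back = serde.deserializer().deserialize("clicks", bytes) // Click("/home", 1L)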
Example 93
Source File: RefinedTest.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.refined

import com.sksamuel.avro4s._
import eu.timepit.refined.api.Refined
import eu.timepit.refined.auto._
import eu.timepit.refined.collection.NonEmpty
import org.apache.avro.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

case class Foo(nonEmptyStr: String Refined NonEmpty)

class RefinedTest extends AnyWordSpec with Matchers {

  "refinedSchemaFor" should {
    "use the schema for the underlying type" in {
      AvroSchema[Foo] shouldBe new Schema.Parser().parse(
        """
          |{
          |	"type": "record",
          |	"name": "Foo",
          |	"namespace": "com.sksamuel.avro4s.refined",
          |	"fields": [{
          |		"name": "nonEmptyStr",
          |		"type": "string"
          |	}]
          |}
        """.stripMargin)
    }
  }

  "refinedEncoder" should {
    "use the encoder for the underlying type" in {
      val expected: String Refined NonEmpty = "foo"
      val record = ToRecord[Foo].to(Foo(expected))
      record.get("nonEmptyStr").toString shouldBe expected.value
    }
  }

  "refinedDecoder" should {
    "use the decoder for the underlying type" in {
      val expected: String Refined NonEmpty = "foo"
      val record = ImmutableRecord(AvroSchema[Foo], Vector(expected.value))
      FromRecord[Foo].from(record) shouldBe Foo(expected)
    }

    "throw when the value does not conform to the refined predicate" in {
      val record = ImmutableRecord(AvroSchema[Foo], Vector(""))
      assertThrows[IllegalArgumentException](FromRecord[Foo].from(record))
    }
  }
} 
Example 94
Source File: GenericAvroSerializerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 95
Source File: AvroSEBasicTest.scala    From akka-serialization-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.serializer.avro4s

import com.github.dnvriend.TestSpec
import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 }
import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 }
import com.sksamuel.avro4s.{ AvroSchema, RecordFormat }
import org.apache.avro.Schema
import org.apache.avro.file.SeekableByteArrayInput
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.io.DecoderFactory

// SE stands for Schema Evolution
class AvroSEBasicTest extends TestSpec {

  @Override
  def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = {
    val serveReader = new GenericDatumReader[GenericRecord](schema)
    serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  }

  val title = "Moby-Dick; or, The Whale"
  val year = 1851
  val editor = "Scala Books"

  "AvroSEBasicTest" should "deserialize old class with renamed field" in {
    // in this case, two different serializers can be used

    val obj = ChangedBookV1(title, year)
    val serializerV1 = new BookSerializerV1
    val bytes: Array[Byte] = serializerV1.toBinary(obj)
    val serializerV2 = new BookSerializerV2

    serializerV2.fromBinary(bytes) should matchPattern {
      case ChangedBookV2(`title`, `year`) ⇒
    }
  }

  it should "deserialize old class without new field" in {

    val obj = ChangedBookV2(title, year)
    val serializerV2 = new BookSerializerV2
    val bytes: Array[Byte] = serializerV2.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema2 = AvroSchema[ChangedBookV2]
    val schema3 = AvroSchema[ChangedBookV3]

    val gdr = new GenericDatumReader[GenericRecord](schema2, schema3)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV3]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV3(`title`, `year`, "") ⇒
    }

  }

  it should "deserialize old class with dropped field" in {

    val obj = ChangedBookV3(title, year, editor)
    val serializerV3 = new BookSerializerV3
    val bytes: Array[Byte] = serializerV3.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema3 = AvroSchema[ChangedBookV3]
    val schema4 = AvroSchema[ChangedBookV4]

    val gdr = new GenericDatumReader[GenericRecord](schema3, schema4)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV4]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV4(`title`, `editor`) ⇒
    }

  }

} 
Example 96
Source File: avroMarshallers.scala    From scalatest-embedded-kafka   with MIT License 5 votes vote down vote up
package net.manub.embeddedkafka.avro

import java.io.ByteArrayOutputStream

import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.io._
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecord
}
import org.apache.kafka.common.serialization.{Deserializer, Serializer}

class KafkaAvroDeserializer[T <: SpecificRecord](schema: Schema)
    extends Deserializer[T]
    with NoOpConfiguration
    with NoOpClose {

  private val reader = new SpecificDatumReader[T](schema)

  override def deserialize(topic: String, data: Array[Byte]): T = {
    val decoder = DecoderFactory.get().binaryDecoder(data, null)
    reader.read(null.asInstanceOf[T], decoder)
  }
}

class KafkaAvroSerializer[T <: SpecificRecord]()
    extends Serializer[T]
    with NoOpConfiguration
    with NoOpClose {

  private def toBytes(nullableData: T): Array[Byte] =
    Option(nullableData).fold[Array[Byte]](null) { data =>
      val writer: DatumWriter[T] = new SpecificDatumWriter[T](data.getSchema)
      val out = new ByteArrayOutputStream()
      val encoder = EncoderFactory.get.binaryEncoder(out, null)

      writer.write(data, encoder)
      encoder.flush()
      out.close()

      out.toByteArray
    }

  override def serialize(topic: String, data: T): Array[Byte] =
    toBytes(data)
}

sealed trait NoOpConfiguration {
  def configure(configs: java.util.Map[String, _], isKey: Boolean): Unit = ()
}

sealed trait NoOpClose {
  def close(): Unit = ()
} 
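A short round-trip sketch for the marshallers above, assuming the TestAvroClass SpecificRecord shown in the next example is on the classpath:

import net.manub.embeddedkafka.TestAvroClass
import net.manub.embeddedkafka.avro.{KafkaAvroDeserializer, KafkaAvroSerializer}

val serializer = new KafkaAvroSerializer[TestAvroClass]()
val deserializer = new KafkaAvroDeserializer[TestAvroClass](TestAvroClass.SCHEMA$)

val bytes = serializer.serialize("test-topic", TestAvroClass("hello"))
val roundTripped = deserializer.deserialize("test-topic", bytes) // TestAvroClass("hello")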
Example 97
Source File: TestAvroClass.scala    From scalatest-embedded-kafka   with MIT License 5 votes vote down vote up
package net.manub.embeddedkafka

import org.apache.avro.specific.SpecificRecordBase
import org.apache.avro.{AvroRuntimeException, Schema}

case class TestAvroClass(var name: String) extends SpecificRecordBase {
  def this() = this("")

  override def get(i: Int): AnyRef = i match {
    case 0 => name
    case _ => throw new AvroRuntimeException("Bad index")
  }

  override def put(i: Int, v: scala.Any): Unit = i match {
    case 0 =>
      name = v match {
        case (utf8: org.apache.avro.util.Utf8) => utf8.toString
        case _                                 => v.asInstanceOf[String]
      }
    case _ => throw new AvroRuntimeException("Bad index")
  }

  override def getSchema: Schema = TestAvroClass.SCHEMA$
}

object TestAvroClass {
  val SCHEMA$ =
    (new Schema.Parser).parse("""
      |{"namespace": "example",
      | "type": "record",
      | "namespace": "net.manub.embeddedkafka",
      | "name": "TestAvroClass",
      | "fields": [
      |     {"name": "name", "type": "string"}
      | ]
      |}
    """.stripMargin)
} 
Example 98
Source File: TestAvroClass.scala    From scalatest-embedded-kafka   with MIT License 5 votes vote down vote up
package net.manub.embeddedkafka

import org.apache.avro.specific.SpecificRecordBase
import org.apache.avro.{AvroRuntimeException, Schema}

case class TestAvroClass(var name: String) extends SpecificRecordBase {
  def this() = this("")

  override def get(i: Int): AnyRef = i match {
    case 0 => name
    case _ => throw new AvroRuntimeException("Bad index")
  }

  override def put(i: Int, v: scala.Any): Unit = i match {
    case 0 =>
      name = v match {
        case (utf8: org.apache.avro.util.Utf8) => utf8.toString
        case _                                 => v.asInstanceOf[String]
      }
    case _ => throw new AvroRuntimeException("Bad index")
  }

  override def getSchema: Schema = TestAvroClass.SCHEMA$
}

object TestAvroClass {
  val SCHEMA$ =
    (new Schema.Parser).parse("""
                                |{"namespace": "example",
                                | "type": "record",
                                | "namespace": "net.manub.embeddedkafka",
                                | "name": "TestAvroClass",
                                | "fields": [
                                |     {"name": "name", "type": "string"}
                                | ]
                                |}
                              """.stripMargin)
} 
Example 99
Source File: AvroSchemaMerge.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.avro

import com.sksamuel.exts.StringOption
import org.apache.avro.Schema

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

object AvroSchemaMerge {

  def apply(name: String, namespace: String, schemas: List[Schema]): Schema = {
    require(schemas.forall(_.getType == Schema.Type.RECORD), "Can only merge records")

    // documentations can just be a concat
    val doc = schemas.map(_.getDoc).filter(_ != null).mkString("; ")

    // simple impl to start: take all the fields from the first schema, and then add in the missing ones
    // from second 2 and so on
    val fields = new ArrayBuffer[Schema.Field]()
    schemas.foreach { schema =>
      schema.getFields.asScala.filterNot { field => fields.exists(_.name() == field.name) }.foreach { field =>
        // avro is funny about sharing fields, so need to copy it
        val copy = new Schema.Field(field.name(), field.schema(), StringOption(field.doc).orNull, field.defaultVal)
        fields.append(copy)
      }
    }

    val schema = Schema.createRecord(name, if (doc.isEmpty()) null else doc, namespace, false)
    schema.setFields(fields.result().asJava)
    schema
  }
} 
Example 100
Source File: AvroParquetRowWriter.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}


class AvroParquetRowWriter(path: Path,
                           avroSchema: Schema)(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val skipCrc = config.getBoolean("eel.parquet.skipCrc")
  logger.info(s"Parquet writer will skipCrc = $skipCrc")

  private val writer = AvroParquetWriterFn(path, avroSchema)

  def write(record: GenericRecord): Unit = {
    writer.write(record)
  }

  def close(): Unit = {
    writer.close()
    if (skipCrc) {
      val crc = new Path("." + path.toString() + ".crc")
      logger.debug("Deleting crc $crc")
      if (fs.exists(crc))
        fs.delete(crc, false)
    }
  }
} 
Example 101
Source File: AvroParquetWriterFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}


object AvroParquetWriterFn extends Logging {
  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
} 
Example 102
Source File: AvroParquetReaderFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import io.eels.Predicate
import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.hadoop.ParquetReader


object AvroParquetReaderFn {

  private val config = ParquetReaderConfig()

  def apply(path: Path,
            predicate: Option[Predicate],
            projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = {

    // The parquet reader can use a projection by setting a projected schema onto a conf object
    def configuration(): Configuration = {
      val newconf = new Configuration(conf)
      projectionSchema.foreach { it =>
        AvroReadSupport.setAvroReadSchema(newconf, it)
        AvroReadSupport.setRequestedProjection(newconf, it)
      }
      //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true")
      newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString)
      newconf
    }

    // a filter is set when we have a predicate for the read
    def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build)
      .map(FilterCompat.get)
      .getOrElse(FilterCompat.NOOP)

    AvroParquetReader.builder[GenericRecord](path)
      .withCompatibility(false)
      .withConf(configuration())
      .withFilter(filter())
      .build()
      .asInstanceOf[ParquetReader[GenericRecord]]
  }
} 
Example 103
Source File: ISODateConverter.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.convert

import java.text.SimpleDateFormat
import java.time._

import hydra.common.logging.LoggingAdapter
import org.apache.avro.{Conversion, LogicalType, Schema}

import scala.util.Try


class ISODateConverter extends Conversion[ZonedDateTime] with LoggingAdapter {

  private val utc = ZoneOffset.UTC

  override def getLogicalTypeName: String = IsoDate.IsoDateLogicalTypeName

  override def getConvertedType: Class[ZonedDateTime] = classOf[ZonedDateTime]

  private val simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

  override def fromCharSequence(
      value: CharSequence,
      schema: Schema,
      `type`: LogicalType
  ): ZonedDateTime = {
    Try(OffsetDateTime.parse(value).toInstant)
      .orElse {
        Try(LocalDateTime.parse(value).toInstant(ZoneOffset.UTC))
      }
      .orElse {
        Try(simpleDateFormat.parse(value.toString).toInstant)
      }
      .recover {
        case e: Throwable =>
          log.error(e.getMessage, e)
          Instant.EPOCH
      }
      .map(_.atZone(utc))
      .get
  }
}

object IsoDate extends LogicalType("iso-datetime") {
  val IsoDateLogicalTypeName = "iso-datetime"

  override def validate(schema: Schema): Unit = {
    if (schema.getType() != Schema.Type.STRING) {
      throw new IllegalArgumentException(
        "Iso-datetime can only be used with an underlying string type"
      )
    }
  }
} 
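A small sketch of using the converter above directly, attaching the logical type to a plain string schema (the timestamp literal is illustrative):

import org.apache.avro.SchemaBuilder
import hydra.avro.convert.{ISODateConverter, IsoDate}

val stringSchema = IsoDate.addToSchema(SchemaBuilder.builder().stringType())
val converter = new ISODateConverter

// Accepts offset, local, or "yyyy-MM-dd'T'HH:mm:ssX" formats; falls back to Instant.EPOCH on failure.
val parsed = converter.fromCharSequence("2019-01-01T12:00:00Z", stringSchema, IsoDate)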
Example 104
Source File: AvroUuid.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.convert

import org.apache.avro.{LogicalType, Schema}

object AvroUuid extends LogicalType("uuid") {
  val AvroUuidLogicalTypeName = "uuid"

  override def validate(schema: Schema): Unit = {
    if (schema.getType() != Schema.Type.STRING) {
      throw new IllegalArgumentException(
        "uui can only be used with an underlying string type"
      )
    }
  }
} 
Example 105
Source File: StringToGenericRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.convert

import java.util.UUID

import org.apache.avro.{LogicalTypes, Schema}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import cats.implicits._
import org.apache.avro.util.Utf8

import scala.util.{Failure, Success, Try}

object StringToGenericRecord {

  final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException(
    s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy"
  )

  final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException(
    s"Invalid logical type. Expected $expected but received $received"
  )

  implicit class ConvertToGenericRecord(s: String) {

    private def isUuidValid(s: String): Boolean =
      Try(UUID.fromString(s)).isSuccess

    private def checkLogicalTypes(record: GenericRecord): Try[Unit] = {
      import collection.JavaConverters._
      def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] = avroField match {
        case g: GenericRecord => g.getSchema.getFields.asScala.toList
          .traverse(f => checkAll(g.get(f.name), f.schema.some)).void
        case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) =>
          if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString))
        case _ => Success(())
      }
      val fields = record.getSchema.getFields.asScala.toList
      fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void
    }

    private def getAllPayloadFieldNames: Set[String] = {
      import spray.json._
      def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match {
        case JsObject(f) => f.flatMap { case (k: String, v: JsValue) =>
          loop(v, k.some) ++ Set(extraName.getOrElse("") + k)
        }.toSet
        case _ => Set.empty
      }
      loop(s.parseJson, None)
    }

    private def getAllSchemaFieldNames(schema: Schema): Set[String] = {
      import Schema.Type._
      import collection.JavaConverters._
      def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match {
        case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field =>
          loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name)
        }
        case _ => Set.empty
      }
      loop(schema, None)
    }

    def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try {
      if (useStrictValidation) {
        val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema)
        if (diff.nonEmpty) throw ValidationExtraFieldsError(diff)
      }
      val decoderFactory = new DecoderFactory
      val decoder = decoderFactory.jsonDecoder(schema, s)
      val reader = new GenericDatumReader[GenericRecord](schema)
      reader.read(null, decoder)
    }.flatTap(checkLogicalTypes)
  }

} 
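A brief usage sketch for the implicit conversion above; the schema and JSON payload are assumptions for illustration.

import org.apache.avro.SchemaBuilder
import hydra.avro.convert.StringToGenericRecord._

val schema = SchemaBuilder.record("Customer").fields()
  .requiredString("id").requiredBoolean("active").endRecord()

// Returns Success(record); with strict validation an extra JSON field would
// instead fail with ValidationExtraFieldsError.
val record = """{"id": "123", "active": true}""".toGenericRecord(schema, useStrictValidation = true)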
Example 106
Source File: SchemaWrapper.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.util

import org.apache.avro.Schema
import org.apache.avro.Schema.{Field, Type}

import scala.collection.mutable
import scala.util.Try


case class SchemaWrapper(schema: Schema, primaryKeys: Seq[String])

object SchemaWrapper {

  def from(schema: Schema): SchemaWrapper = {
    SchemaWrapper(schema, schemaPKs(schema))
  }

  def from(schema: Schema, primaryKeys: Seq[String]): SchemaWrapper = {
    SchemaWrapper(schema, primaryKeys)
  }

  private def schemaPKs(schema: Schema): Seq[String] = {
    Option(schema.getProp("hydra.key"))
      .map(_.replaceAll("\\s", "").split(",")) match {
      case Some(ids) => ids
      case None      => Seq.empty
    }
  }
} 
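A quick sketch showing how the "hydra.key" schema property above feeds the wrapper's primary keys (the Order schema is illustrative):

import org.apache.avro.SchemaBuilder
import hydra.avro.util.SchemaWrapper

val orderSchema = SchemaBuilder.record("Order")
  .prop("hydra.key", "id")
  .fields().requiredString("id").requiredInt("total").endRecord()

SchemaWrapper.from(orderSchema).primaryKeys // Seq("id")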
Example 107
Source File: AvroUtils.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.util

import com.pluralsight.hydra.avro.JsonToAvroConversionException
import hydra.avro.registry.JsonToAvroConversionExceptionWithMetadata
import hydra.avro.resource.SchemaResource
import org.apache.avro.Schema
import org.apache.avro.Schema.Field

import scala.collection.mutable


object AvroUtils {

  import scala.collection.JavaConverters._

  private val SEEN_EQUALS = new ThreadLocal[mutable.Set[SeenPair]] {
    override protected def initialValue(): mutable.Set[SeenPair] = mutable.Set.empty[SeenPair]
  }

  def areEqual(one: Schema, other: Schema): Boolean = {
    val seen = SEEN_EQUALS.get
    val here = SeenPair(one.hashCode(), other.hashCode())
    val equals = {
      if (seen.contains(here)) return true
      if (one eq other) return true
      if (one.getFullName != other.getFullName) return false
      one.getFields.asScala.map(_.name()).toSet == other.getFields.asScala
        .map(_.name())
        .toSet
    }

    if (equals) seen.add(here)

    equals
  }

  def improveException(ex: Throwable, schema: SchemaResource, registryUrl:String) = {
    ex match {
      case e: JsonToAvroConversionException =>
        JsonToAvroConversionExceptionWithMetadata(e, schema, registryUrl)
      case e: Exception => e
    }
  }

  private[avro] case class SeenPair private (s1: Int, s2: Int) {

    override def equals(o: Any): Boolean =
      (this.s1 == o.asInstanceOf[SeenPair].s1) && (this.s2 == o
        .asInstanceOf[SeenPair]
        .s2)

    override def hashCode: Int = s1 + s2
  }

} 
Example 108
Source File: IngestionFlow.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import com.pluralsight.hydra.avro.JsonToAvroConversionException
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.ingest.HydraRequest
import hydra.core.ingest.RequestParams.{HYDRA_KAFKA_TOPIC_PARAM, HYDRA_RECORD_KEY_PARAM}
import hydra.core.transport.{AckStrategy, ValidationStrategy}
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.producer.AvroRecord
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

final class IngestionFlow[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String
                                                                  ) {

  import IngestionFlow._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getValueSchema(topicName: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(topicName + "-value")
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(topicName)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, topicName))
      }
  }

  private def getValueSchemaWrapper(topicName: String): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    getValueSchema(topicName).map { valueSchema =>
      SchemaWrapper.from(valueSchema)
    }
  }

  def ingest(request: HydraRequest): F[Unit] = {
    request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match {
      case Some(topic) => getValueSchemaWrapper(topic).flatMap { schemaWrapper =>
        val useStrictValidation = request.validationStrategy == ValidationStrategy.Strict
        val payloadTryMaybe: Try[Option[GenericRecord]] = Option(request.payload) match {
          case Some(p) => convertToAvro(topic, schemaWrapper, useStrictValidation, p).map(avroRecord => Some(avroRecord.payload))
          case None => Success(None)
        }
        val v1Key = getV1RecordKey(schemaWrapper, payloadTryMaybe, request)
        MonadError[F, Throwable].fromTry(payloadTryMaybe).flatMap { payloadMaybe =>
          kafkaClient.publishStringKeyMessage((v1Key, payloadMaybe), topic).void
        }
      }
      case None => MonadError[F, Throwable].raiseError(MissingTopicNameException(request))
    }
  }

  private def getV1RecordKey(schemaWrapper: SchemaWrapper, payloadTryMaybe: Try[Option[GenericRecord]], request: HydraRequest): Option[String] = {
    val headerV1Key = request.metadata.get(HYDRA_RECORD_KEY_PARAM)
    val optionString = schemaWrapper.primaryKeys.toList match {
      case Nil => None
      case l => l.flatMap(pkName => payloadTryMaybe match {
        case Success(payloadMaybe) =>
          payloadMaybe.flatMap(p => Try(p.get(pkName)).toOption)
        case Failure(_) => None
      }).mkString("|").some
    }
    headerV1Key.orElse(optionString)
  }

  private def convertToAvro(topic: String, schemaWrapper: SchemaWrapper, useStrictValidation: Boolean, payloadString: String): Try[AvroRecord] = {
    Try(AvroRecord(topic, schemaWrapper.schema, None, payloadString, AckStrategy.Replicated, useStrictValidation)).recoverWith {
      case e: JsonToAvroConversionException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getMessage} [$location]"))
      case e => Failure(e)
    }
  }
}

object IngestionFlow {
  final case class MissingTopicNameException(request: HydraRequest)
    extends Exception(s"Missing the topic name in request with correlationId ${request.correlationId}")
  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 109
Source File: IngestionFlowV2.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.transport.ValidationStrategy
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.algebras.KafkaClientAlgebra.PublishResponse
import hydra.kafka.model.TopicMetadataV2Request.Subject
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Try}

final class IngestionFlowV2[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String) {

  import IngestionFlowV2._
  import hydra.avro.convert.StringToGenericRecord._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getSchema(subject: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(subject)
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(subject)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, subject))
      }
  }

  private def getSchemaWrapper(subject: Subject, isKey: Boolean): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    val suffix = if (isKey) "-key" else "-value"
    getSchema(subject.value + suffix).map { sch =>
      SchemaWrapper.from(sch)
    }
  }

  private def recover[A](subject: Subject, isKey: Boolean): PartialFunction[Throwable, Try[A]] = {
    val suffix = if (isKey) "-key" else "-value"
    val location = s"$schemaRegistryBaseUrl/subjects/${subject.value}$suffix/versions/latest/schema"
    val pf: PartialFunction[Throwable, Try[A]] = {
      case e: ValidationExtraFieldsError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: InvalidLogicalTypeError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e => Failure(e)
    }
    pf
  }

  private def getSchemas(request: V2IngestRequest, topic: Subject): F[(GenericRecord, Option[GenericRecord])] = {
    val useStrictValidation = request.validationStrategy.getOrElse(ValidationStrategy.Strict) == ValidationStrategy.Strict
    def getRecord(payload: String, schema: Schema): Try[GenericRecord] =
      payload.toGenericRecord(schema, useStrictValidation)
    for {
      kSchema <- getSchemaWrapper(topic, isKey = true)
      vSchema <- getSchemaWrapper(topic, isKey = false)
      k <- MonadError[F, Throwable].fromTry(
        getRecord(request.keyPayload, kSchema.schema).recoverWith(recover(topic, isKey = true)))
      v <- MonadError[F, Throwable].fromTry(
        request.valPayload.traverse(getRecord(_, vSchema.schema)).recoverWith(recover(topic, isKey = false)))
    } yield (k, v)
  }

  def ingest(request: V2IngestRequest, topic: Subject): F[PublishResponse] = {
    getSchemas(request, topic).flatMap { case (key, value) =>
      kafkaClient.publishMessage((key, value), topic.value).rethrow
    }
  }
}

object IngestionFlowV2 {
  final case class V2IngestRequest(keyPayload: String, valPayload: Option[String], validationStrategy: Option[ValidationStrategy])

  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 110
Source File: IngestionFlowSpec.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import cats.effect.{Concurrent, ContextShift, IO}
import hydra.avro.registry.SchemaRegistry
import hydra.core.ingest.HydraRequest
import hydra.core.ingest.RequestParams.{HYDRA_KAFKA_TOPIC_PARAM, HYDRA_RECORD_KEY_PARAM}
import hydra.ingest.services.IngestionFlow.MissingTopicNameException
import hydra.kafka.algebras.KafkaClientAlgebra
import org.apache.avro.{Schema, SchemaBuilder}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.concurrent.ExecutionContext

class IngestionFlowSpec extends AnyFlatSpec with Matchers {

  private implicit val contextShift: ContextShift[IO] = IO.contextShift(ExecutionContext.global)
  private implicit val concurrentEffect: Concurrent[IO] = IO.ioConcurrentEffect
  private implicit val mode: scalacache.Mode[IO] = scalacache.CatsEffect.modes.async

  private val testSubject: String = "test_subject"

  private val testSubjectNoKey: String = "test_subject_no_key"

  private val testKey: String = "test"

  private val testPayload: String =
    s"""{"id": "$testKey", "testField": true}"""

  private val testSchema: Schema = SchemaBuilder.record("TestRecord")
    .prop("hydra.key", "id")
    .fields().requiredString("id").requiredBoolean("testField").endRecord()

  private val testSchemaNoKey: Schema = SchemaBuilder.record("TestRecordNoKey")
    .fields().requiredString("id").requiredBoolean("testField").endRecord()

  private def ingest(request: HydraRequest): IO[KafkaClientAlgebra[IO]] = for {
    schemaRegistry <- SchemaRegistry.test[IO]
    _ <- schemaRegistry.registerSchema(testSubject + "-value", testSchema)
    _ <- schemaRegistry.registerSchema(testSubjectNoKey + "-value", testSchemaNoKey)
    kafkaClient <- KafkaClientAlgebra.test[IO]
    ingestFlow <- IO(new IngestionFlow[IO](schemaRegistry, kafkaClient, "https://schemaRegistry.notreal"))
    _ <- ingestFlow.ingest(request)
  } yield kafkaClient

  it should "ingest a message" in {
    val testRequest = HydraRequest("correlationId", testPayload, metadata = Map(HYDRA_KAFKA_TOPIC_PARAM -> testSubject))
    ingest(testRequest).flatMap { kafkaClient =>
      kafkaClient.consumeStringKeyMessages(testSubject, "test-consumer").take(1).compile.toList.map { publishedMessages =>
        val firstMessage = publishedMessages.head
        (firstMessage._1, firstMessage._2.get.toString) shouldBe (Some(testKey), testPayload)
      }
    }.unsafeRunSync()
  }

  it should "ingest a message with a null key" in {
    val testRequest = HydraRequest("correlationId", testPayload, metadata = Map(HYDRA_KAFKA_TOPIC_PARAM -> testSubjectNoKey))
    ingest(testRequest).flatMap { kafkaClient =>
      kafkaClient.consumeStringKeyMessages(testSubjectNoKey, "test-consumer").take(1).compile.toList.map { publishedMessages =>
        val firstMessage = publishedMessages.head
        (firstMessage._1, firstMessage._2.get.toString) shouldBe (None, testPayload)
      }
    }.unsafeRunSync()
  }

  it should "return an error when no topic name is provided" in {
    val testRequest = HydraRequest("correlationId", testPayload)
    ingest(testRequest).attempt.unsafeRunSync() shouldBe Left(MissingTopicNameException(testRequest))
  }

  it should "take the key from the header if present" in {
    val headerKey = "someDifferentKey"
    val testRequest = HydraRequest("correlationId", testPayload, metadata = Map(HYDRA_RECORD_KEY_PARAM -> headerKey, HYDRA_KAFKA_TOPIC_PARAM -> testSubject))
    ingest(testRequest).flatMap { kafkaClient =>
      kafkaClient.consumeStringKeyMessages(testSubject, "test-consumer").take(1).compile.toList.map { publishedMessages =>
        val firstMessage = publishedMessages.head
        (firstMessage._1, firstMessage._2.get.toString) shouldBe (Some(headerKey), testPayload)
      }
    }.unsafeRunSync()

  }

} 
Example 111
Source File: H2Dialect.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.sql

import java.sql.JDBCType

import hydra.avro.util.SchemaWrapper
import org.apache.avro.Schema
import org.apache.avro.Schema.Field
import org.apache.avro.Schema.Type._


private object H2Dialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:h2")

  override def getJDBCType(dt: Schema): Option[JdbcType] = dt.getType match {
    case STRING  => Option(JdbcType("CLOB", JDBCType.CLOB))
    case BOOLEAN => Option(JdbcType("CHAR(1)", JDBCType.CHAR))
    case ARRAY   => Option(JdbcType("ARRAY", JDBCType.ARRAY))
    case _       => None
  }

  override def getArrayType(schema: Schema) =
    Some(JdbcType("ARRAY", java.sql.JDBCType.ARRAY))

  override def buildUpsert(
      table: String,
      schema: SchemaWrapper,
      dbs: DbSyntax
  ): String = {

    val idFields = schema.primaryKeys
    val fields = schema.getFields
    val columns =
      fields.map(c => quoteIdentifier(dbs.format(c.name))).mkString(",")
    val placeholders = fields.map(_ => "?").mkString(",")
    val pk = idFields.map(i => quoteIdentifier(dbs.format(i))).mkString(",")
    s"merge into $table ($columns) key($pk) values ($placeholders);"
  }

  override def upsertFields(schema: SchemaWrapper): Seq[Field] =
    schema.getFields

  override def alterTableQueries(
      table: String,
      missingFields: Seq[Schema.Field],
      dbs: DbSyntax
  ): Seq[String] = {
    missingFields.map { f =>
      val dbDef = JdbcUtils.getJdbcType(f.schema, this).databaseTypeDefinition
      val colName = quoteIdentifier(dbs.format(f.name))
      s"alter table $table add column $colName $dbDef"
    }
  }

  override def dropNotNullConstraintQueries(
      table: String,
      schema: SchemaWrapper,
      dbs: DbSyntax
  ): Seq[String] = {
    schema.getFields.filterNot(f => schema.primaryKeys.contains(f.name)).map {
      f =>
        val colName = quoteIdentifier(dbs.format(f.name))
        s"alter table $table alter column $colName drop not null"
    }
  }
} 
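
A minimal usage sketch for the dialect above (not part of the original project). It assumes the code lives in the hydra.sql package, since H2Dialect is package-private, and that SchemaWrapper.from derives the primary key from the "hydra.key" schema property, as in the test schema of Example 110.

package hydra.sql

import hydra.avro.util.SchemaWrapper
import org.apache.avro.SchemaBuilder

object H2DialectSketch extends App {
  // Hypothetical schema; "hydra.key" is assumed to mark the primary key column.
  val schema = SchemaBuilder.record("User")
    .prop("hydra.key", "id")
    .fields().requiredString("id").requiredString("email").endRecord()

  val wrapper = SchemaWrapper.from(schema)

  // Roughly: merge into users (<id>,<email>) key(<id>) values (?,?);
  println(H2Dialect.buildUpsert("users", wrapper, UnderscoreSyntax))

  // One ALTER TABLE statement per missing field, e.g. adding a CLOB column for a string field.
  H2Dialect.alterTableQueries("users", Seq(schema.getField("email")), UnderscoreSyntax).foreach(println)
}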
Example 112
Source File: AggregatedDialect.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.sql

import hydra.avro.util.SchemaWrapper
import org.apache.avro.Schema

import scala.util.Try

private class AggregatedDialect(dialects: List[JdbcDialect])
    extends JdbcDialect {

  require(dialects.nonEmpty)

  override def canHandle(url: String): Boolean =
    dialects.map(_.canHandle(url)).reduce(_ && _)

  override def getJDBCType(dt: Schema): Option[JdbcType] = {
    dialects.flatMap(_.getJDBCType(dt)).headOption
  }

  override def buildUpsert(
      table: String,
      schema: SchemaWrapper,
      dbs: DbSyntax
  ): String = {
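    // Every dialect's buildUpsert is evaluated inside a Try, but only the first dialect's
    // result is returned; if that first attempt failed, .get rethrows its exception.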
    dialects.map(d => Try(d.buildUpsert(table, schema, dbs))).head.get
  }
} 
Example 113
Source File: Interface.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.sql

import java.sql.JDBCType

import hydra.avro.util.SchemaWrapper
import org.apache.avro.Schema


case class Database(
    name: String,
    locationUri: String,
    description: Option[String]
)

case class Table(
    name: String,
    schema: SchemaWrapper,
    dbSchema: Option[String] = None,
    description: Option[String] = None
)

case class Column(
    name: String,
    schema: Schema,
    dataType: JdbcType,
    nullable: Boolean,
    description: Option[String]
)

case class DbTable(
    name: String,
    columns: Seq[DbColumn],
    description: Option[String] = None
)

case class DbColumn(
    name: String,
    jdbcType: JDBCType,
    nullable: Boolean,
    description: Option[String]
) 
Example 114
Source File: NoOpDialectSpec.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.sql

import hydra.avro.util.SchemaWrapper
import org.apache.avro.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.funspec.AnyFunSpecLike


class NoOpDialectSpec extends Matchers with AnyFunSpecLike {

  describe("The NoOp dialect") {
    it("handles everything") {
      NoopDialect.canHandle("url") shouldBe true
    }

    it("does not upsert") {
      intercept[UnsupportedOperationException] {
        NoopDialect.buildUpsert(
          "table",
          SchemaWrapper.from(Schema.create(Schema.Type.NULL)),
          UnderscoreSyntax
        )
      }
    }

    it("returns the correct json placeholder") {
      NoopDialect.jsonPlaceholder shouldBe "?"
    }

    it("does not support dropping constraints by default") {
      intercept[UnsupportedOperationException] {
        NoopDialect.dropNotNullConstraintQueries(
          "table",
          null,
          UnderscoreSyntax
        )
      }
    }
  }
} 
Example 115
Source File: AvroRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.commons.lang3.StringUtils


case class AvroRecord(
    destination: String,
    schema: Schema,
    key: String,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[String, GenericRecord]

object AvroRecord {

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      json: String,
      ackStrategy: AckStrategy,
      useStrictValidation: Boolean = false
  ): AvroRecord = {

    val payload: GenericRecord = {
      val converter: JsonConverter[GenericRecord] =
        new JsonConverter[GenericRecord](schema, useStrictValidation)
      converter.convert(json)
    }

    AvroRecord(destination, schema, key.orNull, payload, ackStrategy)
  }

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      record: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroRecord = {
    AvroRecord(destination, schema, key.orNull, record, ackStrategy)
  }
} 
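
A hedged usage sketch for the JSON-based apply above (not part of the original project). The topic name, schema and payload are made up; the AckStrategy is passed in by the caller, so no concrete member of that type is assumed here.

import hydra.core.transport.AckStrategy
import hydra.kafka.producer.AvroRecord
import org.apache.avro.SchemaBuilder

object AvroRecordSketch {
  // Builds an AvroRecord from a JSON payload, letting JsonConverter (via the apply above)
  // turn the JSON into a GenericRecord with strict validation enabled.
  def build(ack: AckStrategy): AvroRecord =
    AvroRecord(
      destination = "user-topic",
      schema = SchemaBuilder.record("User").fields()
        .requiredString("id").requiredBoolean("active").endRecord(),
      key = Some("user-1"),
      json = """{"id": "user-1", "active": true}""",
      ackStrategy = ack,
      useStrictValidation = true
    )
}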
Example 116
Source File: AvroKeyRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

final case class AvroKeyRecord(
    destination: String,
    keySchema: Schema,
    valueSchema: Schema,
    key: GenericRecord,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[GenericRecord, GenericRecord]

object AvroKeyRecord {

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      keyJson: String,
      valueJson: String,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {

    val (key, value): (GenericRecord, GenericRecord) = {
      val keyConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](keySchema).convert
      val valueConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](valueSchema).convert
      (keyConverter(keyJson), valueConverter(valueJson))
    }

    AvroKeyRecord(destination, keySchema, valueSchema, key, value, ackStrategy)
  }

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      key: GenericRecord,
      value: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {
    new AvroKeyRecord(
      destination,
      keySchema,
      valueSchema,
      key,
      value,
      ackStrategy
    )
  }
} 
Example 117
Source File: IngestionErrorHandler.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.ingestors

import akka.actor.Actor
import com.pluralsight.hydra.avro.JsonToAvroConversionException
import hydra.common.config.ConfigSupport._
import hydra.avro.registry.JsonToAvroConversionExceptionWithMetadata
import hydra.common.config.ConfigSupport
import hydra.core.ingest.RequestParams.HYDRA_KAFKA_TOPIC_PARAM
import hydra.core.protocol.GenericIngestionError
import hydra.core.transport.Transport.Deliver
import hydra.kafka.producer.AvroRecord
import org.apache.avro.Schema
import spray.json.DefaultJsonProtocol

import scala.io.Source


class IngestionErrorHandler
    extends Actor
    with ConfigSupport
    with DefaultJsonProtocol {

  import spray.json._

  private implicit val ec = context.dispatcher

  private implicit val hydraIngestionErrorInfoFormat = jsonFormat6(
    HydraIngestionErrorInfo
  )

  private val errorTopic = applicationConfig
    .getStringOpt("ingest.error-topic")
    .getOrElse("_hydra_ingest_errors")

  private lazy val kafkaTransport = context
    .actorSelection(
      applicationConfig
        .getStringOpt(s"transports.kafka.path")
        .getOrElse(s"/user/service/kafka_transport")
    )

  private val errorSchema = new Schema.Parser()
    .parse(Source.fromResource("schemas/HydraIngestError.avsc").mkString)

  override def receive: Receive = {
    case error: GenericIngestionError =>
      kafkaTransport ! Deliver(buildPayload(error))
  }

  private[ingestors] def buildPayload(
      err: GenericIngestionError
  ): AvroRecord = {
    val schema: Option[String] = err.cause match {
      case e: JsonToAvroConversionException             => Some(e.getSchema.toString)
      case e: JsonToAvroConversionExceptionWithMetadata => Some(e.location)
      case e: Exception                                 => None
    }

    val topic = err.request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM)

    val errorInfo = HydraIngestionErrorInfo(
      err.ingestor,
      topic,
      err.cause.getMessage,
      err.request.metadata,
      schema,
      err.request.payload
    ).toJson.compactPrint

    AvroRecord(
      errorTopic,
      errorSchema,
      topic,
      errorInfo,
      err.request.ackStrategy
    )
  }
}

case class HydraIngestionErrorInfo(
    ingestor: String,
    destination: Option[String],
    errorMessage: String,
    metadata: Map[String, String],
    schema: Option[String],
    payload: String
) 
Example 118
Source File: MockConnectorSpec.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.connector.mock

import com.typesafe.config.ConfigFactory
import org.apache.avro.Schema
import org.apache.avro.Schema.Type
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class MockConnectorSpec extends AnyFlatSpec with Matchers {

  it should "load the test schemas" in {
    new MockConnectorCreator().create(ConfigFactory.empty()).fullLoad() should have size (2)
  }

  it should "load the test schemas and custom ones" in {
    val connector = new MockConnectorCreator().create(ConfigFactory.empty())
    connector.insert((3L, Schema.create(Type.BYTES)) :: Nil)
    connector.fullLoad() should have size (3)
  }

} 
Example 119
Source File: CachedEagerApplicationSuite.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.app.mock

import java.lang.reflect.Modifier
import java.nio.ByteOrder

import com.typesafe.config.{Config, ConfigFactory}
import it.agilelab.darwin.annotations.AvroSerde
import it.agilelab.darwin.app.mock.classes.{MyClass, MyNestedClass, NewClass, OneField}
import it.agilelab.darwin.common.{Connector, ConnectorFactory}
import it.agilelab.darwin.manager.{AvroSchemaManager, CachedEagerAvroSchemaManager}
import org.apache.avro.{Schema, SchemaNormalization}
import org.apache.avro.reflect.ReflectData
import org.reflections.Reflections

import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers
import it.agilelab.darwin.common.compat._

class BigEndianCachedEagerApplicationSuite extends CachedEagerApplicationSuite(ByteOrder.BIG_ENDIAN)

class LittleEndianCachedEagerApplicationSuite extends CachedEagerApplicationSuite(ByteOrder.LITTLE_ENDIAN)

abstract class CachedEagerApplicationSuite(val endianness: ByteOrder) extends AnyFlatSpec with Matchers {

  val config: Config = ConfigFactory.load()
  val connector: Connector = ConnectorFactory.connector(config)
  val manager: AvroSchemaManager = new CachedEagerAvroSchemaManager(connector, endianness)

  "CachedEagerAvroSchemaManager" should "not fail after the initialization" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    assert(manager.registerAll(schemas).size == 1)
  }

  it should "load all existing schemas and register a new one" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    manager.getSchema(0L)

    manager.registerAll(schemas)

    val id = manager.getId(schemas.head)
    assert(manager.getSchema(id).isDefined)
    assert(schemas.head == manager.getSchema(id).get)
  }

  it should "get all previously registered schemas" in {
    val schema: Schema = SchemaReader.readFromResources("MyNestedClass.avsc")
    val schema0 = manager.getSchema(0L)
    val schema1 = manager.getSchema(1L)
    assert(schema0.isDefined)
    assert(schema1.isDefined)
    assert(schema0.get != schema1.get)
    assert(schema != schema0.get)
    assert(schema != schema1.get)
  }

  it should "generate all schemas for all the annotated classes with @AvroSerde" in {
    val reflections = new Reflections("it.agilelab.darwin.app.mock.classes")

    val oneFieldSchema = ReflectData.get().getSchema(classOf[OneField]).toString
    val myNestedSchema = ReflectData.get().getSchema(classOf[MyNestedClass]).toString
    val myClassSchema = ReflectData.get().getSchema(classOf[MyClass]).toString

    val annotationClass: Class[AvroSerde] = classOf[AvroSerde]
    val classes = reflections.getTypesAnnotatedWith(annotationClass).toScala.toSeq
      .filter(c => !c.isInterface && !Modifier.isAbstract(c.getModifiers))
    val schemas = classes.map(c => ReflectData.get().getSchema(Class.forName(c.getName)).toString)
    Seq(oneFieldSchema, myClassSchema, myNestedSchema) should contain theSameElementsAs schemas
  }

  it should "reload all schemas from the connector" in {
    val newSchema = ReflectData.get().getSchema(classOf[NewClass])
    val newId = SchemaNormalization.parsingFingerprint64(newSchema)
    assert(manager.getSchema(newId).isEmpty)

    connector.insert(Seq(newId -> newSchema))
    assert(manager.getSchema(newId).isEmpty)

    manager.reload()
    assert(manager.getSchema(newId).isDefined)
    assert(manager.getSchema(newId).get == newSchema)
  }

} 
Example 120
Source File: SchemaReader.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.app.mock

import java.io.{File, InputStream}

import org.apache.avro.Schema

object SchemaReader {

  def readFromResources(p: String): Schema = {
    read(getClass.getClassLoader.getResourceAsStream(p))
  }

  def read(f: File): Schema = {
    val parser = new Schema.Parser()
    parser.parse(f)
  }

  def read(s: String): Schema = {
    val parser = new Schema.Parser()
    parser.parse(s)
  }

  def read(is: InputStream): Schema = {
    val parser = new Schema.Parser()
    parser.parse(is)
  }
} 
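
A tiny usage sketch for the helpers above (not from the project); the inline schema literal is made up.

import it.agilelab.darwin.app.mock.SchemaReader

object SchemaReaderSketch extends App {
  // read(s: String) simply delegates to a fresh Schema.Parser.
  val schema = SchemaReader.read(
    """{"type": "record", "name": "Ping", "fields": [{"name": "ts", "type": "long"}]}""")
  println(schema.getName) // Ping
}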
Example 121
Source File: CachedLazyApplicationSuite.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.app.mock

import java.lang.reflect.Modifier
import java.nio.ByteOrder

import com.typesafe.config.{Config, ConfigFactory}
import it.agilelab.darwin.annotations.AvroSerde
import it.agilelab.darwin.app.mock.classes.{MyClass, MyNestedClass, NewClass, OneField}
import it.agilelab.darwin.common.{Connector, ConnectorFactory}
import it.agilelab.darwin.manager.{AvroSchemaManager, CachedLazyAvroSchemaManager}
import org.apache.avro.{Schema, SchemaNormalization}
import org.apache.avro.reflect.ReflectData
import org.reflections.Reflections

import it.agilelab.darwin.common.compat._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class BigEndianCachedLazyApplicationSuite extends CachedLazyApplicationSuite(ByteOrder.BIG_ENDIAN)

class LittleEndianCachedLazyApplicationSuite extends CachedLazyApplicationSuite(ByteOrder.LITTLE_ENDIAN)

abstract class CachedLazyApplicationSuite(val endianness: ByteOrder) extends AnyFlatSpec with Matchers {

  val config: Config = ConfigFactory.load()
  val connector: Connector = ConnectorFactory.connector(config)
  val manager: AvroSchemaManager = new CachedLazyAvroSchemaManager(connector, endianness)

  "CachedLazyAvroSchemaManager" should "not fail after the initialization" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    assert(manager.registerAll(schemas).size == 1)
  }

  it should "load all existing schemas and register a new one" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    manager.getSchema(0L)

    manager.registerAll(schemas)

    val id = manager.getId(schemas.head)
    assert(manager.getSchema(id).isDefined)
    assert(schemas.head == manager.getSchema(id).get)
  }

  it should "get all previously registered schemas" in {
    val schema: Schema = SchemaReader.readFromResources("MyNestedClass.avsc")
    val schema0 = manager.getSchema(0L)
    val schema1 = manager.getSchema(1L)
    assert(schema0.isDefined)
    assert(schema1.isDefined)
    assert(schema0.get != schema1.get)
    assert(schema != schema0.get)
    assert(schema != schema1.get)
  }

  it should "generate all schemas for all the annotated classes with @AvroSerde" in {
    val reflections = new Reflections("it.agilelab.darwin.app.mock.classes")

    val oneFieldSchema = ReflectData.get().getSchema(classOf[OneField]).toString
    val myNestedSchema = ReflectData.get().getSchema(classOf[MyNestedClass]).toString
    val myClassSchema = ReflectData.get().getSchema(classOf[MyClass]).toString

    val annotationClass: Class[AvroSerde] = classOf[AvroSerde]
    val classes = reflections.getTypesAnnotatedWith(annotationClass).toScala.toSeq
      .filter(c => !c.isInterface && !Modifier.isAbstract(c.getModifiers))
    val schemas = classes.map(c => ReflectData.get().getSchema(Class.forName(c.getName)).toString)
    Seq(oneFieldSchema, myClassSchema, myNestedSchema) should contain theSameElementsAs schemas
  }

  it should "reload all schemas from the connector" in {
    val newSchema = ReflectData.get().getSchema(classOf[NewClass])
    val newId = SchemaNormalization.parsingFingerprint64(newSchema)
    assert(manager.getSchema(newId).isEmpty)

    connector.insert(Seq(newId -> newSchema))
    assert(manager.getSchema(newId).isDefined)
    assert(manager.getSchema(newId).get == newSchema)
  }
} 
Example 122
Source File: LazyApplicationSuite.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.app.mock

import java.lang.reflect.Modifier
import java.nio.ByteOrder

import com.typesafe.config.{Config, ConfigFactory}
import it.agilelab.darwin.annotations.AvroSerde
import it.agilelab.darwin.app.mock.classes.{MyClass, MyNestedClass, NewClass, OneField}
import it.agilelab.darwin.common.{Connector, ConnectorFactory}
import it.agilelab.darwin.manager.{AvroSchemaManager, LazyAvroSchemaManager}
import org.apache.avro.{Schema, SchemaNormalization}
import org.apache.avro.reflect.ReflectData
import org.reflections.Reflections

import it.agilelab.darwin.common.compat._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class BigEndianLazyApplicationSuite extends LazyApplicationSuite(ByteOrder.BIG_ENDIAN)

class LittleEndianLazyApplicationSuite extends LazyApplicationSuite(ByteOrder.LITTLE_ENDIAN)

abstract class LazyApplicationSuite(endianness: ByteOrder) extends AnyFlatSpec with Matchers {

  val config: Config = ConfigFactory.load()
  val connector: Connector = ConnectorFactory.connector(config)
  val manager: AvroSchemaManager = new LazyAvroSchemaManager(connector, endianness)

  "LazyAvroSchemaManager" should "not fail after the initialization" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    assert(manager.registerAll(schemas).size == 1)
  }

  it should "load all existing schemas and register a new one" in {
    val schemas: Seq[Schema] = Seq(SchemaReader.readFromResources("MyNestedClass.avsc"))
    manager.getSchema(0L)

    manager.registerAll(schemas)

    val id = manager.getId(schemas.head)
    assert(manager.getSchema(id).isDefined)
    assert(schemas.head == manager.getSchema(id).get)
  }

  it should "get all previously registered schemas" in {
    val schema: Schema = SchemaReader.readFromResources("MyNestedClass.avsc")
    val schema0 = manager.getSchema(0L)
    val schema1 = manager.getSchema(1L)
    assert(schema0.isDefined)
    assert(schema1.isDefined)
    assert(schema0.get != schema1.get)
    assert(schema != schema0.get)
    assert(schema != schema1.get)
  }

  it should "generate all schemas for all the annotated classes with @AvroSerde" in {
    val reflections = new Reflections("it.agilelab.darwin.app.mock.classes")

    val oneFieldSchema = ReflectData.get().getSchema(classOf[OneField]).toString
    val myNestedSchema = ReflectData.get().getSchema(classOf[MyNestedClass]).toString
    val myClassSchema = ReflectData.get().getSchema(classOf[MyClass]).toString

    val annotationClass: Class[AvroSerde] = classOf[AvroSerde]
    val classes = reflections.getTypesAnnotatedWith(annotationClass).toScala.toSeq
      .filter(c => !c.isInterface && !Modifier.isAbstract(c.getModifiers))
    val schemas = classes.map(c => ReflectData.get().getSchema(Class.forName(c.getName)).toString)
    Seq(oneFieldSchema, myClassSchema, myNestedSchema) should contain theSameElementsAs schemas
  }

  it should "reload all schemas from the connector" in {
    val newSchema = ReflectData.get().getSchema(classOf[NewClass])
    val newId = SchemaNormalization.parsingFingerprint64(newSchema)
    assert(manager.getSchema(newId).isEmpty)

    connector.insert(Seq(newId -> newSchema))
    assert(manager.getSchema(newId).isDefined)
    assert(manager.getSchema(newId).get == newSchema)
  }
} 
Example 123
Source File: DarwinService.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.server.rest

import akka.actor.ActorSystem
import akka.http.scaladsl.model.{HttpResponse, StatusCodes}
import akka.http.scaladsl.server.directives.DebuggingDirectives
import akka.http.scaladsl.server.{Directives, Route}
import akka.stream.ActorMaterializer
import akka.stream.Attributes.LogLevels
import it.agilelab.darwin.manager.AvroSchemaManager
import org.apache.avro.Schema


trait DarwinService extends Service with Directives with DebuggingDirectives with JsonSupport {

  val manager: AvroSchemaManager

  override def route: Route = logRequestResult(("darwin", LogLevels.Debug)) {
    get {
      path("schemas" / LongNumber.?) {
        case Some(id) => manager.getSchema(id) match {
          case Some(schema) => complete(schema)
          case None => complete {
            HttpResponse(StatusCodes.NotFound)
          }
        }
        case None => complete(manager.getAll)
      }
    } ~ post {
      path("schemas" / PathEnd) {
        entity(as[Seq[Schema]]) { schemas =>
          complete {
            manager.registerAll(schemas).map(_._1)
          }
        }
      }
    }
  }
}


object DarwinService {
  def apply(asm: AvroSchemaManager)(implicit s: ActorSystem, m: ActorMaterializer): DarwinService = new DarwinService {
    override implicit val materializer: ActorMaterializer = m
    override implicit val system: ActorSystem = s
    override val manager: AvroSchemaManager = asm
  }
} 
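
A hedged wiring sketch (not part of the project) that serves the route above over HTTP using the classic akka-http bindAndHandle API. The manager construction mirrors the application suites in the earlier examples; the host and port are made up.

import java.nio.ByteOrder

import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.stream.ActorMaterializer
import com.typesafe.config.ConfigFactory
import it.agilelab.darwin.common.ConnectorFactory
import it.agilelab.darwin.manager.CachedEagerAvroSchemaManager
import it.agilelab.darwin.server.rest.DarwinService

object DarwinServerSketch extends App {
  implicit val system: ActorSystem = ActorSystem("darwin")
  implicit val materializer: ActorMaterializer = ActorMaterializer()

  // Build a manager the same way the mock application suites do.
  val connector = ConnectorFactory.connector(ConfigFactory.load())
  val manager = new CachedEagerAvroSchemaManager(connector, ByteOrder.BIG_ENDIAN)

  // GET /schemas -> all (id, schema) pairs; GET /schemas/<id> -> one schema or 404; POST /schemas -> register.
  Http().bindAndHandle(DarwinService(manager).route, "localhost", 8080)
}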
Example 124
Source File: JsonSupport.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.server.rest

import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport
import org.apache.avro.Schema
import spray.json.{DefaultJsonProtocol, JsObject, JsString, JsValue, JsonParser, PrettyPrinter, RootJsonFormat}

trait JsonSupport extends SprayJsonSupport with DefaultJsonProtocol {
  implicit val printer: PrettyPrinter.type = PrettyPrinter

  implicit val schemaFormat: RootJsonFormat[Schema] = new RootJsonFormat[Schema] {

    override def write(obj: Schema): JsValue = JsonParser(obj.toString(true))

    override def read(json: JsValue): Schema = new Schema.Parser().parse(json.prettyPrint)
  }

  implicit val schemaWithIdFormat: RootJsonFormat[(Long, Schema)] = new RootJsonFormat[(Long, Schema)] {

    override def write(obj: (Long, Schema)): JsValue = JsObject(Map(
      "id" -> JsString(obj._1.toString),
      "schema" -> schemaFormat.write(obj._2)
    ))

    override def read(json: JsValue): (Long, Schema) = json match {
      case JsObject(fields) =>
        val id = fields.get("id") match {
          case Some(JsString(number)) => number
          case _ => throw new Exception("Id field should be a long")
        }

        val schema = fields.get("schema") match {
          case Some(x@JsObject(_)) => x
          case _ => throw new Exception("schema should be an object")
        }

        (id.toLong, schemaFormat.read(schema))
      case _ => throw new Exception("should be an object")
    }
  }
} 
Example 125
Source File: HBaseConnectorSuite.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.connector.hbase

import java.nio.file.Files

import com.typesafe.config.{ConfigFactory, ConfigValueFactory}
import it.agilelab.darwin.common.Connector
import org.apache.avro.reflect.ReflectData
import org.apache.avro.{Schema, SchemaNormalization}
import org.apache.hadoop.hbase.HBaseTestingUtility
import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class HBaseConnectorSuite extends AnyFlatSpec with Matchers with BeforeAndAfterAll {

  var connector: Connector = _

  "HBaseConnector" should "load all existing schemas" in {
    connector.fullLoad()
  }

  it should "insert and retrieve" in {
    val schemas = Seq(ReflectData.get().getSchema(classOf[HBaseMock]), ReflectData.get().getSchema(classOf[HBase2Mock]))
      .map(s => SchemaNormalization.parsingFingerprint64(s) -> s)
    connector.insert(schemas)
    val loaded: Seq[(Long, Schema)] = connector.fullLoad()
    assert(loaded.size == schemas.size)
    assert(loaded.forall(schemas.contains))
    val schema = connector.findSchema(loaded.head._1)
    assert(schema.isDefined)
    assert(schema.get == loaded.head._2)
    val noSchema = connector.findSchema(-1L)
    assert(noSchema.isEmpty)
  }

  "connector.tableCreationHint" should "print the correct hint for table creation" in {
    connector.tableCreationHint() should be(
      """To create namespace and table from an HBase shell issue:
        |  create_namespace 'AVRO'
        |  create 'AVRO:SCHEMA_REPOSITORY', '0'""".stripMargin)
  }

  "connector.tableExists" should "return true with existent table" in {
    connector.tableExists() should be(true)
  }

  override def beforeAll(): Unit = {

    connector = new HBaseConnectorCreator().create(HBaseConnectorSuite.config)

    connector.createTable()
  }


}

object HBaseConnectorSuite {
  private lazy val config = {
    val util = new HBaseTestingUtility()
    val minicluster = util.startMiniCluster()

    // The HBase connector can only load configurations from a file path, so we render the Hadoop conf to a temp file.
    val confFile = Files.createTempFile("prefix", "suffix")
    val stream = Files.newOutputStream(confFile)
    minicluster.getConfiguration.writeXml(stream)
    stream.flush()
    stream.close()
    val hbaseConfigPath = ConfigValueFactory.fromAnyRef(confFile.toAbsolutePath.toString)

    // HBaseConnector only loads the conf if both hbase-site and core-site are given,
    // so we point both at the same file.
    sys.addShutdownHook(minicluster.shutdown())
    ConfigFactory.load()
      .withValue(ConfigurationKeys.HBASE_SITE, hbaseConfigPath)
      .withValue(ConfigurationKeys.CORE_SITE, hbaseConfigPath)
  }

} 
Example 126
Source File: PostgresConnectorSuite.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.connector.postgres

import com.typesafe.config.{Config, ConfigFactory}
import it.agilelab.darwin.common.Connector
import org.apache.avro.{Schema, SchemaNormalization}
import org.scalatest.BeforeAndAfterAll
import ru.yandex.qatools.embed.postgresql.EmbeddedPostgres
import ru.yandex.qatools.embed.postgresql.distribution.Version
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class PostgresConnectorSuite extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
  val config: Config = ConfigFactory.load("postgres.properties")
  val connector: Connector = new PostgresConnectorCreator().create(config)
  val embeddedPostgres: EmbeddedPostgres = new EmbeddedPostgres(Version.V9_6_11)

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    val port = 5432
    val host = "localhost"
    val dbname = "postgres"
    val username = "postgres"
    val password = "mysecretpassword"

    embeddedPostgres.start(host, port, dbname, username, password)

    connector.createTable()
  }

  override protected def afterAll(): Unit = {
    super.afterAll()

    embeddedPostgres.stop()
  }



  "PostgresConnector" should "load all existing schemas" in {
    connector.fullLoad()
  }

  ignore should "insert and retrieve" in {
    val outerSchema = new Schema.Parser().parse(getClass.getClassLoader.getResourceAsStream("postgresmock.avsc"))
    val innerSchema = outerSchema.getField("four").schema()

    val schemas = Seq(innerSchema, outerSchema)
      .map(s => SchemaNormalization.parsingFingerprint64(s) -> s)
    connector.insert(schemas)
    val loaded: Seq[(Long, Schema)] = connector.fullLoad()
    assert(loaded.size == schemas.size)
    assert(loaded.forall(schemas.contains))
  }

} 
Example 127
Source File: JsonProtocol.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.connector.rest

import java.io.InputStream

import org.apache.avro.Schema
import org.codehaus.jackson.map.ObjectMapper
import org.codehaus.jackson.node.JsonNodeFactory
import it.agilelab.darwin.common.compat._

trait JsonProtocol {
  val objectMapper = new ObjectMapper()

  def toJson(schemas: Seq[(Long, Schema)]): String = {

    val data = schemas.map {
      case (_, schema) =>
        objectMapper.readTree(schema.toString)
    }.foldLeft(JsonNodeFactory.instance.arrayNode()) {
      case (array, node) =>
        array.add(node)
        array
    }

    objectMapper.writeValueAsString(data)
  }

  def toSeqOfIdSchema(in: InputStream): Seq[(Long, Schema)] = {
    val node = objectMapper.readTree(in)

    node.getElements.toScala.map { node =>
      val id = node.get("id").asText().toLong
      val schemaNode = node.get("schema")

      val schemaToString = objectMapper.writeValueAsString(schemaNode)

      val parser = new Schema.Parser()

      val schema = parser.parse(schemaToString)

      (id, schema)
    }.toVector
  }


  def toSchema(in: InputStream): Schema = {
    val parser = new Schema.Parser()
    parser.parse(in)
  }
} 
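
A small sketch (not from the project) exercising toJson above. Note that the ids are intentionally dropped: only the schema JSON is serialized. The record name is made up.

import it.agilelab.darwin.connector.rest.JsonProtocol
import org.apache.avro.{Schema, SchemaBuilder}

object JsonProtocolSketch extends JsonProtocol with App {
  val schema: Schema = SchemaBuilder.record("Ping").fields().requiredLong("ts").endRecord()
  // Prints a JSON array containing the single schema, without its fingerprint id.
  println(toJson(Seq(1L -> schema)))
}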
Example 128
Source File: CachedLazyAvroSchemaManager.scala    From darwin   with Apache License 2.0 5 votes vote down vote up
package it.agilelab.darwin.manager

import java.nio.ByteOrder

import it.agilelab.darwin.common.Connector
import org.apache.avro.Schema


class CachedLazyAvroSchemaManager(connector: Connector, endianness: ByteOrder)
  extends CachedAvroSchemaManager(connector, endianness) {

  override def getSchema(id: Long): Option[Schema] = {
    cache.getSchema(id).orElse {
      val schema: Option[Schema] = connector.findSchema(id)
      schema.foreach(s => _cache.set(Some(cache.insert(Seq(getId(s) -> s)))))
      schema
    }
  }

  override def getAll: Seq[(Long, Schema)] = {
    _cache.set(Some(cache.insert(connector.fullLoad())))
    cache.getAll
  }
} 
Example 129
Source File: ObjectToBytesWithSchema.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.util

import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotSerializer
import com.hortonworks.registries.schemaregistry.{SchemaMetadata, SchemaVersionInfo}
import com.orendainx.trucking.commons.models.{EnrichedTruckAndTrafficData, WindowedDriverStats}
import com.typesafe.scalalogging.Logger
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConverters._


class ObjectToBytesWithSchema extends BaseRichBolt {

  private lazy val log = Logger(this.getClass)
  private var outputCollector: OutputCollector = _

  // Declare schema-related fields to be initialized when this component's prepare() method is called
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var serializer: AvroSnapshotSerializer = _

  private var joinedSchemaMetadata: SchemaMetadata = _
  private var joinedSchemaInfo: SchemaVersionInfo = _
  private var driverStatsSchemaMetadata: SchemaMetadata = _
  private var driverStatsJoinedSchemaInfo: SchemaVersionInfo = _

  override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = {
    outputCollector = collector

    val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString
    val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl).asJava

    schemaRegistryClient = new SchemaRegistryClient(clientConfig)

    joinedSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckAndTrafficData").getSchemaMetadata
    joinedSchemaInfo = schemaRegistryClient.getLatestSchemaVersionInfo("EnrichedTruckAndTrafficData")

    driverStatsSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("WindowedDriverStats").getSchemaMetadata
    driverStatsJoinedSchemaInfo = schemaRegistryClient.getLatestSchemaVersionInfo("WindowedDriverStats")

    serializer = schemaRegistryClient.getDefaultSerializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotSerializer]
    serializer.init(clientConfig)
  }

  override def execute(tuple: Tuple): Unit = {

    val serializedBytes = tuple.getStringByField("dataType") match {
      case "EnrichedTruckAndTrafficData" =>
        val record = enrichedTruckAndTrafficToGenericRecord(tuple.getValueByField("data").asInstanceOf[EnrichedTruckAndTrafficData])
        serializer.serialize(record, joinedSchemaMetadata)
      case "WindowedDriverStats" =>
        val record = windowedDriverStatsToGenericRecord(tuple.getValueByField("data").asInstanceOf[WindowedDriverStats])
        serializer.serialize(record, driverStatsSchemaMetadata)
    }

    outputCollector.emit(new Values(serializedBytes))
    outputCollector.ack(tuple)
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data"))

  private def enrichedTruckAndTrafficToGenericRecord(data: EnrichedTruckAndTrafficData) = {
    val record = new GenericData.Record(new Schema.Parser().parse(joinedSchemaInfo.getSchemaText))
    record.put("eventTime", data.eventTime)
    record.put("truckId", data.truckId)
    record.put("driverId", data.driverId)
    record.put("driverName", data.driverName)
    record.put("routeId", data.routeId)
    record.put("routeName", data.routeName)
    record.put("latitude", data.latitude)
    record.put("longitude", data.longitude)
    record.put("speed", data.speed)
    record.put("eventType", data.eventType)
    record.put("foggy", data.foggy)
    record.put("rainy", data.rainy)
    record.put("windy", data.windy)
    record.put("congestionLevel", data.congestionLevel)
    record
  }

  private def windowedDriverStatsToGenericRecord(data: WindowedDriverStats) = {
    val record = new GenericData.Record(new Schema.Parser().parse(driverStatsJoinedSchemaInfo.getSchemaText))
    record.put("driverId", data.driverId)
    record.put("averageSpeed", data.averageSpeed)
    record.put("totalFog", data.totalFog)
    record.put("totalRain", data.totalRain)
    record.put("totalWind", data.totalWind)
    record.put("totalViolations", data.totalViolations)
    record
  }
} 
Example 130
Source File: AvroToSchema.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import collection.JavaConversions._
import org.apache.spark.sql.types._
import org.apache.avro.Schema

// Converts an Avro schema to a Spark DataFrame schema.
//
// This assumes that the Avro schema is "flat", i.e. a Record that includes primitive types
// or unions of primitive types. Unions, and Avro types that don't directly map to Scala types,
// are converted to Strings and put in a Spark SQL StringType column.
private object AvroToSchema {
  def getSchema(schema: Schema): StructType = {
    StructType(schema.getFields.map(field => {
      val fieldName = field.name
      val fieldSchema = field.schema
      val fieldType = fieldSchema.getType match {
        case Schema.Type.BOOLEAN => BooleanType
        case Schema.Type.DOUBLE => DoubleType
        case Schema.Type.FLOAT => FloatType
        case Schema.Type.INT => IntegerType
        case Schema.Type.LONG => LongType
        case Schema.Type.NULL => NullType
        case Schema.Type.STRING => StringType
        case _ => StringType
      }
      StructField(fieldName, fieldType.asInstanceOf[DataType], true)
    }))
  }
} 
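
A short usage sketch of the conversion above (not part of the original project). It has to live in the same package because AvroToSchema is package-private; the schema literal is made up. Note how the union field collapses to a StringType column.

package com.memsql.spark.examples.avro

import org.apache.avro.Schema

object AvroToSchemaSketch extends App {
  val json =
    """{"type": "record", "name": "Visit", "fields": [
      |  {"name": "userId", "type": "long"},
      |  {"name": "page", "type": "string"},
      |  {"name": "referrer", "type": ["null", "string"]}
      |]}""".stripMargin

  val sparkSchema = AvroToSchema.getSchema(new Schema.Parser().parse(json))
  // StructField(userId,LongType,true), StructField(page,StringType,true), StructField(referrer,StringType,true)
  sparkSchema.fields.foreach(println)
}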
Example 131
Source File: AvroTransformer.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import com.memsql.spark.etl.api.{UserTransformConfig, Transformer, PhaseConfig}
import com.memsql.spark.etl.utils.PhaseLogger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types.StructType

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.avro.io.DecoderFactory
import org.apache.avro.specific.SpecificDatumReader

// Takes DataFrames of byte arrays, where each row is a serialized Avro record.
// Returns DataFrames of deserialized data, where each field has its own column.
class AvroTransformer extends Transformer {
  var avroSchemaStr: String = null
  var sparkSqlSchema: StructType = null

  def AvroRDDToDataFrame(sqlContext: SQLContext, rdd: RDD[Row]): DataFrame = {

    val rowRDD: RDD[Row] = rdd.mapPartitions({ partition => {
      // Create per-partition copies of non-serializable objects
      val parser: Schema.Parser = new Schema.Parser()
      val avroSchema = parser.parse(avroSchemaStr)
      val reader = new SpecificDatumReader[GenericData.Record](avroSchema)

      partition.map({ rowOfBytes =>
        val bytes = rowOfBytes(0).asInstanceOf[Array[Byte]]
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val avroToRow = new AvroToRow()

        avroToRow.getRow(record)
      })
    }})
    sqlContext.createDataFrame(rowRDD, sparkSqlSchema)
  }

  override def initialize(sqlContext: SQLContext, config: PhaseConfig, logger: PhaseLogger): Unit = {
    val userConfig = config.asInstanceOf[UserTransformConfig]

    val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match {
      case Some(s) => s
      case None => throw new IllegalArgumentException("avroSchema must be set in the config")
    }
    avroSchemaStr = avroSchemaJson.toString

    val parser = new Schema.Parser()
    val avroSchema = parser.parse(avroSchemaJson.toString)
    sparkSqlSchema = AvroToSchema.getSchema(avroSchema)
  }

  override def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = {
    AvroRDDToDataFrame(sqlContext, df.rdd)
  }
} 
Example 132
Source File: AvroRandomGenerator.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData

import scala.collection.JavaConversions._
import scala.util.Random

class AvroRandomGenerator(inSchema: Schema) {
  // Avoid nested Records, since our destination is a DataFrame.
  val MAX_RECURSION_LEVEL: Int = 1

  val topSchema: Schema = inSchema
  val random = new Random

  def next(schema: Schema = this.topSchema, level: Int = 0): Any = {
    if (level <= MAX_RECURSION_LEVEL) {

      schema.getType match {
        case Schema.Type.RECORD => {
          val datum = new GenericData.Record(schema)
          schema.getFields.foreach {
            x => datum.put(x.pos, next(x.schema, level + 1))
          }
          datum
        }

        case Schema.Type.UNION => {
          val types = schema.getTypes
          // Generate a value using the first type in the union.
          // "Random type" is also a valid option.
          next(types(0), level)
        }

        case _ => generateValue(schema.getType)
      }

    } else {
      null
    }
  }

  def generateValue(avroType: Schema.Type): Any = avroType match {
    case Schema.Type.BOOLEAN => random.nextBoolean
    case Schema.Type.DOUBLE => random.nextDouble
    case Schema.Type.FLOAT => random.nextFloat
    case Schema.Type.INT => random.nextInt
    case Schema.Type.LONG => random.nextLong
    case Schema.Type.NULL => null
    case Schema.Type.STRING => getRandomString
    case _ => null
  }

  def getRandomString(): String = {
    val length: Int = 5 + random.nextInt(5)
    (1 to length).map(x => ('a'.toInt + random.nextInt(26)).toChar).mkString
  }

} 
Example 133
Source File: AvroRandomExtractor.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import com.memsql.spark.etl.api._
import com.memsql.spark.etl.utils.PhaseLogger
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.avro.io.{DatumWriter, EncoderFactory}
import org.apache.avro.specific.SpecificDatumWriter

import java.io.ByteArrayOutputStream

// Generates an RDD of byte arrays, where each is a serialized Avro record.
class AvroRandomExtractor extends Extractor {
  var count: Int = 1
  var generator: AvroRandomGenerator = null
  var writer: DatumWriter[GenericData.Record] = null
  var avroSchema: Schema = null
  
  def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil)

  val parser: Schema.Parser = new Schema.Parser()

  override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = {
    val userConfig = config.asInstanceOf[UserExtractConfig]
    val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match {
      case Some(s) => s
      case None => throw new IllegalArgumentException("avroSchema must be set in the config")
    }
    count = userConfig.getConfigInt("count").getOrElse(1)
    avroSchema = parser.parse(avroSchemaJson.toString)

    writer = new SpecificDatumWriter(avroSchema)
    generator = new AvroRandomGenerator(avroSchema)
  }

  override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = {
    val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({
      val out = new ByteArrayOutputStream
      val encoder = EncoderFactory.get().binaryEncoder(out, null)
      val avroRecord: GenericData.Record = generator.next().asInstanceOf[GenericData.Record]

      writer.write(avroRecord, encoder)
      encoder.flush
      out.close
      out.toByteArray
    })))

    Some(sqlContext.createDataFrame(rdd, schema))
  }
} 
Example 134
Source File: AvroToRow.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import collection.JavaConversions._
import org.apache.spark.sql.Row
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData

// Converts an Avro record to a Spark DataFrame row.
//
// This assumes that the Avro schema is "flat", i.e. a Record that includes primitive types
// or unions of primitive types. Unions, and Avro types that don't directly map to Scala types,
// are converted to Strings and put in a Spark SQL StringType column.
private class AvroToRow {
  def getRow(record: GenericData.Record): Row = {
    Row.fromSeq(record.getSchema.getFields().map(f => {
      val schema = f.schema()
      val obj = record.get(f.pos)

      schema.getType match {
        case Schema.Type.BOOLEAN => obj.asInstanceOf[Boolean]
        case Schema.Type.DOUBLE => obj.asInstanceOf[Double]
        case Schema.Type.FLOAT => obj.asInstanceOf[Float]
        case Schema.Type.INT => obj.asInstanceOf[Int]
        case Schema.Type.LONG => obj.asInstanceOf[Long]
        case Schema.Type.NULL => null

        case _ => obj.toString
      }
    }))
  }
} 
Example 135
Source File: AvroToRowSpec.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import com.memsql.spark.connector.dataframe.JsonValue

import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.Row
import test.util.Fixtures

import collection.JavaConversions._
import java.nio.ByteBuffer
import org.scalatest._

class AvroToRowSpec extends FlatSpec {
  "AvroToRow" should "create Spark SQL Rows from Avro objects" in {
    val parser: Schema.Parser = new Schema.Parser()
    val avroTestSchema: Schema = parser.parse(Fixtures.avroSchema)

    val record: GenericData.Record = new GenericData.Record(avroTestSchema)

    record.put("testBool", true)
    record.put("testDouble", 19.88)
    record.put("testFloat", 3.19f)
    record.put("testInt", 1123)
    record.put("testLong", 2147483648L)
    record.put("testNull", null)
    record.put("testString", "Conor")
    record.put("testUnion", 17)

    val row: Row = new AvroToRow().getRow(record)

    assert(row.getAs[Boolean](0))
    assert(row.getAs[Double](1) == 19.88)
    assert(row.getAs[Float](2) == 3.19f)
    assert(row.getAs[Int](3) == 1123)
    assert(row.getAs[Long](4) == 2147483648L)
    assert(row.getAs[Null](5) == null)
    assert(row.getAs[String](6) == "Conor")
    assert(row.getAs[String](7) == "17")
  }
} 
Example 136
Source File: AvroRandomGeneratorSpec.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import org.scalatest._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import test.util.Fixtures

class AvroRandomGeneratorSpec extends FlatSpec {
  "AvroRandomGenerator" should "create Avro objects with random values" in {
    val schema = new Schema.Parser().parse(Fixtures.avroSchema)
    val avroRecord:GenericData.Record = new AvroRandomGenerator(schema).next().asInstanceOf[GenericData.Record]

    assert(avroRecord.get("testBool").isInstanceOf[Boolean])
    assert(avroRecord.get("testDouble").isInstanceOf[Double])
    assert(avroRecord.get("testFloat").isInstanceOf[Float])
    assert(avroRecord.get("testInt").isInstanceOf[Int])
    assert(avroRecord.get("testLong").isInstanceOf[Long])
    assert(avroRecord.get("testNull") == null)
    assert(avroRecord.get("testString").isInstanceOf[String])
    assert(avroRecord.get("testUnion").isInstanceOf[Int])
  }
} 
Example 137
Source File: AvroToSchemaSpec.scala    From streamliner-examples   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark.examples.avro

import com.memsql.spark.connector.dataframe.JsonType
import org.apache.spark.sql.types._
import org.apache.avro.Schema
import org.scalatest._
import test.util.Fixtures

class AvroToSchemaSpec extends FlatSpec {
  "AvroToSchema" should "create a Spark SQL schema from an Avro schema" in {
    val parser = new Schema.Parser()
    val avroSchema = parser.parse(Fixtures.avroSchema)
    val sparkSchema = AvroToSchema.getSchema(avroSchema)
    val fields = sparkSchema.fields

    assert(fields.forall(field => field.nullable))
    assert(fields(0).name == "testBool")
    assert(fields(0).dataType == BooleanType)

    assert(fields(1).name == "testDouble")
    assert(fields(1).dataType == DoubleType)

    assert(fields(2).name == "testFloat")
    assert(fields(2).dataType == FloatType)

    assert(fields(3).name == "testInt")
    assert(fields(3).dataType == IntegerType)

    assert(fields(4).name == "testLong")
    assert(fields(4).dataType == LongType)

    assert(fields(5).name == "testNull")
    assert(fields(5).dataType == NullType)

    assert(fields(6).name == "testString")
    assert(fields(6).dataType == StringType)

    assert(fields(7).name == "testUnion")
    assert(fields(7).dataType == StringType)
  }
} 
Example 138
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
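A minimal sketch of a spec that drives the utility above. Weather is a hypothetical avrohugger-generated case class extending SpecificRecordBase; it is not defined anywhere in this listing.

package test

import org.specs2.mutable.Specification

// Hypothetical: Weather(station: String, temp: Double) stands in for an
// avrohugger-generated case class that extends SpecificRecordBase.
class WeatherRoundTripSpec extends Specification {

  "A generated Weather record" should {
    "survive an Avro data-file write/read round trip" in {
      SpecificTestUtil.verifyWriteAndRead(List(Weather("KSEA", 12.5), Weather("KSFO", 15.0)))
    }
  }
}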
Example 139
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]()
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 140
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file);
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close();
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 141
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file);
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close();
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 142
Source File: DefaultRowReader.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.avro

import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader}
import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
import SchemaConverter._
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, RowReader}
import ml.combust.mleap.core.types.StructType
import ml.combust.mleap.runtime.frame.{ArrayRow, Row}

import scala.util.Try


class DefaultRowReader(override val schema: StructType) extends RowReader {
  val valueConverter = ValueConverter()
  lazy val readers = schema.fields.map(_.dataType).map(valueConverter.avroToMleap)
  val avroSchema = schema: Schema
  val datumReader = new GenericDatumReader[GenericData.Record](avroSchema)
  var decoder: BinaryDecoder = null
  var record = new GenericData.Record(avroSchema)

  override def fromBytes(bytes: Array[Byte], charset: Charset = BuiltinFormats.charset): Try[Row] = Try {
    decoder = DecoderFactory.get().binaryDecoder(bytes, decoder)
    record = datumReader.read(record, decoder)
    val row = ArrayRow(new Array[Any](schema.fields.length))
    for(i <- schema.fields.indices) { row.set(i, readers(i)(record.get(i))) }
    row
  }
} 
Example 143
Source File: DefaultFrameWriter.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.avro

import java.io.ByteArrayOutputStream
import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic.{GenericData, GenericDatumWriter}
import SchemaConverter._
import ml.combust.mleap.runtime.frame.LeapFrame
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, FrameWriter}
import resource._

import scala.util.{Failure, Try}


class DefaultFrameWriter[LF <: LeapFrame[LF]](frame: LF) extends FrameWriter {
  val valueConverter = ValueConverter()

  override def toBytes(charset: Charset = BuiltinFormats.charset): Try[Array[Byte]] = {
    (for(out <- managed(new ByteArrayOutputStream())) yield {
      val writers = frame.schema.fields.map(_.dataType).map(valueConverter.mleapToAvro)
      val avroSchema = frame.schema: Schema
      val record = new GenericData.Record(avroSchema)
      val datumWriter = new GenericDatumWriter[GenericData.Record](avroSchema)
      val writer = new DataFileWriter[GenericData.Record](datumWriter)
      writer.create(avroSchema, out)

      for(row <- frame.collect()) {
        var i = 0
        for(writer <- writers) {
          record.put(i, writer(row.getRaw(i)))
          i = i + 1
        }

        Try(writer.append(record)) match {
          case Failure(error) => error.printStackTrace()
          case _ =>
        }
      }

      writer.close()

      out.toByteArray
    }).tried
  }
} 
Example 144
Source File: DefaultRowWriter.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.avro

import java.io.ByteArrayOutputStream
import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter}
import org.apache.avro.io.{BinaryEncoder, EncoderFactory}
import SchemaConverter._
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, RowWriter}
import ml.combust.mleap.core.types.StructType
import ml.combust.mleap.runtime.frame.Row
import resource._

import scala.util.Try


class DefaultRowWriter(override val schema: StructType) extends RowWriter {
  val valueConverter = ValueConverter()
  lazy val writers = schema.fields.map(_.dataType).map(valueConverter.mleapToAvro)
  val avroSchema = schema: Schema
  val datumWriter = new GenericDatumWriter[GenericData.Record](avroSchema)
  var encoder: BinaryEncoder = null
  var record = new GenericData.Record(avroSchema)

  override def toBytes(row: Row, charset: Charset = BuiltinFormats.charset): Try[Array[Byte]] = {
    (for(out <- managed(new ByteArrayOutputStream(1024))) yield {
      encoder = EncoderFactory.get().binaryEncoder(out, encoder)

      var i = 0
      for(writer <- writers) {
        record.put(i, writer(row.getRaw(i)))
        i = i + 1
      }
      datumWriter.write(record, encoder)
      encoder.flush()

      out.toByteArray
    }).tried
  }
} 
Example 145
Source File: ParquetWriterTask.scala    From gearpump-examples   with Apache License 2.0 5 votes vote down vote up
package io.gearpump.examples.kafka_hdfs_pipeline

import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.examples.kafka_hdfs_pipeline.ParquetWriterTask._
import io.gearpump.streaming.task.{StartTime, Task, TaskContext}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.AvroParquetWriter

import scala.util.{Failure, Success, Try}

class ParquetWriterTask(taskContext : TaskContext, config: UserConfig) extends Task(taskContext, config) {
  val outputFileName = taskContext.appName + ".parquet"
  val absolutePath = Option(getHdfs + config.getString(PARQUET_OUTPUT_DIRECTORY).getOrElse("/parquet") + "/" + outputFileName).map(deleteFile(_)).get
  val outputPath = new Path(absolutePath)
  var parquetWriter = new AvroParquetWriter[SpaceShuttleRecord](outputPath, SpaceShuttleRecord.SCHEMA$)

  def getYarnConf = new YarnConfiguration
  def getFs = FileSystem.get(getYarnConf)
  def getHdfs = new Path(getFs.getHomeDirectory, "gearpump")

  private def deleteFile(fileName: String): String = {
    val file = new Path(fileName)
    getFs.exists(file) match {
      case true =>
        getFs.delete(file,false)
      case false =>
    }
    fileName
  }

  override def onStart(startTime: StartTime): Unit = {
    LOG.info(s"ParquetWriter.onStart $absolutePath")
  }

  override def onNext(msg: Message): Unit = {
    Try({
      parquetWriter.write(msg.msg.asInstanceOf[SpaceShuttleRecord])
    }) match {
      case Success(ok) =>
      case Failure(throwable) =>
        LOG.error(s"failed ${throwable.getMessage}")
    }
  }

  override def onStop(): Unit = {
    LOG.info("ParquetWriter.onStop")
    parquetWriter.close()
  }
}

object ParquetWriterTask {
  val PARQUET_OUTPUT_DIRECTORY = "parquet.output.directory"
  val PARQUET_WRITER = "parquet.writer"
} 
Example 146
Source File: ParquetWriterTaskSpec.scala    From gearpump-examples   with Apache License 2.0 5 votes vote down vote up
package io.gearpump.examples.kafka_hdfs_pipeline

import akka.actor.ActorSystem
import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.streaming.MockUtil
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter}
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.hadoop.api.ReadSupport
import org.mockito.Mockito
import org.mockito.Mockito._
import org.scalatest.prop.PropertyChecks
import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec}


class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll {

  implicit var system: ActorSystem = ActorSystem("PipeLineSpec")
  val context = MockUtil.mockTaskContext
  val appName = "KafkaHdfsPipeLine"
  when(context.appName).thenReturn(appName)
  val fs = FileSystem.get(new YarnConfiguration)
  val homeDir = fs.getHomeDirectory.toUri.getPath
  val parquetDir = new Path(homeDir, "gearpump")  + "/parquet/"
  val parquetPath = parquetDir + appName + ".parquet"
  val parquetCrc = parquetDir + "." + appName + ".parquet.crc"
  val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]])

  val anomaly = 0.252
  val now = System.currentTimeMillis

  val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet")

  override def afterAll(): Unit = {
    List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete)
    system.shutdown()
  }

  property("ParquetWriterTask should initialize with local parquet file opened for writing") {
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    val path = parquetWriterTask.absolutePath.stripPrefix("file:")
    assert(parquetPath.equals(path))
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should write records to a parquet file") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.parquetWriter = parquetWriter
    parquetWriterTask.onNext(message)
    verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord])
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should have verifiable written record") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.onNext(message)
    parquetWriterTask.onStop
    val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath))
    val record = reader.read()
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly)
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts)
  }
} 
Example 147
Source File: LineItem.scala    From scavro   with Apache License 2.0 5 votes vote down vote up
package org.oedura.scavrodemo.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroReader, AvroSerializeable, AvroMetadata}
import org.oedura.scavrodemo.idl.{LineItem => JLineItem}


case class LineItem(name: String, price: Double, quantity: Int) extends AvroSerializeable {
  type J = JLineItem
  override def toAvro: JLineItem = new JLineItem(name, price.toFloat, quantity)
}

object LineItem {
  implicit def reader = new AvroReader[LineItem] { override type J = JLineItem }

  implicit val metadata: AvroMetadata[LineItem, JLineItem] = new AvroMetadata[LineItem, JLineItem] {
    override val avroClass: Class[JLineItem] = classOf[JLineItem]
    override val schema: Schema = JLineItem.getClassSchema
    override val fromAvro: (JLineItem) => LineItem = (j: JLineItem) => {
      LineItem(j.getName.toString, j.getPrice.doubleValue, j.getQuantity)
    }
  }
} 
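A small round-trip sketch using only the members defined above. It assumes the Avro-generated Java class JLineItem is on the classpath (it comes from the project's code generation and is not shown here); the price 9.5 is chosen because it is exactly representable as a Float, so the Double -> Float -> Double conversion compares equal.

import org.oedura.scavrodemo.model.LineItem

object LineItemRoundTrip extends App {
  val item = LineItem("widget", 9.5, 3)             // 9.5 survives the Float round trip exactly
  val javaItem = item.toAvro                        // Scala case class -> generated Java record
  val back = LineItem.metadata.fromAvro(javaItem)   // generated Java record -> Scala case class
  println(back == item)                             // expected: true
}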
Example 148
Source File: Number.scala    From scavro   with Apache License 2.0 5 votes vote down vote up
package org.oedura.scavro

import org.apache.avro.Schema


case class Number(name: String, value: Int) extends AvroSerializeable {
  type J = MockNumber
  override def toAvro: MockNumber = new MockNumber(name, value)
}

object Number {
  implicit def reader = new AvroReader[Number] { override type J = MockNumber }

  implicit val metadata = new AvroMetadata[Number, MockNumber] {
    override val avroClass = classOf[MockNumber]
    override val schema: Schema = MockNumber.getClassSchema
    override val fromAvro: (MockNumber) => Number = { mock =>
      val name: String = mock.get(0).asInstanceOf[String]
      val value: Int = mock.get(1).asInstanceOf[Int]
      Number(name, value)
    }
  }
} 
Example 149
Source File: GenericAvroSerializerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
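The fingerprint test above depends on schemas being registered on the SparkConf. A minimal sketch of doing the same in application code; SparkConf.registerAvroSchemas is the public hook used here.

import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.SparkConf

object KryoAvroConf {
  val schema: Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()

  // Registering the schema up front lets the Kryo-backed serializer ship a
  // schema fingerprint instead of the full schema text with each record.
  val conf: SparkConf = new SparkConf()
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .registerAvroSchemas(schema)
}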
Example 150
Source File: ClassStore.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package stores

import org.apache.avro.Schema

import treehugger.forest.Symbol

import java.util.concurrent.ConcurrentHashMap
import scala.jdk.CollectionConverters._

class ClassStore {

  val generatedClasses: scala.collection.concurrent.Map[Schema, Symbol] = {
  	new ConcurrentHashMap[Schema, Symbol]().asScala
  }

  def accept(schema: Schema, caseClassDef: Symbol) = {
    if (!generatedClasses.contains(schema)) {
      val _ = generatedClasses += schema -> caseClassDef
    }
  }
} 
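A minimal usage sketch for the store above, pairing a parsed record schema with a treehugger class symbol; the symbol name is arbitrary.

import org.apache.avro.Schema
import avrohugger.stores.ClassStore
import treehugger.forest._
import definitions._

object ClassStoreSketch extends App {
  val schema = new Schema.Parser().parse(
    """{"type": "record", "name": "Person", "fields": [{"name": "name", "type": "string"}]}""")

  val classStore = new ClassStore
  classStore.accept(schema, RootClass.newClass("Person"))  // register a symbol for this schema
  println(classStore.generatedClasses(schema))             // the Symbol registered above
}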
Example 151
Source File: DefaultParamMatcher.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package matchers

import avrohugger.matchers.custom.CustomDefaultParamMatcher
import avrohugger.stores.ClassStore
import avrohugger.types._
import org.apache.avro.Schema
import org.apache.avro.Schema.Type
import treehugger.forest._
import definitions._
import treehugger.forest
import treehuggerDSL._


object DefaultParamMatcher {

  // for SpecificRecord
  def asDefaultParam(
    classStore: ClassStore,
    avroSchema: Schema,
    typeMatcher: TypeMatcher): Tree  = {

    avroSchema.getType match {

      case Type.BOOLEAN => FALSE
      case Type.INT     =>
        LogicalType.foldLogicalTypes[Tree](
          schema = avroSchema,
          default = LIT(0)) {
          case Date =>
            CustomDefaultParamMatcher.checkCustomDateType(
              typeMatcher.avroScalaTypes.date)
        }
      case Type.LONG    =>
        LogicalType.foldLogicalTypes[Tree](
          schema = avroSchema,
          default = LIT(0L)) {
          case TimestampMillis =>
            CustomDefaultParamMatcher.checkCustomTimestampMillisType(
              typeMatcher.avroScalaTypes.timestampMillis)
        }
      case Type.FLOAT   => LIT(0F)
      case Type.DOUBLE  => LIT(0D)
      case Type.STRING  =>
        LogicalType.foldLogicalTypes[Tree](
          schema = avroSchema,
          default = LIT("")) {
          case UUID => REF("java.util.UUID.randomUUID")
        }
      case Type.NULL    => NULL
      case Type.FIXED   => sys.error("the FIXED datatype is not yet supported")
      case Type.ENUM    =>
        CustomDefaultParamMatcher.checkCustomEnumType(typeMatcher.avroScalaTypes.enum)
      case Type.BYTES   =>
        CustomDefaultParamMatcher.checkCustomDecimalType(
          decimalType = typeMatcher.avroScalaTypes.decimal,
          schema = avroSchema,
          default = NULL)
      case Type.RECORD  => NEW(classStore.generatedClasses(avroSchema))
      case Type.UNION   => NONE
      case Type.ARRAY   =>
        CustomDefaultParamMatcher.checkCustomArrayType(typeMatcher.avroScalaTypes.array) DOT "empty"
      case Type.MAP     =>
        MAKE_MAP(LIT("") ANY_-> asDefaultParam(classStore, avroSchema.getValueType, typeMatcher))
      
    }

  }

} 
Example 152
Source File: CustomDefaultParamMatcher.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package matchers
package custom

import avrohugger.matchers.custom.CustomUtils._
import avrohugger.types._
import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.Schema

object CustomDefaultParamMatcher {

  def checkCustomArrayType(arrayType: AvroScalaArrayType) = {
    arrayType match {
      case ScalaArray  => ArrayClass
      case ScalaList   => ListClass
      case ScalaSeq    => SeqClass
      case ScalaVector => VectorClass
    }
  }
  
  def checkCustomEnumType(enumType: AvroScalaEnumType) = {
    enumType match {
      case JavaEnum            => NULL // TODO Take first enum value?
      case ScalaEnumeration    => NULL // TODO Take first enum value?
      case ScalaCaseObjectEnum => NULL // TODO Take first enum value?
      case EnumAsScalaString   => LIT("")
    }
  }
  
  def checkCustomDateType(dateType: AvroScalaDateType) = dateType match {
    case JavaSqlDate       => NEW(REF("java.sql.Date"), LIT(0L))
    case JavaTimeLocalDate => REF("java.time.LocalDate.now")
  }
  
  def checkCustomTimestampMillisType(timestampMillisType: AvroScalaTimestampMillisType) =
    timestampMillisType match {
      case JavaSqlTimestamp => NEW(REF("java.sql.Timestamp"), LIT(0L))
      case JavaTimeInstant  => REF("java.time.Instant.now")
    }

  def checkCustomDecimalType(decimalType: AvroScalaDecimalType, schema: Schema, default: => Tree, decimalValue: => Option[String] = None) = {
    val decimalValueRef = REF("scala.math.BigDecimal") APPLY decimalValue.map(LIT(_)).getOrElse(LIT(0))
    LogicalType.foldLogicalTypes[Tree](
      schema = schema,
      default = default) {
        case Decimal(precision, scale) => decimalType match {
          case ScalaBigDecimal(_) => decimalValueRef
          case ScalaBigDecimalWithPrecision(_) => decimalTagged(precision, scale) APPLY decimalValueRef
        }                
    }
  }
} 
Example 153
Source File: CustomTypeMatcher.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package matchers
package custom

import avrohugger.matchers.custom.CustomUtils._
import avrohugger.stores.ClassStore
import avrohugger.types._
import org.apache.avro.Schema
import treehugger.forest._
import treehuggerDSL._
import definitions._

object CustomTypeMatcher {

  def checkCustomArrayType(arrayType: AvroScalaArrayType) = arrayType match {
    case ScalaArray  => TYPE_ARRAY(_)
    case ScalaList   => TYPE_LIST(_)
    case ScalaSeq    => TYPE_SEQ(_)
    case ScalaVector => TYPE_VECTOR(_)
  }
  
  def checkCustomEnumType(
    enumType: AvroScalaEnumType,
    classStore: ClassStore,
    schema: Schema) = enumType match {
      case JavaEnum => classStore.generatedClasses(schema)
      case ScalaEnumeration => classStore.generatedClasses(schema)
      case ScalaCaseObjectEnum => classStore.generatedClasses(schema)
      case EnumAsScalaString => StringClass
    }

  def checkCustomNumberType(numberType: AvroScalaNumberType) = numberType match {
    case ScalaDouble => DoubleClass
    case ScalaFloat  => FloatClass
    case ScalaLong   => LongClass
    case ScalaInt    => IntClass
  }
  
  def checkCustomDateType(dateType: AvroScalaDateType) = dateType match {
    case JavaTimeLocalDate => RootClass.newClass(nme.createNameType("java.time.LocalDate"))
    case JavaSqlDate       => RootClass.newClass(nme.createNameType("java.sql.Date"))
  } 
    
  def checkCustomTimestampMillisType(timestampType: AvroScalaTimestampMillisType) = timestampType match {
    case JavaSqlTimestamp => RootClass.newClass(nme.createNameType("java.sql.Timestamp"))
    case JavaTimeInstant  => RootClass.newClass(nme.createNameType("java.time.Instant"))
  }

  def checkCustomDecimalType(decimalType: AvroScalaDecimalType, schema: Schema) =
      LogicalType.foldLogicalTypes(
        schema = schema,
        default = TYPE_ARRAY(ByteClass)) {
          case Decimal(precision, scale) => decimalType match {
            case ScalaBigDecimal(_) => BigDecimalClass
            case ScalaBigDecimalWithPrecision(_) => decimalTaggedType(precision, scale)
          }
        }
} 
Example 154
Source File: LogicalAvroScalaTypes.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package types

import org.apache.avro.Schema

sealed trait AvroScalaDecimalType extends Product with Serializable
case class ScalaBigDecimal(maybeRoundingMode: Option[BigDecimal.RoundingMode.Value]) extends AvroScalaDecimalType
case class ScalaBigDecimalWithPrecision(maybeRoundingMode: Option[BigDecimal.RoundingMode.Value]) extends AvroScalaDecimalType

sealed trait AvroScalaDateType extends Product with Serializable
case object JavaSqlDate extends AvroScalaDateType
case object JavaTimeLocalDate extends AvroScalaDateType

sealed trait AvroScalaTimestampMillisType extends Product with Serializable
case object JavaSqlTimestamp extends AvroScalaTimestampMillisType
case object JavaTimeInstant extends AvroScalaTimestampMillisType

sealed trait AvroUuidType extends Product with Serializable
case object JavaUuid extends AvroUuidType

sealed abstract class LogicalType(name: String)
case class Decimal(precision: Int, scale: Int) extends LogicalType("decimal")
case object Date extends LogicalType("date")
case object TimestampMillis extends LogicalType("timestamp-millis")
case object UUID extends LogicalType("uuid")

object LogicalType {
  
  def apply(logicalType: org.apache.avro.LogicalType): Option[LogicalType] = logicalType match {
    case d: org.apache.avro.LogicalTypes.Decimal => Some(Decimal(d.getPrecision, d.getScale))
    case _: org.apache.avro.LogicalTypes.Date => Some(Date)
    case _: org.apache.avro.LogicalTypes.TimestampMillis => Some(TimestampMillis)
    case _ if logicalType.getName == "uuid" => Some(UUID)
    case _ => None
  }
  
  def foldLogicalTypes[A](schema: Schema, default: => A)(cases : PartialFunction[LogicalType, A]): A =
    Option(schema.getLogicalType) match {
      case Some(tpe) => LogicalType(tpe).flatMap(cases.lift(_)).getOrElse(default)
      case _ => default
    }

} 
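A short sketch of foldLogicalTypes against a bytes schema that carries Avro's decimal logical type; a schema without a logical type falls through to the default.

import org.apache.avro.{LogicalTypes, Schema}
import avrohugger.types.{Decimal, LogicalType}

object FoldLogicalTypesSketch extends App {
  val plainBytes: Schema   = Schema.create(Schema.Type.BYTES)
  val decimalBytes: Schema = LogicalTypes.decimal(9, 2).addToSchema(Schema.create(Schema.Type.BYTES))

  def describe(schema: Schema): String =
    LogicalType.foldLogicalTypes[String](schema, default = "plain bytes") {
      case Decimal(precision, scale) => s"decimal($precision,$scale)"
    }

  println(describe(plainBytes))    // plain bytes
  println(describe(decimalBytes))  // decimal(9,2)
}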
Example 155
Source File: SpecificImporter.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package specific

import avrohugger.format.abstractions.Importer
import avrohugger.input.DependencyInspector._
import avrohugger.input.NestedSchemaExtractor._
import avrohugger.matchers.TypeMatcher
import avrohugger.stores.SchemaStore

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Type.RECORD

import treehugger.forest._
import definitions._
import treehuggerDSL._

import scala.collection.JavaConverters._

object SpecificImporter extends Importer {

  def getImports(
    schemaOrProtocol: Either[Schema, Protocol],
    currentNamespace: Option[String],
    schemaStore: SchemaStore,
    typeMatcher: TypeMatcher): List[Import] = {
      
    val switchAnnotSymbol = RootClass.newClass("scala.annotation.switch")
    val switchImport = IMPORT(switchAnnotSymbol)
    val topLevelSchemas =
      getTopLevelSchemas(schemaOrProtocol, schemaStore, typeMatcher)
    val recordSchemas = getRecordSchemas(topLevelSchemas)
    val enumSchemas = getEnumSchemas(topLevelSchemas)
    val deps = getUserDefinedImports(recordSchemas ++ enumSchemas, currentNamespace, typeMatcher)
    
    schemaOrProtocol match {
      case Left(schema) => {
        if (schema.getType == RECORD) switchImport :: deps
        else deps
      }
      case Right(protocol) => {
        val types = protocol.getTypes.asScala.toList
        val messages = protocol.getMessages.asScala.toMap
        if (messages.isEmpty) switchImport :: deps // for ADT
        else List.empty // for RPC
      }
    }
  }

} 
Example 156
Source File: SpecificSchemahugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package specific
package avrohuggers

import format.abstractions.avrohuggers.Schemahugger
import trees.{ SpecificCaseClassTree, SpecificObjectTree }
import matchers.TypeMatcher
import stores.{ClassStore, SchemaStore}

import org.apache.avro.Schema

import treehugger.forest.Tree

object SpecificSchemahugger extends Schemahugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    schema: Schema,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree] = {

    val caseClassDef = SpecificCaseClassTree.toCaseClassDef(
      classStore,
      namespace,
      schema,
      typeMatcher,
      maybeBaseTrait,
      maybeFlags,
      restrictedFields)

    val companionDef = SpecificObjectTree.toCaseCompanionDef(
      schema,
      maybeFlags,
      schemaStore,
      typeMatcher)

    List(caseClassDef, companionDef)
  }

} 
Example 157
Source File: SpecificProtocolhugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package specific
package avrohuggers

import format.abstractions.avrohuggers.Protocolhugger
import generators.ScalaDocGenerator
import trees.{ SpecificObjectTree, SpecificTraitTree }
import matchers.TypeMatcher
import stores.{ClassStore, SchemaStore}
import types.ScalaADT

import org.apache.avro.{ Protocol, Schema }

import treehugger.forest._
import definitions._
import treehuggerDSL._

import scala.collection.JavaConverters._

object SpecificProtocolhugger extends Protocolhugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    protocol: Protocol,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree] = {

    val name: String = protocol.getName
    val messages = protocol.getMessages.asScala.toMap
    val maybeProtocolDoc = Option(protocol.getDoc)

    if (messages.isEmpty) {
      val localSubTypes = getLocalSubtypes(protocol)
      // protocols with more than 1 schema defined (Java Enums don't count) and
      // without messages are generated as ADTs
      val localNonEnums = localSubTypes.filterNot(isEnum)

      if (localNonEnums.length > 1 && typeMatcher.avroScalaTypes.protocol == types.ScalaADT) {
        val maybeNewBaseTrait = Some(name)
        val maybeNewFlags = Some(List(Flags.FINAL.toLong))
        val sealedTraitDef = SpecificTraitTree.toADTRootDef(protocol)
        val subTypeDefs = localNonEnums.flatMap(schema => {
          SpecificSchemahugger.toTrees(
            schemaStore,
            classStore,
            namespace,
            schema,
            typeMatcher,
            maybeNewBaseTrait,
            maybeNewFlags,
            restrictedFields)
        })
        sealedTraitDef +: subTypeDefs
      }
      // if only one Scala type is defined, then don't generate sealed trait
      else {
        // no sealed trait tree, but could still need a protocol doc at top
        val docTrees = {
          Option(protocol.getDoc) match {
            case Some(doc) =>
              List(ScalaDocGenerator.docToScalaDoc(Right(protocol), EmptyTree))
            case None => List.empty
          }
        }
        docTrees ::: localNonEnums.flatMap(schema => {
          SpecificSchemahugger.toTrees(
            schemaStore,
            classStore,
            namespace,
            schema,
            typeMatcher,
            maybeBaseTrait,
            maybeFlags,
            restrictedFields)
          })
        }
    }
    else {
      val rpcTraitDef = SpecificTraitTree.toRPCTraitDef(
        classStore,
        namespace,
        protocol,
        typeMatcher)
      val companionDef = SpecificObjectTree.toTraitCompanionDef(protocol)
      List(rpcTraitDef, companionDef)
    }
  }

} 
Example 158
Source File: SpecificObjectTree.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package specific
package trees

import generators.ScalaDocGenerator
import matchers.TypeMatcher
import stores.SchemaStore
import org.apache.avro.{Protocol, Schema}
import treehugger.forest._
import definitions._
import org.apache.avro.Schema.Type
import treehuggerDSL._

import scala.collection.JavaConverters._

// only companions, so no doc generation is required here
object SpecificObjectTree {

  // Companion to case classes
  def toCaseCompanionDef(
    schema: Schema,
    maybeFlags: Option[List[Long]],
    schemaStore: SchemaStore,
    typeMatcher: TypeMatcher) = {
    val ParserClass = RootClass.newClass("org.apache.avro.Schema.Parser")
    val objectDef = maybeFlags match {
      case Some(flags) => OBJECTDEF(schema.getName).withFlags(flags:_*)
      case None => OBJECTDEF(schema.getName)
    }
    val schemaDef = VAL(REF("SCHEMA$")) := {
      (NEW(ParserClass)) APPLY(Nil) DOT "parse" APPLY(LIT(schema.toString))
    }
    val DecimalConversion = RootClass.newClass("org.apache.avro.Conversions.DecimalConversion")
    val decimalConversionDef = VAL(REF("decimalConversion")) := NEW(DecimalConversion)
    def schemaContainsDecimal(schema: Schema): Boolean = {
      def getNestedSchemas(s: Schema): List[Schema] = s.getType match {
        case Schema.Type.ARRAY => getNestedSchemas(s.getElementType)
        case Schema.Type.MAP => getNestedSchemas(s.getValueType)
        case Schema.Type.UNION => s.getTypes.asScala.toList.flatMap(getNestedSchemas)
        case _ => List(s)
      }
      val topLevelSchemas = SpecificImporter.getTopLevelSchemas(Left(schema), schemaStore, typeMatcher)
      val recordSchemas = SpecificImporter.getRecordSchemas(topLevelSchemas).filter(s => s.getType == Schema.Type.RECORD)
      val fieldSchemas = recordSchemas.flatMap(_.getFields.asScala.map(_.schema()))
      fieldSchemas.flatMap(getNestedSchemas).exists(s => Option(s.getLogicalType()) match {
        case Some(logicalType) => logicalType.getName == "decimal"
        case None => false
      })
    }
    // companion object definition
    if (schemaContainsDecimal(schema)) objectDef := BLOCK(schemaDef, decimalConversionDef)
    else objectDef := BLOCK(schemaDef)
  }

  // union acts as a blackbox, fields are not seen on root level, unpack is required
  private def collectUnionFields(sc: Schema): Iterable[Schema] = {
    sc.getTypes.asScala.toList
  }
  
  // Companion to traits that have messages
  def toTraitCompanionDef(protocol: Protocol) = {
    val ProtocolClass = RootClass.newClass("org.apache.avro.Protocol")
    // companion object definition
    OBJECTDEF(protocol.getName) := BLOCK(
      VAL("PROTOCOL", ProtocolClass).withFlags(Flags.FINAL) := {
        REF(ProtocolClass) DOT "parse" APPLY(LIT(protocol.toString))
      }
    )
  }
  
} 
Example 159
Source File: SpecificScalaTreehugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package specific

import format.abstractions.ScalaTreehugger
import avrohuggers.{ SpecificProtocolhugger, SpecificSchemahugger }
import matchers.TypeMatcher
import stores.{ ClassStore, SchemaStore }

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Field
import org.apache.avro.Schema.Type.RECORD

import treehugger.forest._
import definitions._
import treehuggerDSL._


object SpecificScalaTreehugger extends ScalaTreehugger {

  val schemahugger = SpecificSchemahugger
  val protocolhugger = SpecificProtocolhugger
  val importer = SpecificImporter

  // SpecificCompiler can't return a tree for Java enums, so return
  // a String here for a consistent api vis a vis *ToFile and *ToStrings
  def asScalaCodeString(
    classStore: ClassStore,
    namespace: Option[String],
    schemaOrProtocol: Either[Schema, Protocol],
    typeMatcher: TypeMatcher,
    schemaStore: SchemaStore,
    restrictedFields: Boolean): String = {

    // imports in case a field type is from a different namespace
    val imports: List[Import] = importer.getImports(
      schemaOrProtocol,
      namespace,
      schemaStore,
      typeMatcher)

    val topLevelDefs: List[Tree] = schemaOrProtocol match {
      case Left(schema) => schemahugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        schema,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
      case Right(protocol) => protocolhugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        protocol,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
    }

    // wrap the definitions in a block with a comment and a package
    val tree = {
      val blockContent = imports ++ topLevelDefs
      if (namespace.isDefined) BLOCK(blockContent:_*).inPackage(namespace.get)
      else BLOCK(blockContent:_*).withoutPackage
    }.withDoc("MACHINE-GENERATED FROM AVRO SCHEMA. DO NOT EDIT DIRECTLY")

    val codeString = treeToString(tree)
    codeString
  }

} 
Example 160
Source File: Protocolhugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package abstractions
package avrohuggers

import stores.{ClassStore, SchemaStore}
import matchers.TypeMatcher

import org.apache.avro.{ Protocol, Schema }

import treehugger.forest.Tree

import scala.collection.JavaConverters._

trait Protocolhugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    protocol: Protocol,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree]


  def getLocalSubtypes(protocol: Protocol): List[Schema] = {
    val protocolNS = protocol.getNamespace
    val types = protocol.getTypes.asScala.toList
    def isTopLevelNamespace(schema: Schema) = schema.getNamespace == protocolNS
    types.filter(isTopLevelNamespace)
  }

  def isEnum(schema: Schema) = schema.getType == Schema.Type.ENUM


} 
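A self-contained sketch of the namespace filter that getLocalSubtypes applies, run against a hand-written protocol JSON rather than any of the projects above.

import org.apache.avro.Protocol
import scala.collection.JavaConverters._

object LocalSubtypesSketch extends App {
  val protocol = Protocol.parse(
    """{"protocol": "Shapes", "namespace": "example",
      |  "types": [
      |    {"type": "record", "name": "Circle", "fields": [{"name": "r", "type": "double"}]},
      |    {"type": "enum", "name": "Color", "symbols": ["RED", "BLUE"]}
      |  ],
      |  "messages": {}}""".stripMargin)

  // Same filter as getLocalSubtypes: keep only types declared in the protocol's namespace.
  val local = protocol.getTypes.asScala.toList.filter(_.getNamespace == protocol.getNamespace)
  local.foreach(s => println(s"${s.getFullName} enum=${s.getType == org.apache.avro.Schema.Type.ENUM}"))
}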
Example 161
Source File: Schemahugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package abstractions
package avrohuggers

import stores.{ClassStore, SchemaStore}
import matchers.TypeMatcher

import org.apache.avro.Schema

import treehugger.forest.Tree

trait Schemahugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    schema: Schema,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree]

} 
Example 162
Source File: StandardScalaTreehugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard

import format.abstractions.ScalaTreehugger
import avrohuggers.{ StandardProtocolhugger, StandardSchemahugger }
import matchers.TypeMatcher
import stores.{ ClassStore, SchemaStore }

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Field
import org.apache.avro.Schema.Type.{ RECORD }

import treehugger.forest._
import definitions._
import treehuggerDSL._


object StandardScalaTreehugger extends ScalaTreehugger {

  val schemahugger = StandardSchemahugger
  val protocolhugger = StandardProtocolhugger
  val importer = StandardImporter

  def asScalaCodeString(
		classStore: ClassStore,
    namespace: Option[String],
    schemaOrProtocol: Either[Schema, Protocol],
    typeMatcher: TypeMatcher,
    schemaStore: SchemaStore,
    restrictedFields: Boolean): String = {

    val imports = importer.getImports(
      schemaOrProtocol, namespace, schemaStore, typeMatcher)

    val topLevelDefs: List[Tree] = schemaOrProtocol match {
      case Left(schema) => schemahugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        schema,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
      case Right(protocol) => protocolhugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        protocol,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
    }

    // wrap the imports and class definition in a block with comment and package
    val tree = {
      val blockContent = imports ++ topLevelDefs
      if (namespace.isDefined) BLOCK(blockContent).inPackage(namespace.get)
      else BLOCK(blockContent:_*).withoutPackage
    }.withDoc("MACHINE-GENERATED FROM AVRO SCHEMA. DO NOT EDIT DIRECTLY")
    // SpecificCompiler can't return a tree for Java enums, so return
    // a string here for a consistent api vis a vis *ToFile and *ToStrings
    treeToString(tree)
  }

} 
Example 163
Source File: StandardProtocolhugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard
package avrohuggers

import generators.ScalaDocGenerator
import trees.StandardTraitTree
import matchers.TypeMatcher
import stores.{ClassStore, SchemaStore}
import types._
import org.apache.avro.{ Protocol, Schema }

import treehugger.forest._
import definitions._
import treehuggerDSL._

import format.abstractions.avrohuggers.Protocolhugger


object StandardProtocolhugger extends Protocolhugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    protocol: Protocol,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree] = {

    val name: String = protocol.getName

    val localSubTypes = getLocalSubtypes(protocol)

    val adtSubTypes = typeMatcher.avroScalaTypes.enum match {
      case JavaEnum => localSubTypes.filterNot(isEnum)
      case ScalaCaseObjectEnum => localSubTypes
      case ScalaEnumeration => localSubTypes
      case EnumAsScalaString => localSubTypes.filterNot(isEnum)
    }

    if (adtSubTypes.length > 1 && typeMatcher.avroScalaTypes.protocol == types.ScalaADT) {
      val maybeNewBaseTrait = Some(name)
      val maybeNewFlags = Some(List(Flags.FINAL.toLong))
      val traitDef = StandardTraitTree.toADTRootDef(protocol, typeMatcher)
      traitDef +: adtSubTypes.flatMap(schema => {
        StandardSchemahugger.toTrees(
          schemaStore,
          classStore,
          namespace,
          schema,
          typeMatcher,
          maybeNewBaseTrait,
          maybeNewFlags,
          restrictedFields)
      })
    }
    // if only one Scala type is defined, then don't generate sealed trait
    else {
      // no sealed trait tree, but could still need a top-level doc
      val docTrees = {
        Option(protocol.getDoc) match {
          case Some(doc) =>
            List(ScalaDocGenerator.docToScalaDoc(Right(protocol), EmptyTree))
          case None => List.empty
        }
      }
      docTrees ::: localSubTypes.flatMap(schema => {
        StandardSchemahugger.toTrees(
          schemaStore,
          classStore,
          namespace,
          schema,
          typeMatcher,
          maybeBaseTrait,
          maybeFlags,
          restrictedFields)
      })
    }
  }

} 
Example 164
Source File: StandardSchemahugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard
package avrohuggers

import format.abstractions.avrohuggers.Schemahugger
import trees.{ StandardCaseClassTree, StandardObjectTree, StandardTraitTree }
import matchers.TypeMatcher
import stores.{ClassStore, SchemaStore}
import types._

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Type.{ ENUM, RECORD }

import treehugger.forest._
import definitions._
import treehuggerDSL._

object StandardSchemahugger extends Schemahugger {

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    schema: Schema,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree] = { // as case class definition

    schema.getType match {
      case RECORD =>
        val classDef = StandardCaseClassTree.toCaseClassDef(
          classStore,
          namespace,
          schema,
          typeMatcher,
          maybeBaseTrait,
          maybeFlags,
          restrictedFields)
        val companionDef = StandardObjectTree.toCaseCompanionDef(
          schema,
          maybeFlags)
        typeMatcher.avroScalaTypes.record match {
          case ScalaCaseClass => List(classDef)
          case ScalaCaseClassWithSchema => List(classDef, companionDef)
        }
      case ENUM => typeMatcher.avroScalaTypes.enum match {
        case JavaEnum =>
          List.empty
        case ScalaCaseObjectEnum =>
          StandardTraitTree.toCaseObjectEnumDef(schema, maybeBaseTrait)
        case ScalaEnumeration =>
          val objectDef = StandardObjectTree.toScalaEnumDef(
            classStore,
            schema,
            maybeBaseTrait,
            maybeFlags)
          List(objectDef)
        case EnumAsScalaString => List.empty
      }
      case _ => sys.error("Only RECORD or ENUM can be toplevel definitions")
    }
  }

} 
Example 165
Source File: StandardTraitTree.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard
package trees

import generators.ScalaDocGenerator
import matchers.TypeMatcher
import types._
import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Type.{ ENUM, RECORD }

import scala.collection.JavaConverters._

object StandardTraitTree {

  def toADTRootDef(protocol: Protocol, typeMatcher: TypeMatcher) = {
    def isEnum(schema: Schema) = schema.getType == ENUM
    val sealedTraitTree = TRAITDEF(protocol.getName).withFlags(Flags.SEALED)
    val adtRootTree = {
      val adtSubTypes = typeMatcher.avroScalaTypes.enum match {
        case JavaEnum => protocol.getTypes.asScala.toList.filterNot(isEnum)
        case ScalaCaseObjectEnum => protocol.getTypes.asScala.toList
        case ScalaEnumeration => protocol.getTypes.asScala.toList
        case EnumAsScalaString => protocol.getTypes.asScala.filterNot(isEnum)
      }
      if (adtSubTypes.forall(schema => schema.getType == RECORD)) {
        sealedTraitTree
          .withParents("Product")
          .withParents("Serializable")
      }
      else sealedTraitTree
    } 
    val treeWithScalaDoc = ScalaDocGenerator.docToScalaDoc(
      Right(protocol),
      adtRootTree)
      
    treeWithScalaDoc
  }
  
  def toCaseObjectEnumDef(schema: Schema,
    maybeBaseTrait: Option[String]): List[Tree] = {
    val adtRootTree: Tree = maybeBaseTrait match {
      case Some(baseTrait) =>
        TRAITDEF(schema.getName).withFlags(Flags.SEALED).withParents(baseTrait)
      case None =>
        TRAITDEF(schema.getName).withFlags(Flags.SEALED)
    }
    val adtSubTypes: List[Tree] = schema.getEnumSymbols.asScala
      .map(enumSymbol => enumSymbol.toString)
      .map(enumSymbolString => {
        (CASEOBJECTDEF(enumSymbolString).withParents(schema.getName): Tree)
      }).toList
    val objectTree = OBJECTDEF(schema.getName) := Block(adtSubTypes:_*)
    val adtRootTreeWithScalaDoc: Tree = ScalaDocGenerator.docToScalaDoc(
      Left(schema),
      adtRootTree)
    List(adtRootTreeWithScalaDoc, objectTree)
  }
} 
Example 166
Source File: StandardObjectTree.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard
package trees

import generators.ScalaDocGenerator
import stores.ClassStore

import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.Schema

import scala.collection.JavaConverters._

object StandardObjectTree {
  
  def toCaseCompanionDef(schema: Schema, maybeFlags: Option[List[Long]]) = {
    val ParserClass = RootClass.newClass("org.apache.avro.Schema.Parser")
    val objectDef = maybeFlags match {
      case Some(flags) => OBJECTDEF(schema.getName).withFlags(flags:_*)
      case None => OBJECTDEF(schema.getName)
    }
    // companion object definition
    objectDef := BLOCK(
      VAL(REF("SCHEMA$")) := {
        (NEW(ParserClass)) APPLY(Nil) DOT "parse" APPLY(LIT(schema.toString))
      }
    )
  }

  def toScalaEnumDef(
    classStore: ClassStore, 
    schema: Schema,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]]) = {
      
    val objectDef = (maybeBaseTrait, maybeFlags) match {
      case (Some(baseTrait), Some(flags)) => 
        OBJECTDEF(schema.getName)
          .withFlags(flags:_*)
          .withParents("Enumeration")
          .withParents(baseTrait) 
      case (Some(baseTrait), None) =>
        OBJECTDEF(schema.getName)
          .withParents("Enumeration")
          .withParents(baseTrait)
      case (None, Some(flags)) => 
        OBJECTDEF(schema.getName)
          .withFlags(flags:_*)
          .withParents("Enumeration")
      case (None, None) =>
        OBJECTDEF(schema.getName)
          .withParents("Enumeration")
    }
    
    val objectTree = objectDef := BLOCK(
      TYPEVAR(schema.getName) := REF("Value"),
      VAL(schema.getEnumSymbols.asScala.mkString(", ")) := REF("Value")
    )

    val treeWithScalaDoc = ScalaDocGenerator.docToScalaDoc(
      Left(schema),
      objectTree)
      
    treeWithScalaDoc
  }
  
  
  
} 
Example 167
Source File: StandardJavaTreehugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package standard

import stores.ClassStore

import format.abstractions.JavaTreehugger
import org.apache.avro.Schema
import scala.collection.JavaConverters._

object StandardJavaTreehugger extends JavaTreehugger {
  
  
  val wrapRegEx = """(.{1,75})\s""".r
  def wrapLine(s: String) = wrapRegEx.replaceAllIn(s, m=>m.group(1)+"\n * ")
  def javaDoc(docString: String): String = s"/**\n * ${wrapLine(docString)}\n */"

  def asJavaCodeString(
    classStore: ClassStore,
    namespace: Option[String],
    schema: Schema): String = {
    schema.getType match {
      case Schema.Type.ENUM =>
        s"""
           |${namespace.orElse(Option(schema.getNamespace)).fold("")(n => s"package $n;")}
           |
           |${Option(schema.getDoc).fold("")(javaDoc)}
           |public enum ${schema.getName} {
           |  ${schema.getEnumSymbols.asScala.mkString(", ")}  ;
           |}""".stripMargin
      case _ => sys.error("Currently ENUM is the only supported Java type.")
    }
  }

} 
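A self-contained sketch of the enum-string interpolation used above (Javadoc handling omitted), printed for a hand-written enum schema.

import org.apache.avro.Schema
import scala.collection.JavaConverters._

object EnumCodegenSketch extends App {
  val schema = new Schema.Parser().parse(
    """{"type": "enum", "name": "Suit", "namespace": "example.card",
      |  "symbols": ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"]}""".stripMargin)

  // Mirrors the ENUM branch above: package line, enum name, comma-separated symbols.
  val javaEnum =
    s"""
       |${Option(schema.getNamespace).fold("")(n => s"package $n;")}
       |
       |public enum ${schema.getName} {
       |  ${schema.getEnumSymbols.asScala.mkString(", ")}  ;
       |}""".stripMargin

  println(javaEnum)
}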
Example 168
Source File: ScavroNamespaceRenamer.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro

import avrohugger.matchers.TypeMatcher
import avrohugger.matchers.custom.CustomNamespaceMatcher
import org.apache.avro.{ Schema, Protocol }

object ScavroNamespaceRenamer {
  // By default, Scavro generates Scala classes in packages that are the same
  // as the Java package with `model` appended.
  // TypeMatcher is here because it holds the custom namespace map
  def renameNamespace(
    maybeNamespace: Option[String],
    schemaOrProtocol: Either[Schema, Protocol],
    typeMatcher: TypeMatcher): Option[String] = {

    val scavroModelDefaultPackage: String =
      typeMatcher.customNamespaces
        .get("SCAVRO_DEFAULT_PACKAGE$")
        .getOrElse("model")
        
    val someScavroModelDefaultNamespace = maybeNamespace match {
      case Some(ns) => Some(ns + "." + scavroModelDefaultPackage)
      case None => sys.error("Scavro requires a namespace because Java " +
        "classes cannot be imported from the default package")
    }
    val scavroModelNamespace = {
      val ns = schemaOrProtocol match {
        case Left(schema) => Option(schema.getNamespace)
        case Right(protocol) => Option(protocol.getNamespace)
      }
      ns match {
        case Some(schemaNS) => {
          CustomNamespaceMatcher.checkCustomNamespace(
            ns,
            typeMatcher,
            maybeDefaultNamespace = someScavroModelDefaultNamespace)
        }
        case None => someScavroModelDefaultNamespace
      }
    }

    scavroModelNamespace
  }

} 
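
Usage sketch (not from the avrohugger sources) of the default renaming described in the comment above; constructing TypeMatcher with its defaults is an assumption here.

import avrohugger.matchers.TypeMatcher
import org.apache.avro.Schema

object NamespaceRenameDemo extends App {
  // assumption: TypeMatcher has a default constructor with an empty custom-namespace map
  val typeMatcher = TypeMatcher()
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"User","namespace":"com.example","fields":[]}""")

  // with no "SCAVRO_DEFAULT_PACKAGE$" override, com.example should map to com.example.model
  println(avrohugger.format.scavro.ScavroNamespaceRenamer
    .renameNamespace(Some("com.example"), Left(schema), typeMatcher))
}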
Example 169
Source File: ScavroSchemahugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro
package avrohuggers

import format.abstractions.avrohuggers.Schemahugger
import trees.{ ScavroCaseClassTree, ScavroObjectTree, ScavroTraitTree }
import matchers.TypeMatcher
import stores.{ClassStore, SchemaStore}
import types._

import org.apache.avro.Schema
import org.apache.avro.Schema.Type.{ ENUM, RECORD }

import treehugger.forest._
import definitions._
import treehuggerDSL._

object ScavroSchemahugger extends Schemahugger{

  def toTrees(
    schemaStore: SchemaStore,
    classStore: ClassStore,
    namespace: Option[String],
    schema: Schema,
    typeMatcher: TypeMatcher,
    maybeBaseTrait: Option[String],
    maybeFlags: Option[List[Long]],
    restrictedFields: Boolean): List[Tree] = {
    val ScalaClass = RootClass.newClass(schema.getName)
    val JavaClass = RootClass.newClass("J" + schema.getName)
    schema.getType match {
      case RECORD =>
        val caseClassDef = ScavroCaseClassTree.toCaseClassDef(
          classStore,
          namespace,
          schema,
          ScalaClass,
          JavaClass,
          typeMatcher,
          maybeBaseTrait,
          maybeFlags,
          restrictedFields)
        val companionDef = ScavroObjectTree.toCompanionDef(
          classStore,
          schema,
          ScalaClass,
          JavaClass,
          typeMatcher,
          maybeFlags)
        List(caseClassDef, companionDef)
      case ENUM => typeMatcher.avroScalaTypes.enum match {
        case JavaEnum =>
          List.empty
        case ScalaCaseObjectEnum =>
          ScavroTraitTree.toCaseObjectEnumDef(schema, maybeBaseTrait)
        case ScalaEnumeration =>
          val objectDef = ScavroObjectTree.toScalaEnumDef(
            classStore,
            schema,
            maybeBaseTrait,
            maybeFlags)
          List(objectDef)
        case EnumAsScalaString => List.empty
      }
      case _ =>
        sys.error("Only RECORD and ENUM can be top-level definitions")
    }
  }

} 
Example 170
Source File: ScavroMethodRenamer.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro

import org.apache.avro.Schema
import org.apache.avro.Schema.Field
import org.apache.avro.compiler.specific.SpecificCompiler
import org.apache.avro.specific.SpecificData

import scala.collection.JavaConverters._

object ScavroMethodRenamer {

  
  // Mirrors Avro's SpecificCompiler accessor naming so Scavro's Scala wrappers
  // call the same getters as the Avro-generated Java classes.
  def generateMethodName(
    schema: Schema,
    field: Field,
    prefix: String,
    postfix: String): String = {

    // reserved words that clash with generated accessor/mutator names
    // (set contents follow Avro's SpecificCompiler)
    val ACCESSOR_MUTATOR_RESERVED_WORDS: java.util.Set[String] =
      Set("class", "schema", "classSchema").asJava

    val ERROR_RESERVED_WORDS: java.util.Set[String] =
      (Set("message", "cause") ++ ACCESSOR_MUTATOR_RESERVED_WORDS.asScala).asJava

    // Check for the special case in which the schema defines two fields whose
    // names are identical except for the case of the first character:
    val firstChar: Char = field.name().charAt(0)
    val conflictingFieldName: String = (if (Character.isLowerCase(firstChar))
        Character.toUpperCase(firstChar) else Character.toLowerCase(firstChar)) +
        (if (field.name().length() > 1) field.name().substring(1) else "")
    val fieldNameConflict: Boolean = Option(schema.getField(conflictingFieldName)).isDefined

    val methodBuilder: StringBuilder = new StringBuilder(prefix)
    val fieldName: String =
      SpecificCompiler.mangle(
        field.name(),
        if(schema.isError()) ERROR_RESERVED_WORDS else ACCESSOR_MUTATOR_RESERVED_WORDS,
        true)

    var nextCharToUpper: Boolean = true
    (0 until fieldName.length).map(ii => {
      if (fieldName.charAt(ii) == '_') {
        nextCharToUpper = true
      }
      else if (nextCharToUpper) {
        methodBuilder.append(Character.toUpperCase(fieldName.charAt(ii)))
        nextCharToUpper = false
      }
      else {
        methodBuilder.append(fieldName.charAt(ii))
      }
    })
    methodBuilder.append(postfix)

    // If there is a field name conflict append $0 or $1
    if (fieldNameConflict) {
      if (methodBuilder.charAt(methodBuilder.length() - 1) != '$') {
        methodBuilder.append('$')
      }
      methodBuilder.append(if(Character.isLowerCase(firstChar))'0' else '1')
    }

    methodBuilder.toString()
  }
  

  
} 
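
Worked sketch (not from the avrohugger sources) of the case-conflict handling above, assuming the generateMethodName signature shown in this listing.

import org.apache.avro.Schema

object MethodRenameDemo extends App {
  // two fields differing only in the case of the first character trigger the $0/$1 suffix
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Clash","fields":[
      |  {"name":"value","type":"int"},{"name":"Value","type":"int"}]}""".stripMargin)

  // expected: getValue$0 (a lower-case first character gets the '0' suffix)
  println(avrohugger.format.scavro.ScavroMethodRenamer
    .generateMethodName(schema, schema.getField("value"), "get", ""))
}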
Example 171
Source File: ScavroTraitTree.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro
package trees

import generators.ScalaDocGenerator
import matchers.TypeMatcher
import types._

import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.Schema.Type.{ ENUM, RECORD }
import org.apache.avro.{ Protocol, Schema }

import scala.collection.JavaConverters._


object ScavroTraitTree {

  def toADTRootDef(protocol: Protocol, typeMatcher: TypeMatcher) = {
    def isEnum(schema: Schema) = schema.getType == ENUM
    val sealedTraitTree =  TRAITDEF(protocol.getName).withFlags(Flags.SEALED)
    val adtRootTree = {
      val adtSubTypes = typeMatcher.avroScalaTypes.enum match {
        case JavaEnum => protocol.getTypes.asScala.toList.filterNot(isEnum)
        case ScalaCaseObjectEnum => protocol.getTypes.asScala.toList
        case ScalaEnumeration => protocol.getTypes.asScala.toList
        case EnumAsScalaString => protocol.getTypes.asScala.filterNot(isEnum)
      }
      if (adtSubTypes.forall(schema => schema.getType == RECORD)) {
        sealedTraitTree
          .withParents("AvroSerializeable")
          .withParents("Product")
          .withParents("Serializable")
      }
      else sealedTraitTree
    }
    val treeWithScalaDoc = ScalaDocGenerator.docToScalaDoc(
      Right(protocol),
      adtRootTree)
    
    treeWithScalaDoc
  }
  
  def toCaseObjectEnumDef(schema: Schema,
    maybeBaseTrait: Option[String]): List[Tree] = {
    val adtRootTree: Tree = maybeBaseTrait match {
      case Some(baseTrait) =>
        TRAITDEF(schema.getName).withFlags(Flags.SEALED).withParents(baseTrait)
      case None =>
        TRAITDEF(schema.getName).withFlags(Flags.SEALED)
    }
    val adtSubTypes: List[Tree] = schema.getEnumSymbols.asScala
      .map(enumSymbol => enumSymbol.toString)
      .map(enumSymbolString => {
        (CASEOBJECTDEF(enumSymbolString).withParents(schema.getName): Tree)
      }).toList
    val objectTree = OBJECTDEF(schema.getName) := Block(adtSubTypes:_*)
    val adtRootTreeWithScalaDoc: Tree = ScalaDocGenerator.docToScalaDoc(
      Left(schema),
      adtRootTree)
    List(adtRootTreeWithScalaDoc, objectTree)
  }
  
  
} 
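
Usage sketch (not from the avrohugger sources): generating the case-object enum trees above for a hypothetical ENUM schema.

import org.apache.avro.Schema
import treehugger.forest._

object CaseObjectEnumDemo extends App {
  // hypothetical ENUM schema, used only for illustration
  val schema = new Schema.Parser().parse(
    """{"type":"enum","name":"Suit","namespace":"example",
      |"symbols":["SPADES","DIAMONDS","CLUBS","HEARTS"]}""".stripMargin)

  // yields a sealed trait Suit plus a companion holding one case object per symbol
  avrohugger.format.scavro.trees.ScavroTraitTree
    .toCaseObjectEnumDef(schema, maybeBaseTrait = None)
    .foreach(tree => println(treeToString(tree)))
}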
Example 172
Source File: ScalaConverter.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro
package converters

import matchers.TypeMatcher
import types._

import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.Schema

import scala.language.postfixOps
import scala.collection.JavaConverters._

class ScalaConverter(typeMatcher: TypeMatcher) {

  def convertFromJava(
    schema: Schema,
    tree: Tree,
    fieldPath: List[String] = List.empty): Tree = {

    schema.getType match {
      case Schema.Type.ENUM  => typeMatcher.avroScalaTypes.enum match {
        case EnumAsScalaString => tree TOSTRING
        case JavaEnum | ScalaEnumeration | ScalaCaseObjectEnum => {
          val conversionCases = schema.getEnumSymbols.asScala.map(enumSymbol => {
            CASE(REF("J" + schema.getName) DOT(enumSymbol)) ==> (REF(schema.getName) DOT(enumSymbol))
          })
          tree MATCH(conversionCases)
        }
      }
      case Schema.Type.RECORD => {
        REF(schema.getName).DOT("metadata").DOT("fromAvro").APPLY(tree)
      }
      case Schema.Type.UNION  => {
        val types = schema.getTypes.asScala
        // check if it's the kind of union that we support (i.e. nullable fields)
        if (types.length != 2 ||
           !types.map(x => x.getType).contains(Schema.Type.NULL) ||
            types.filterNot(x => x.getType == Schema.Type.NULL).length != 1) {
              sys.error("Unions beyond nullable fields are not supported")
        }
        // the union represents a nullable field, the kind of union supported in avrohugger
        else {
          val typeParamSchema = types.find(x => x.getType != Schema.Type.NULL).get
          val nullConversion = CASE(NULL) ==> NONE
          val someConversion = CASE(WILDCARD) ==> SOME(convertFromJava(typeParamSchema, tree, fieldPath))
          val conversionCases = List(nullConversion, someConversion)
          tree MATCH(conversionCases:_*)
        }
      }
      case Schema.Type.NULL => NULL
      case Schema.Type.STRING => tree TOSTRING
      case Schema.Type.INT => tree DOT "toInt"
      case Schema.Type.FLOAT => tree DOT "toFloat"
      case Schema.Type.DOUBLE => tree DOT "toDouble"
      case Schema.Type.LONG => tree DOT "toLong"

      case Schema.Type.ARRAY => {
        val seqArgs = SEQARG(tree DOT "asScala")
        val collection = typeMatcher.avroScalaTypes.array match {
          case ScalaArray  => ARRAY(seqArgs)
          case ScalaList   => LIST(seqArgs)
          case ScalaSeq   => SEQ(seqArgs)
          case ScalaVector => VECTOR(seqArgs)
        }
        collection MAP(LAMBDA(PARAM("x")) ==> BLOCK(convertFromJava(schema.getElementType, REF("x"), fieldPath)))
      }
      case Schema.Type.MAP => {
        val JavaMap = RootClass.newClass("java.util.Map[_,_]")
        val resultExpr = {
          BLOCK(
            REF("scala.collection.JavaConverters.mapAsScalaMapConverter")
            .APPLY(REF("map"))
            .DOT("asScala")
            .DOT("toMap")
            .MAP(LAMBDA(PARAM("kvp")) ==> BLOCK(
              VAL("key") := REF("kvp._1").DOT("toString"),
              VAL("value") := REF("kvp._2"),
              PAREN(REF("key"), convertFromJava(schema.getValueType, REF("value"), fieldPath)))
            )
          )
        }
        val mapConversion = CASE(ID("map") withType(JavaMap)) ==> resultExpr
        tree MATCH(mapConversion)
      }
      case Schema.Type.FIXED    => sys.error("the FIXED datatype is not yet supported")
      case Schema.Type.BYTES    => {
        val JavaBuffer = RootClass.newClass("java.nio.ByteBuffer")
        tree MATCH CASE(ID("buffer") withType(JavaBuffer)) ==> Block(
          REF("buffer") DOT "array" APPLY(Nil)
        )
      }
      case _ => tree
    }
  }

} 
Example 173
Source File: ScavroScalaTreehugger.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package format
package scavro

import format.abstractions.ScalaTreehugger
import avrohuggers.{ ScavroProtocolhugger, ScavroSchemahugger }
import input.reflectivecompilation.schemagen._
import matchers.TypeMatcher
import stores.{ ClassStore, SchemaStore }

import org.apache.avro.{ Protocol, Schema }

import treehugger.forest._
import definitions._
import treehuggerDSL._

object ScavroScalaTreehugger extends ScalaTreehugger {

  val schemahugger = ScavroSchemahugger
  val protocolhugger = ScavroProtocolhugger
  val importer = ScavroImporter

  // SpecificCompiler can't return a tree for Java enums, so return
  // a String here for a consistent api vis a vis *ToFile and *ToStrings
  def asScalaCodeString(
    classStore: ClassStore,
    namespace: Option[String],
    schemaOrProtocol: Either[Schema, Protocol],
    typeMatcher: TypeMatcher,
    schemaStore: SchemaStore,
    restrictedFields: Boolean): String = {

    val imports: List[Import] = importer.getImports(
      schemaOrProtocol, namespace, schemaStore, typeMatcher)

    val topLevelDefs: List[Tree] = schemaOrProtocol match {
      case Left(schema) => schemahugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        schema,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
      case Right(protocol) => protocolhugger.toTrees(
        schemaStore,
        classStore,
        namespace,
        protocol,
        typeMatcher,
        None,
        None,
        restrictedFields
      )
    }
    // wrap the imports and classdef in a block with a comment and a package
    val tree = {
      val blockContent = imports ++ topLevelDefs
      if (namespace.isDefined) BLOCK(blockContent).inPackage(namespace.get)
      else BLOCK(blockContent:_*).withoutPackage
    }.withDoc("MACHINE-GENERATED FROM AVRO SCHEMA. DO NOT EDIT DIRECTLY")

    treeToString(tree)
  }

} 
Example 174
Source File: NestedSchemaExtractor.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input

import avrohugger.matchers.TypeMatcher
import stores.SchemaStore
import types.EnumAsScalaString

import org.apache.avro.Schema
import org.apache.avro.Schema.Type.{ARRAY, ENUM, MAP, RECORD, UNION}

import scala.collection.JavaConverters._

object NestedSchemaExtractor {
  // if a record is found, extract nested RECORDs and ENUMS (i.e. top-level types) 
  def getNestedSchemas(
    schema: Schema,
    schemaStore: SchemaStore,
    typeMatcher: TypeMatcher): List[Schema] = {
    def extract(
      schema: Schema,
      fieldPath: List[String] = List.empty): List[Schema] = {

      schema.getType match {
        case RECORD =>
          val fields: List[Schema.Field] = schema.getFields.asScala.toList
          val fieldSchemas: List[Schema] = fields.map(field => field.schema)
          def flattenSchema(fieldSchema: Schema): List[Schema] = {
            fieldSchema.getType match {
              case ARRAY => flattenSchema(fieldSchema.getElementType)
              case MAP => flattenSchema(fieldSchema.getValueType)
              case RECORD => {
                // if the field schema is one that has already been stored, use that one
                if (schemaStore.schemas.contains(fieldSchema.getFullName)) List()
                // if we've already seen this schema (recursive schemas) don't traverse further
                else if (fieldPath.contains(fieldSchema.getFullName)) List()
                else fieldSchema :: extract(fieldSchema, fieldSchema.getFullName :: fieldPath)
              }
              case UNION => fieldSchema.getTypes.asScala.toList.flatMap(x => flattenSchema(x))
              case ENUM => {
                // if the field schema is one that has already been stored, use that one
                if (schemaStore.schemas.contains(fieldSchema.getFullName)) List()
                else List(fieldSchema)
              }
              case _ => List(fieldSchema)
            }
          }
          val flatSchemas = fieldSchemas.flatMap(fieldSchema => flattenSchema(fieldSchema))
          def topLevelTypes(schema: Schema) = {
            if (typeMatcher.avroScalaTypes.enum == EnumAsScalaString) schema.getType == RECORD
            else (schema.getType == RECORD || schema.getType == ENUM)
          }
          val nestedTopLevelSchemas = flatSchemas.filter(topLevelTypes)
          nestedTopLevelSchemas
        case ENUM => List(schema)
        case _ => Nil
      } 
    }

    schema::extract(schema)
  }
} 
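
Usage sketch (not from the avrohugger sources): extracting the nested record from a hypothetical schema; the no-arg SchemaStore and default TypeMatcher constructors are assumptions here.

import avrohugger.input.NestedSchemaExtractor
import avrohugger.matchers.TypeMatcher
import avrohugger.stores.SchemaStore
import org.apache.avro.Schema

object NestedExtractDemo extends App {
  // assumptions: SchemaStore and TypeMatcher can be built with their defaults
  val schemaStore = new SchemaStore
  val typeMatcher = TypeMatcher()

  // a record nesting another record; both should be reported as top-level types
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Outer","namespace":"example","fields":[
      |  {"name":"inner","type":{"type":"record","name":"Inner","fields":[
      |    {"name":"id","type":"int"}]}}]}""".stripMargin)

  NestedSchemaExtractor.getNestedSchemas(schema, schemaStore, typeMatcher)
    .foreach(s => println(s.getFullName)) // example.Outer, example.Inner
}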
Example 175
Source File: DependencyInspector.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input

import org.apache.avro.Schema

import scala.collection.JavaConverters._


object DependencyInspector {
  import Schema.Type._
  def getReferredNamespace(schema: Schema): Option[String] = schema.getType match {
    case ARRAY =>
      getReferredNamespace(schema.getElementType)
    case UNION =>
      schema.getTypes.asScala.find( innerType => innerType.getType != NULL ) flatMap getReferredNamespace
    case MAP =>
      getReferredNamespace(schema.getValueType)
    case RECORD | ENUM =>
      Option(schema.getNamespace)
    case _ => None

  }

  def getReferredTypeName(schema: Schema): String = schema.getType match {
    case ARRAY =>
      getReferredTypeName(schema.getElementType)
    case UNION =>
      schema.getTypes.asScala.find( innerType => innerType.getType != NULL ).map( getReferredTypeName ).getOrElse("")
    case MAP =>
      getReferredTypeName(schema.getValueType)
    case _ =>
      schema.getName
  }

} 
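
Usage sketch (not from the avrohugger sources): inspecting the namespace and type name referred to by a nullable-union field of a hypothetical schema.

import avrohugger.input.DependencyInspector._
import org.apache.avro.Schema

object DependencyDemo extends App {
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Wrapper","namespace":"example","fields":[
      |  {"name":"payload","type":["null",
      |    {"type":"record","name":"Payload","namespace":"example.dep",
      |     "fields":[{"name":"id","type":"int"}]}]}]}""".stripMargin)

  val fieldSchema = schema.getField("payload").schema
  println(getReferredNamespace(fieldSchema)) // Some(example.dep)
  println(getReferredTypeName(fieldSchema))  // Payload
}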
Example 176
Source File: IdlImportParser.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input
package parsers

import org.apache.avro.{ Protocol, Schema }

import java.io.File
import scala.util.matching.Regex.Match

object IdlImportParser {
  
  def stripComments(fileContents: String): String = {
    val multiLinePattern = """/\*.*\*/""".r
    val singleLinePattern = """//.*$""".r
    val noSingleLines = singleLinePattern.replaceAllIn(fileContents, "")
    val commentFree = multiLinePattern.replaceAllIn(noSingleLines, "")
    commentFree
  }

  def getImportedFiles(infile: File, classLoader: ClassLoader): List[File] = {
    def readFile(file: File, attempt: Int = 1): String = {
      val maxTries = 3
      try {
        val source = scala.io.Source.fromFile(file)
        val fileContents: String = stripComments(source.mkString)
        source.close
        // if the file came back empty, retry: it was there when we read the idl
        if (fileContents.isEmpty && (attempt < maxTries)) readFile(file, attempt + 1)
        else fileContents
      } catch { // if the file is not found, retry: it was there when we read the idl
        case e: java.io.FileNotFoundException => {
          if (attempt < maxTries) readFile(file, attempt + 1)
          else sys.error("File not found: " + file)
        }
      }
    }
    val path = infile.getParent + "/"
    val contents = readFile(infile)
    val avdlPattern = """import[ \t]+idl[ \t]+"([^"]*\.avdl)"[ \t]*;""".r
    val avprPattern = """import[ \t]+protocol[ \t]+"([^"]*\.avpr)"[ \t]*;""".r
    val avscPattern = """import[ \t]+schema[ \t]+"([^"]*\.avsc)"[ \t]*;""".r
    val idlMatches = avdlPattern.findAllIn(contents).matchData.toList
    val protocolMatches = avprPattern.findAllIn(contents).matchData.toList
    val schemaMatches = avscPattern.findAllIn(contents).matchData.toList
    val importMatches = idlMatches ::: protocolMatches ::: schemaMatches
    
    val (localImports, nonLocalMatches): (List[File], List[Match]) =
      importMatches.foldLeft((List.empty[File], List.empty[Match])){
        case ((ai,am), m) =>
          val f = new File(path + m.group(1))
          if (f.exists) (ai:+f, am)
          else (ai, am:+m)
      }
      
    val classpathImports: List[File] = nonLocalMatches.map(m =>{
      
      Option(classLoader.getResource(m.group(1))).map(resource =>{
        new File(resource.getFile)
      })
    }).flatMap(_.toList).filter(file => file.exists)

    val importedFiles = classpathImports ++ localImports
    importedFiles
  }
  
} 
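
Usage sketch (not from the avrohugger sources): stripping comments from a hypothetical IDL snippet before the import regexes are applied.

object StripCommentsDemo extends App {
  val idl =
    """protocol Demo { /* block comment */
      |  record Foo { int id; }
      |}""".stripMargin

  // comments are removed so commented-out import statements are never matched
  println(avrohugger.input.parsers.IdlImportParser.stripComments(idl))
}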
Example 177
Source File: StringInputParser.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input
package parsers

import reflectivecompilation.{ PackageSplitter, Toolbox }
import stores.{ SchemaStore, TypecheckDependencyStore }

import org.apache.avro.Protocol
import org.apache.avro.Schema
import org.apache.avro.Schema.Parser
import org.apache.avro.SchemaParseException
import org.apache.avro.compiler.idl.Idl
import org.apache.avro.compiler.idl.ParseException

import scala.collection.JavaConverters._
import java.nio.charset.Charset
import java.io.FileNotFoundException

// tries schema first, then protocol, then idl, then for case class defs
class StringInputParser {

  lazy val schemaParser = new Parser()
  lazy val typecheckDependencyStore = new TypecheckDependencyStore

  def getSchemaOrProtocols(
    inputString: String,
    schemaStore: SchemaStore): List[Either[Schema, Protocol]] = {

    def trySchema(str: String): List[Either[Schema, Protocol]] = {
      try {
        List(Left(schemaParser.parse(str)))
      } 
      catch {
        case notSchema: SchemaParseException => tryProtocol(str)
        case unknown: Throwable => sys.error("Unexpected exception: " + unknown)
      }
    }

    def tryProtocol(str: String): List[Either[Schema, Protocol]] = {
      try {
        List(Right(Protocol.parse(str)))
      }
      catch {
        case notProtocol: SchemaParseException => tryIDL(str)
        case unknown: Throwable => sys.error("Unexpected exception: " + unknown)
      }
    }

    def tryIDL(str: String): List[Either[Schema, Protocol]] = {
      try {
        val bytes = str.getBytes(Charset.forName("UTF-8"))
        val inStream = new java.io.ByteArrayInputStream(bytes)
        val idlParser = new Idl(inStream)
        val protocol = idlParser.CompilationUnit()
        List(Right(protocol))
      }
      catch {
        case e: ParseException => {
          if (e.getMessage.contains("FileNotFoundException")) {
            sys.error("Imports not supported in String IDLs, only avdl files.")
          }
          else tryCaseClass(str, schemaStore)
        }
        case unknown: Throwable => sys.error("Unexpected exception: " + unknown)
        }
      }

    def tryCaseClass(
      str: String,
      schemaStore: SchemaStore): List[Either[Schema, Protocol]] = {
      val compilationUnits = PackageSplitter.getCompilationUnits(str)
      val scalaDocs = ScalaDocParser.getScalaDocs(compilationUnits)
      val trees = compilationUnits.map(src => Toolbox.toolBox.parse(src))
      val treesZippedWithDocs = trees.zip(scalaDocs)
      val schemas = treesZippedWithDocs.flatMap(treeAndDocs => {
        val tree = treeAndDocs._1
        val docs = treeAndDocs._2
        TreeInputParser.parse(tree, docs, schemaStore, typecheckDependencyStore)
      })
      schemas.map(schema => Left(schema))
    }
    
    // tries schema first, then protocol, then idl, then for case class defs
    val schemaOrProtocols = trySchema(inputString)
    schemaOrProtocols
  }
} 
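
Usage sketch (not from the avrohugger sources): parsing a schema string; the no-arg SchemaStore constructor is an assumption here.

import avrohugger.input.parsers.StringInputParser
import avrohugger.stores.SchemaStore

object StringParseDemo extends App {
  val parser = new StringInputParser
  // a plain record schema, so trySchema succeeds and no fallback is needed
  val result = parser.getSchemaOrProtocols(
    """{"type":"record","name":"User","namespace":"example",
      |"fields":[{"name":"name","type":"string"}]}""".stripMargin,
    new SchemaStore) // assumption: SchemaStore takes no constructor arguments
  println(result) // List(Left(<the parsed schema>))
}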
Example 178
Source File: RecordSchemaGenerator.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input
package reflectivecompilation
package schemagen

import parsers.ScalaDocParser
import stores.{ SchemaStore, TypecheckDependencyStore }

import org.apache.avro.Schema.Field
import org.apache.avro.Schema

import java.util.{ Arrays => JArrays }

import scala.reflect.runtime.universe._
import scala.reflect.runtime.currentMirror
import scala.collection.JavaConverters._

object RecordSchemaGenerator  {

  def generateSchema(
    className: String, 
    namespace: Option[Name], 
    fields: List[ValDef],
    maybeScalaDoc: Option[String],
    schemaStore: SchemaStore, 
    typecheckDependencyStore: TypecheckDependencyStore): Schema = {

    // Can't seem to typecheck packaged classes, so splice-in unpackaged versions
    // and later the FieldSchemaGenerator's type matcher must be passed the field's 
    // namespace explicitly.
    def typeCheck(t: Tree) = {
      val dependencies = typecheckDependencyStore.knownClasses.values.toList
      Toolbox.toolBox.typeCheck(q"..$dependencies; {type T = $t}") match {
        case x @ Block(classDefs, Block(List(TypeDef(mods, name, tparams, rhs)), const)) => rhs.tpe
        case _ => t.tpe // if there are no fields, then no dependencies either
      }
    }

    def toAvroFieldSchema(valDef: ValDef) = {
      val (referredNamespace, fieldType) = valDef.tpt match {
        case tq"$ns.$typeName" => (Some(newTermName(ns.toString)), tq"$typeName")
        case t => (namespace, t)
      }

      val maybeFieldDoc = ScalaDocParser.fieldDocsMap(maybeScalaDoc).get(valDef.name.toString)

      new FieldSchemaGenerator().toAvroField(
        referredNamespace,
        valDef.name, 
        typeCheck(fieldType),
        valDef.rhs,
        maybeFieldDoc,
        schemaStore
      )
    }

    // conversion from Option to String/null is for compatibility with Apache Avro
    val ns = namespace match {
      case Some(n) => n.toString
      case None => null
    }

    val avroFields = fields.map(valDef => {
      toAvroFieldSchema(valDef)
    })

    // conversion from Option to String/null is for compatibility with Apache Avro
    val recordDoc = ScalaDocParser.getTopLevelDoc(maybeScalaDoc)

    val avroSchema = Schema.createRecord(className, recordDoc, ns, false)    
    avroSchema.setFields(JArrays.asList(avroFields.toArray:_*))
    schemaStore.accept(avroSchema)
    avroSchema
  }

} 
Example 179
Source File: EnumSchemaGenerator.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package input
package reflectivecompilation
package schemagen

import parsers.ScalaDocParser
import stores.SchemaStore

import org.apache.avro.Schema

import java.util.{Arrays => JArrays}

import scala.reflect.runtime.universe.Name
import scala.collection.JavaConverters._

object EnumSchemaGenerator  {

  def generateSchema(
    className: String, 
    namespace: Option[Name], 
    values: List[Name],
    maybeScalaDoc: Option[String],
    schemaStore: SchemaStore): Schema = {

    // conversion from Option to String/null is for compatibility with Apache Avro
    val ns = namespace match {
      case Some(n) => n.toString
      case None => null
    }

    val vals = JArrays.asList(values.map(value => value.toString).toArray:_*)

    val doc = ScalaDocParser.getTopLevelDoc(maybeScalaDoc)

    val avroSchema = Schema.createEnum(className, doc, ns, vals)    

    schemaStore.accept(avroSchema)
    avroSchema
  }

} 
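
For reference, the generator above ends in a plain Avro factory call; a minimal standalone sketch of that call:

import java.util.{Arrays => JArrays}
import org.apache.avro.Schema

object CreateEnumDemo extends App {
  val suit: Schema = Schema.createEnum(
    "Suit", "a card suit", "example",
    JArrays.asList("SPADES", "DIAMONDS", "CLUBS", "HEARTS"))
  println(suit.toString(true)) // pretty-printed enum schema JSON
}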
Example 180
Source File: FileGenerator.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package generators

import avrohugger.format.abstractions.SourceFormat
import avrohugger.input.DependencyInspector
import avrohugger.input.NestedSchemaExtractor
import avrohugger.input.reflectivecompilation.schemagen._
import avrohugger.input.parsers.{ FileInputParser, StringInputParser}
import avrohugger.matchers.TypeMatcher
import avrohugger.stores.{ ClassStore, SchemaStore }

import java.io.{File, FileNotFoundException, IOException}

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Type.ENUM

// Unable to overload this class' methods because outDir uses a default value
private[avrohugger] object FileGenerator {

  def schemaToFile(
    schema: Schema,
    outDir: String,
    format: SourceFormat,
    classStore: ClassStore,
    schemaStore: SchemaStore,
    typeMatcher: TypeMatcher,
    restrictedFields: Boolean): Unit = {
    val topNS: Option[String] = DependencyInspector.getReferredNamespace(schema)
    val topLevelSchemas: List[Schema] =
      NestedSchemaExtractor.getNestedSchemas(schema, schemaStore, typeMatcher)
    // most-nested classes processed first
    topLevelSchemas.reverse.distinct.foreach(schema => {
      // pass in the top-level schema's namespace if the nested schema has none
      val ns = DependencyInspector.getReferredNamespace(schema) orElse topNS
      format.compile(classStore, ns, Left(schema), outDir, schemaStore, typeMatcher, restrictedFields)
    })
  }

  def protocolToFile(
    protocol: Protocol,
    outDir: String,
    format: SourceFormat,
    classStore: ClassStore,
    schemaStore: SchemaStore,
    typeMatcher: TypeMatcher,
    restrictedFields: Boolean): Unit = {
    val ns = Option(protocol.getNamespace)
    format.compile(classStore, ns, Right(protocol), outDir, schemaStore, typeMatcher, restrictedFields)
  }

  def stringToFile(
    str: String,
    outDir: String,
    format: SourceFormat,
    classStore: ClassStore,
    schemaStore: SchemaStore,
    stringParser: StringInputParser,
    typeMatcher: TypeMatcher,
    restrictedFields: Boolean): Unit = {
    val schemaOrProtocols = stringParser.getSchemaOrProtocols(str, schemaStore)
    schemaOrProtocols.foreach(schemaOrProtocol => {
      schemaOrProtocol match {
        case Left(schema) => {
          schemaToFile(schema, outDir, format, classStore, schemaStore, typeMatcher, restrictedFields)
        }
        case Right(protocol) => {
          protocolToFile(protocol, outDir, format, classStore, schemaStore, typeMatcher, restrictedFields)
        }
      }
    })
  }

  def fileToFile(
    inFile: File,
    outDir: String,
    format: SourceFormat,
    classStore: ClassStore,
    schemaStore: SchemaStore,
    fileParser: FileInputParser,
    typeMatcher: TypeMatcher,
    classLoader: ClassLoader,
    restrictedFields: Boolean): Unit = {
    val schemaOrProtocols: List[Either[Schema, Protocol]] =
      fileParser.getSchemaOrProtocols(inFile, format, classStore, classLoader)
    schemaOrProtocols.foreach(schemaOrProtocol => schemaOrProtocol match {
      case Left(schema) => {
        schemaToFile(schema, outDir, format, classStore, schemaStore, typeMatcher, restrictedFields)
      }
      case Right(protocol) => {
        protocolToFile(protocol, outDir, format, classStore, schemaStore, typeMatcher, restrictedFields)
      }
    })
  }

} 
Example 181
Source File: ScalaDocGenerator.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package avrohugger
package generators

import treehugger.forest._
import definitions._
import treehuggerDSL._

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Field
import org.apache.avro.Schema.Type.{ ENUM, RECORD }

import scala.language.postfixOps
import scala.collection.JavaConverters._

object ScalaDocGenerator {

  def docToScalaDoc(
    schemaOrProtocol: Either[Schema, Protocol],
    tree: Tree): Tree = {

    def aFieldHasDoc(schema: Schema): Boolean = {
      schema.getFields.asScala.exists(field => {
        val maybeFieldDoc = Option(field.doc)
        isDoc(maybeFieldDoc)
      })
    }

    def topLevelHasDoc(schema: Schema): Boolean = {
      val maybeSchemaDoc = Option(schema.getDoc)
      isDoc(maybeSchemaDoc)
    }

    def isDoc(maybeDoc: Option[String]): Boolean = {
      maybeDoc match {
        case Some(doc) => true
        case None => false
      }
    }

    // Need arbitrary number of fields, so can't use DocTags, must return String
    def getFieldFauxDocTags(schema: Schema): List[String] = {
      val docStrings = schema.getFields.asScala.toList.map(field => {
        val fieldName = field.name
        val fieldDoc = Option(field.doc).getOrElse("")
        s"@param $fieldName $fieldDoc"
      })
      docStrings
    }

    def wrapClassWithDoc(schema: Schema, tree: Tree, docs: List[String]) = {
      if (topLevelHasDoc(schema) || aFieldHasDoc(schema)) tree.withDoc(docs)
      else tree
    }

    def wrapEnumWithDoc(schema: Schema, tree: Tree, docs: List[String]) = {
      if (topLevelHasDoc(schema)) tree.withDoc(docs)
      else tree
    }
    
    def wrapTraitWithDoc(protocol: Protocol, tree: Tree, docs: List[String]) = {
      if (isDoc(Option(protocol.getDoc))) tree.withDoc(docs)
      else tree
    }

    val docStrings: List[String] = schemaOrProtocol match {
      case Left(schema) => Option(schema.getDoc).toList
      case Right(protocol) => Option(protocol.getDoc).toList
    }

    schemaOrProtocol match {
      case Left(schema) => schema.getType match {
        case RECORD =>
          val paramDocs = getFieldFauxDocTags(schema)
          wrapClassWithDoc(schema, tree, docStrings:::paramDocs)
        case ENUM =>
          wrapEnumWithDoc(schema, tree, docStrings)
        case _ =>
          sys.error("Error generating ScalaDoc from Avro doc. Not ENUM/RECORD")
      }
      case Right(protocol) => wrapTraitWithDoc(protocol, tree, docStrings)
    }

  }

} 
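
Usage sketch (not from the avrohugger sources): attaching ScalaDoc derived from a hypothetical documented schema to a bare treehugger case-class tree.

import org.apache.avro.Schema
import treehugger.forest._
import definitions._
import treehuggerDSL._

object ScalaDocDemo extends App {
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"User","doc":"A user.","fields":[
      |  {"name":"name","type":"string","doc":"Full name."}]}""".stripMargin)

  // a bare case-class tree to attach the generated ScalaDoc to
  val tree: Tree = CASECLASSDEF("User") withParams(PARAM("name", StringClass))
  println(treeToString(
    avrohugger.generators.ScalaDocGenerator.docToScalaDoc(Left(schema), tree)))
}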
Example 182
Source File: EnumProtocol.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.idl.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.idl.{Card => JCard, Suit => JSuit}

sealed trait EnumProtocol extends AvroSerializeable with Product with Serializable

final object Suit extends Enumeration with EnumProtocol {
  type Suit = Value
  val SPADES, DIAMONDS, CLUBS, HEARTS = Value
}

final case class Card(suit: Suit.Value, number: Int) extends AvroSerializeable with EnumProtocol {
  type J = JCard
  override def toAvro: JCard = {
    new JCard(suit match {
      case Suit.SPADES => JSuit.SPADES
      case Suit.DIAMONDS => JSuit.DIAMONDS
      case Suit.CLUBS => JSuit.CLUBS
      case Suit.HEARTS => JSuit.HEARTS
    }, number)
  }
}

final object Card {
  implicit def reader = new AvroReader[Card] {
    override type J = JCard
  }
  implicit val metadata: AvroMetadata[Card, JCard] = new AvroMetadata[Card, JCard] {
    override val avroClass: Class[JCard] = classOf[JCard]
    override val schema: Schema = JCard.getClassSchema()
    override val fromAvro: (JCard) => Card = {
      (j: JCard) => Card(j.getSuit match {
        case JSuit.SPADES => Suit.SPADES
        case JSuit.DIAMONDS => Suit.DIAMONDS
        case JSuit.CLUBS => Suit.CLUBS
        case JSuit.HEARTS => Suit.HEARTS
      }, j.getNumber.toInt)
    }
  }
} 
Example 183
Source File: EnumProtocol.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.proto.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.proto.{Card => JCard, Suit => JSuit}

sealed trait EnumProtocol extends AvroSerializeable with Product with Serializable

final object Suit extends Enumeration with EnumProtocol {
  type Suit = Value
  val SPADES, HEARTS, DIAMONDS, CLUBS = Value
}

final case class Card(suit: Suit.Value, number: Int) extends AvroSerializeable with EnumProtocol {
  type J = JCard
  override def toAvro: JCard = {
    new JCard(suit match {
      case Suit.SPADES => JSuit.SPADES
      case Suit.HEARTS => JSuit.HEARTS
      case Suit.DIAMONDS => JSuit.DIAMONDS
      case Suit.CLUBS => JSuit.CLUBS
    }, number)
  }
}

final object Card {
  implicit def reader = new AvroReader[Card] {
    override type J = JCard
  }
  implicit val metadata: AvroMetadata[Card, JCard] = new AvroMetadata[Card, JCard] {
    override val avroClass: Class[JCard] = classOf[JCard]
    override val schema: Schema = JCard.getClassSchema()
    override val fromAvro: (JCard) => Card = {
      (j: JCard) => Card(j.getSuit match {
        case JSuit.SPADES => Suit.SPADES
        case JSuit.HEARTS => Suit.HEARTS
        case JSuit.DIAMONDS => Suit.DIAMONDS
        case JSuit.CLUBS => Suit.CLUBS
      }, j.getNumber.toInt)
    }
  }
} 
Example 184
Source File: Example5.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.example.{NoSpaces6 => JNoSpaces6, NoSpaces7 => JNoSpaces7}




final case class NoSpaces6(comment_property1: String) extends AvroSerializeable {
  type J = JNoSpaces6
  override def toAvro: JNoSpaces6 = {
    new JNoSpaces6(comment_property1)
  }
}

object NoSpaces6 {
  implicit def reader = new AvroReader[NoSpaces6] {
    override type J = JNoSpaces6
  }
  implicit val metadata: AvroMetadata[NoSpaces6, JNoSpaces6] = new AvroMetadata[NoSpaces6, JNoSpaces6] {
    override val avroClass: Class[JNoSpaces6] = classOf[JNoSpaces6]
    override val schema: Schema = JNoSpaces6.getClassSchema()
    override val fromAvro: (JNoSpaces6) => NoSpaces6 = {
      (j: JNoSpaces6) => NoSpaces6(j.getCommentProperty1.toString)
    }
  }
}

final case class NoSpaces7(comment_property2: String) extends AvroSerializeable {
  type J = JNoSpaces7
  override def toAvro: JNoSpaces7 = {
    new JNoSpaces7(comment_property2)
  }
}

object NoSpaces7 {
  implicit def reader = new AvroReader[NoSpaces7] {
    override type J = JNoSpaces7
  }
  implicit val metadata: AvroMetadata[NoSpaces7, JNoSpaces7] = new AvroMetadata[NoSpaces7, JNoSpaces7] {
    override val avroClass: Class[JNoSpaces7] = classOf[JNoSpaces7]
    override val schema: Schema = JNoSpaces7.getClassSchema()
    override val fromAvro: (JNoSpaces7) => NoSpaces7 = {
      (j: JNoSpaces7) => NoSpaces7(j.getCommentProperty2.toString)
    }
  }
} 
Example 185
Source File: NoSpaces2.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.example.{NoSpaces2 => JNoSpaces2}


final case class NoSpaces2(comment_property: String) extends AvroSerializeable {
  type J = JNoSpaces2
  override def toAvro: JNoSpaces2 = {
    new JNoSpaces2(comment_property)
  }
}

object NoSpaces2 {
  implicit def reader = new AvroReader[NoSpaces2] {
    override type J = JNoSpaces2
  }
  implicit val metadata: AvroMetadata[NoSpaces2, JNoSpaces2] = new AvroMetadata[NoSpaces2, JNoSpaces2] {
    override val avroClass: Class[JNoSpaces2] = classOf[JNoSpaces2]
    override val schema: Schema = JNoSpaces2.getClassSchema()
    override val fromAvro: (JNoSpaces2) => NoSpaces2 = {
      (j: JNoSpaces2) => NoSpaces2(j.getCommentProperty.toString)
    }
  }
} 
Example 186
Source File: NoSpaces3.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.example.{NoSpaces3 => JNoSpaces3}


final case class NoSpaces3(comment_property: String) extends AvroSerializeable {
  type J = JNoSpaces3
  override def toAvro: JNoSpaces3 = {
    new JNoSpaces3(comment_property)
  }
}

object NoSpaces3 {
  implicit def reader = new AvroReader[NoSpaces3] {
    override type J = JNoSpaces3
  }
  implicit val metadata: AvroMetadata[NoSpaces3, JNoSpaces3] = new AvroMetadata[NoSpaces3, JNoSpaces3] {
    override val avroClass: Class[JNoSpaces3] = classOf[JNoSpaces3]
    override val schema: Schema = JNoSpaces3.getClassSchema()
    override val fromAvro: (JNoSpaces3) => NoSpaces3 = {
      (j: JNoSpaces3) => NoSpaces3(j.getCommentProperty.toString)
    }
  }
} 
Example 187
Source File: Example4.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.example.{NoSpaces4 => JNoSpaces4, NoSpaces5 => JNoSpaces5}


sealed trait Example4 extends AvroSerializeable with Product with Serializable

final case class NoSpaces4(comment_property1: String) extends AvroSerializeable with Example4 {
  type J = JNoSpaces4
  override def toAvro: JNoSpaces4 = {
    new JNoSpaces4(comment_property1)
  }
}

final object NoSpaces4 {
  implicit def reader = new AvroReader[NoSpaces4] {
    override type J = JNoSpaces4
  }
  implicit val metadata: AvroMetadata[NoSpaces4, JNoSpaces4] = new AvroMetadata[NoSpaces4, JNoSpaces4] {
    override val avroClass: Class[JNoSpaces4] = classOf[JNoSpaces4]
    override val schema: Schema = JNoSpaces4.getClassSchema()
    override val fromAvro: (JNoSpaces4) => NoSpaces4 = {
      (j: JNoSpaces4) => NoSpaces4(j.getCommentProperty1.toString)
    }
  }
}

final case class NoSpaces5(comment_property2: String) extends AvroSerializeable with Example4 {
  type J = JNoSpaces5
  override def toAvro: JNoSpaces5 = {
    new JNoSpaces5(comment_property2)
  }
}

final object NoSpaces5 {
  implicit def reader = new AvroReader[NoSpaces5] {
    override type J = JNoSpaces5
  }
  implicit val metadata: AvroMetadata[NoSpaces5, JNoSpaces5] = new AvroMetadata[NoSpaces5, JNoSpaces5] {
    override val avroClass: Class[JNoSpaces5] = classOf[JNoSpaces5]
    override val schema: Schema = JNoSpaces5.getClassSchema()
    override val fromAvro: (JNoSpaces5) => NoSpaces5 = {
      (j: JNoSpaces5) => NoSpaces5(j.getCommentProperty2.toString)
    }
  }
} 
Example 188
Source File: NoSpaces1.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package com.example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import com.example.{NoSpaces1 => JNoSpaces1}


final case class NoSpaces1(single_line_comment_property: String, multi_line_property: String) extends AvroSerializeable {
  type J = JNoSpaces1
  override def toAvro: JNoSpaces1 = {
    new JNoSpaces1(single_line_comment_property, multi_line_property)
  }
}

object NoSpaces1 {
  implicit def reader = new AvroReader[NoSpaces1] {
    override type J = JNoSpaces1
  }
  implicit val metadata: AvroMetadata[NoSpaces1, JNoSpaces1] = new AvroMetadata[NoSpaces1, JNoSpaces1] {
    override val avroClass: Class[JNoSpaces1] = classOf[JNoSpaces1]
    override val schema: Schema = JNoSpaces1.getClassSchema()
    override val fromAvro: (JNoSpaces1) => NoSpaces1 = {
      (j: JNoSpaces1) => NoSpaces1(j.getSingleLineCommentProperty.toString, j.getMultiLineProperty.toString)
    }
  }
} 
Example 189
Source File: Compass.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{Compass => JCompass, Direction => JDirection}

final case class Compass(direction: Direction.Value) extends AvroSerializeable {
  type J = JCompass
  override def toAvro: JCompass = {
    new JCompass(direction match {
      case Direction.NORTH => JDirection.NORTH
      case Direction.SOUTH => JDirection.SOUTH
      case Direction.EAST => JDirection.EAST
      case Direction.WEST => JDirection.WEST
    })
  }
}

object Compass {
  implicit def reader = new AvroReader[Compass] {
    override type J = JCompass
  }
  implicit val metadata: AvroMetadata[Compass, JCompass] = new AvroMetadata[Compass, JCompass] {
    override val avroClass: Class[JCompass] = classOf[JCompass]
    override val schema: Schema = JCompass.getClassSchema()
    override val fromAvro: (JCompass) => Compass = {
      (j: JCompass) => Compass(j.getDirection match {
        case JDirection.NORTH => Direction.NORTH
        case JDirection.SOUTH => Direction.SOUTH
        case JDirection.EAST => Direction.EAST
        case JDirection.WEST => Direction.WEST
      })
    }
  }
} 
Example 190
Source File: Level0.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{Level0 => JLevel0, Level1 => JLevel1, Level2 => JLevel2}

final case class Level0(level1: Level1) extends AvroSerializeable {
  type J = JLevel0
  override def toAvro: JLevel0 = {
    new JLevel0(level1.toAvro)
  }
}

object Level0 {
  implicit def reader = new AvroReader[Level0] {
    override type J = JLevel0
  }
  implicit val metadata: AvroMetadata[Level0, JLevel0] = new AvroMetadata[Level0, JLevel0] {
    override val avroClass: Class[JLevel0] = classOf[JLevel0]
    override val schema: Schema = JLevel0.getClassSchema()
    override val fromAvro: (JLevel0) => Level0 = {
      (j: JLevel0) => Level0(Level1.metadata.fromAvro(j.getLevel1))
    }
  }
} 
Example 191
Source File: BinarySc.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{BinarySc => JBinarySc}

final case class BinarySc(data: Array[Byte]) extends AvroSerializeable {
  type J = JBinarySc
  override def toAvro: JBinarySc = {
    new JBinarySc(java.nio.ByteBuffer.wrap(data))
  }
}

object BinarySc {
  implicit def reader = new AvroReader[BinarySc] {
    override type J = JBinarySc
  }
  implicit val metadata: AvroMetadata[BinarySc, JBinarySc] = new AvroMetadata[BinarySc, JBinarySc] {
    override val avroClass: Class[JBinarySc] = classOf[JBinarySc]
    override val schema: Schema = JBinarySc.getClassSchema()
    override val fromAvro: (JBinarySc) => BinarySc = {
      (j: JBinarySc) => BinarySc(j.getData match {
        case (buffer: java.nio.ByteBuffer) => {
          buffer.array()
        }
      })
    }
  }
} 
Example 192
Source File: Level1.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{Level1 => JLevel1, Level2 => JLevel2}

final case class Level1(level2: Level2) extends AvroSerializeable {
  type J = JLevel1
  override def toAvro: JLevel1 = {
    new JLevel1(level2.toAvro)
  }
}

object Level1 {
  implicit def reader = new AvroReader[Level1] {
    override type J = JLevel1
  }
  implicit val metadata: AvroMetadata[Level1, JLevel1] = new AvroMetadata[Level1, JLevel1] {
    override val avroClass: Class[JLevel1] = classOf[JLevel1]
    override val schema: Schema = JLevel1.getClassSchema()
    override val fromAvro: (JLevel1) => Level1 = {
      (j: JLevel1) => Level1(Level2.metadata.fromAvro(j.getLevel2))
    }
  }
} 
Example 193
Source File: User.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{User => JUser}

final case class User(name: String, favorite_number: Option[Int], favorite_color: Option[String]) extends AvroSerializeable {
  type J = JUser
  override def toAvro: JUser = {
    new JUser(name, favorite_number match {
      case Some(x) => x
      case None => null
    }, favorite_color match {
      case Some(x) => x
      case None => null
    })
  }
}

object User {
  implicit def reader = new AvroReader[User] {
    override type J = JUser
  }
  implicit val metadata: AvroMetadata[User, JUser] = new AvroMetadata[User, JUser] {
    override val avroClass: Class[JUser] = classOf[JUser]
    override val schema: Schema = JUser.getClassSchema()
    override val fromAvro: (JUser) => User = {
      (j: JUser) => User(j.getName.toString, j.getFavoriteNumber match {
        case null => None
        case _ => Some(j.getFavoriteNumber.toInt)
      }, j.getFavoriteColor match {
        case null => None
        case _ => Some(j.getFavoriteColor.toString)
      })
    }
  }
} 
Example 194
Source File: ClashRecord.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.avro.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.avro.{ClashInner => JClashInner, ClashOuter => JClashOuter, ClashRecord => JClashRecord}

import scala.collection.JavaConverters._

final case class ClashRecord(some: Int, outer: ClashOuter, id: Int) extends AvroSerializeable {
  type J = JClashRecord
  override def toAvro: JClashRecord = {
    new JClashRecord(some, outer.toAvro, id)
  }
}

object ClashRecord {
  implicit def reader = new AvroReader[ClashRecord] {
    override type J = JClashRecord
  }
  implicit val metadata: AvroMetadata[ClashRecord, JClashRecord] = new AvroMetadata[ClashRecord, JClashRecord] {
    override val avroClass: Class[JClashRecord] = classOf[JClashRecord]
    override val schema: Schema = JClashRecord.getClassSchema()
    override val fromAvro: (JClashRecord) => ClashRecord = {
      (j: JClashRecord) => ClashRecord(j.getSome.toInt, ClashOuter.metadata.fromAvro(j.getOuter), j.getId.toInt)
    }
  }
} 
Example 195
Source File: ClashInner.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.avro.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.avro.{ClashInner => JClashInner}

final case class ClashInner(some: Option[Int], other: Option[Int], id: Option[Int]) extends AvroSerializeable {
  type J = JClashInner
  override def toAvro: JClashInner = {
    new JClashInner(some match {
      case Some(x) => x
      case None => null
    }, other match {
      case Some(x) => x
      case None => null
    }, id match {
      case Some(x) => x
      case None => null
    })
  }
}

object ClashInner {
  implicit def reader = new AvroReader[ClashInner] {
    override type J = JClashInner
  }
  implicit val metadata: AvroMetadata[ClashInner, JClashInner] = new AvroMetadata[ClashInner, JClashInner] {
    override val avroClass: Class[JClashInner] = classOf[JClashInner]
    override val schema: Schema = JClashInner.getClassSchema()
    override val fromAvro: (JClashInner) => ClashInner = {
      (j: JClashInner) => ClashInner(j.getSome match {
        case null => None
        case _ => Some(j.getSome.toInt)
      }, j.getOther match {
        case null => None
        case _ => Some(j.getOther.toInt)
      }, j.getId match {
        case null => None
        case _ => Some(j.getId.toInt)
      })
    }
  }
} 
Example 196
Source File: ClashOuter.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.avro.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.avro.{ClashInner => JClashInner, ClashOuter => JClashOuter}

import scala.collection.JavaConverters._

final case class ClashOuter(inner: Option[Array[Option[ClashInner]]]) extends AvroSerializeable {
  type J = JClashOuter
  override def toAvro: JClashOuter = {
    new JClashOuter(inner match {
      case Some(x) => {
        val array: java.util.List[JClashInner] = new java.util.ArrayList[JClashInner]
        x foreach { element =>
          array.add(element match {
            case Some(x) => x.toAvro
            case None => null
          })
        }
        array
      }
      case None => null
    })
  }
}

object ClashOuter {
  implicit def reader = new AvroReader[ClashOuter] {
    override type J = JClashOuter
  }
  implicit val metadata: AvroMetadata[ClashOuter, JClashOuter] = new AvroMetadata[ClashOuter, JClashOuter] {
    override val avroClass: Class[JClashOuter] = classOf[JClashOuter]
    override val schema: Schema = JClashOuter.getClassSchema()
    override val fromAvro: (JClashOuter) => ClashOuter = {
      (j: JClashOuter) => ClashOuter(j.getInner match {
        case null => None
        case _ => Some(Array((j.getInner.asScala: _*)) map { x =>
          x match {
            case null => None
            case _ => Some(ClashInner.metadata.fromAvro(x))
          }
        })
      })
    }
  }
} 
Example 197
Source File: Level2.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.{Level2 => JLevel2}

final case class Level2(name: String) extends AvroSerializeable {
  type J = JLevel2
  override def toAvro: JLevel2 = {
    new JLevel2(name)
  }
}

object Level2 {
  implicit def reader = new AvroReader[Level2] {
    override type J = JLevel2
  }
  implicit val metadata: AvroMetadata[Level2, JLevel2] = new AvroMetadata[Level2, JLevel2] {
    override val avroClass: Class[JLevel2] = classOf[JLevel2]
    override val schema: Schema = JLevel2.getClassSchema()
    override val fromAvro: (JLevel2) => Level2 = {
      (j: JLevel2) => Level2(j.getName.toString)
    }
  }
} 
Example 198
Source File: ArrayAsScalaSeq.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.idl.array.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.idl.array.{ArrayIdl => JArrayIdl}

import scala.collection.JavaConverters._

final case class ArrayIdl(data: Seq[Int]) extends AvroSerializeable {
  type J = JArrayIdl
  override def toAvro: JArrayIdl = {
    new JArrayIdl({
      val array: java.util.List[java.lang.Integer] = new java.util.ArrayList[java.lang.Integer]
      data foreach { element =>
        array.add(element)
      }
      array
    })
  }
}

object ArrayIdl {
  implicit def reader = new AvroReader[ArrayIdl] {
    override type J = JArrayIdl
  }
  implicit val metadata: AvroMetadata[ArrayIdl, JArrayIdl] = new AvroMetadata[ArrayIdl, JArrayIdl] {
    override val avroClass: Class[JArrayIdl] = classOf[JArrayIdl]
    override val schema: Schema = JArrayIdl.getClassSchema()
    override val fromAvro: (JArrayIdl) => ArrayIdl = {
      (j: JArrayIdl) => ArrayIdl(Seq((j.getData.asScala: _*)) map { x =>
        x.toInt
      })
    }
  }
} 
Example 199
Source File: ArrayAsScalaList.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.idl.array.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.idl.array.{ArrayIdl => JArrayIdl}

import scala.collection.JavaConverters._

final case class ArrayIdl(data: List[Int]) extends AvroSerializeable {
  type J = JArrayIdl
  override def toAvro: JArrayIdl = {
    new JArrayIdl({
      val array: java.util.List[java.lang.Integer] = new java.util.ArrayList[java.lang.Integer]
      data foreach { element =>
        array.add(element)
      }
      array
    })
  }
}

object ArrayIdl {
  implicit def reader = new AvroReader[ArrayIdl] {
    override type J = JArrayIdl
  }
  implicit val metadata: AvroMetadata[ArrayIdl, JArrayIdl] = new AvroMetadata[ArrayIdl, JArrayIdl] {
    override val avroClass: Class[JArrayIdl] = classOf[JArrayIdl]
    override val schema: Schema = JArrayIdl.getClassSchema()
    override val fromAvro: (JArrayIdl) => ArrayIdl = {
      (j: JArrayIdl) => ArrayIdl(List((j.getData.asScala: _*)) map { x =>
        x.toInt
      })
    }
  }
} 
Example 200
Source File: ArrayAsScalaVector.scala    From avrohugger   with Apache License 2.0 5 votes vote down vote up
package example.idl.array.model

import org.apache.avro.Schema

import org.oedura.scavro.{AvroMetadata, AvroReader, AvroSerializeable}

import example.idl.array.{ArrayIdl => JArrayIdl}

import scala.collection.JavaConverters._

final case class ArrayIdl(data: Vector[Int]) extends AvroSerializeable {
  type J = JArrayIdl
  override def toAvro: JArrayIdl = {
    new JArrayIdl({
      val array: java.util.List[java.lang.Integer] = new java.util.ArrayList[java.lang.Integer]
      data foreach { element =>
        array.add(element)
      }
      array
    })
  }
}

object ArrayIdl {
  implicit def reader = new AvroReader[ArrayIdl] {
    override type J = JArrayIdl
  }
  implicit val metadata: AvroMetadata[ArrayIdl, JArrayIdl] = new AvroMetadata[ArrayIdl, JArrayIdl] {
    override val avroClass: Class[JArrayIdl] = classOf[JArrayIdl]
    override val schema: Schema = JArrayIdl.getClassSchema()
    override val fromAvro: (JArrayIdl) => ArrayIdl = {
      (j: JArrayIdl) => ArrayIdl(Vector((j.getData.asScala: _*)) map { x =>
        x.toInt
      })
    }
  }
}