org.apache.avro.generic.IndexedRecord Scala Examples

The following examples show how to use org.apache.avro.generic.IndexedRecord. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: ToTableRow.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery

import com.spotify.scio.extra.bigquery.AvroConverters.AvroConversionException

import java.math.{BigDecimal => JBigDecimal}
import java.nio.ByteBuffer
import java.util

import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, IndexedRecord}
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.BaseEncoding
import org.joda.time.format.DateTimeFormat
import org.joda.time.{DateTime, LocalDate, LocalTime}

import scala.jdk.CollectionConverters._


/**
 * Converts individual Avro field values into values that can be stored in a [[TableRow]].
 *
 * The entry point is [[toTableRowField]]; the remaining members are private helpers for
 * collections, maps and byte payloads.
 */
private[bigquery] trait ToTableRow {
  // Name of the Avro schema property that selects how byte payloads are rendered as text.
  private lazy val encodingPropName: String = "bigquery.bytes.encoder"
  private lazy val base64Encoding: BaseEncoding = BaseEncoding.base64()
  private lazy val hexEncoding: BaseEncoding = BaseEncoding.base16()

  // YYYY-[M]M-[D]D
  private[this] val localDateFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd").withZoneUTC()

  // [H]H:[M]M:[S]S[.DDDDDD]
  private[this] val localTimeFormatter =
    DateTimeFormat.forPattern("HH:mm:ss.SSSSSS")

  // YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.DDDDDD]][time zone]
  private[this] val timestampFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS")

  /**
   * Converts a single Avro field value to its [[TableRow]] representation.
   *
   * Text-like values, enums and big decimals become strings; numbers and booleans are passed
   * through unchanged; byte payloads are text-encoded (see `encodeByteArray`); maps and
   * iterables become Java lists; nested records are converted recursively; Joda date/time
   * values are printed with the formatters defined above.
   *
   * @param fieldValue the value read from the Avro record
   * @param field the Avro field the value belongs to
   * @return the converted value
   * @throws AvroConversionException if the value is `null` or of an unsupported type
   */
  private[bigquery] def toTableRowField(fieldValue: Any, field: Schema.Field): Any =
    fieldValue match {
      case x: CharSequence          => x.toString
      case x: Enum[_]               => x.name()
      // Must precede the generic Number case: JBigDecimal is itself a Number.
      case x: JBigDecimal           => x.toString
      case x: Number                => x
      case x: Boolean               => x
      case x: GenericFixed          => encodeByteArray(x.bytes(), field.schema())
      case x: ByteBuffer            => encodeByteArray(toByteArray(x), field.schema())
      case x: util.Map[_, _]        => toTableRowFromMap(x.asScala, field)
      case x: java.lang.Iterable[_] => toTableRowFromIterable(x.asScala, field)
      case x: IndexedRecord         => AvroConverters.toTableRow(x)
      case x: LocalDate             => localDateFormatter.print(x)
      case x: LocalTime             => localTimeFormatter.print(x)
      case x: DateTime              => timestampFormatter.print(x)
      // Type patterns never match null, so handle it explicitly instead of letting
      // `getClass` below fail with a bare NullPointerException.
      case null =>
        throw AvroConversionException(
          s"ToTableRow conversion failed: null value in field ${field.name}"
        )
      case _ =>
        throw AvroConversionException(
          s"ToTableRow conversion failed: could not match ${fieldValue.getClass}"
        )
    }

  /**
   * Tells whether an element is itself a collection or a map. Elements normally originate from
   * Java collections, so the Java interfaces are checked in addition to the Scala ones.
   */
  private def isNestedCollection(item: Any): Boolean =
    item match {
      case _: java.lang.Iterable[_] | _: util.Map[_, _] => true
      case _: Iterable[_] | _: Map[_, _]                => true
      case _                                            => false
    }

  /**
   * Converts every element of an iterable with [[toTableRowField]].
   *
   * @throws AvroConversionException if an element is itself an iterable or a map
   */
  private def toTableRowFromIterable(iterable: Iterable[Any], field: Schema.Field): util.List[_] =
    iterable
      .map { item =>
        if (isNestedCollection(item)) {
          throw AvroConversionException(
            s"ToTableRow conversion failed for item $item: " +
              s"iterable and map types not supported"
          )
        }
        toTableRowField(item, field)
      }
      .toList
      .asJava

  /**
   * Converts a map into a list of rows, one per entry, each holding the converted key under
   * "key" and the converted value under "value".
   */
  private def toTableRowFromMap(map: Iterable[Any], field: Schema.Field): util.List[_] =
    map
      .map {
        case (k, v) =>
          new TableRow()
            .set("key", toTableRowField(k, field))
            .set("value", toTableRowField(v, field))
      }
      .toList
      .asJava

  /**
   * Renders bytes as text. The encoding is chosen by the schema property named by
   * `encodingPropName`: "BASE64" or "HEX"; base64 is used when the property is absent.
   *
   * @throws AvroConversionException if the property holds any other value
   */
  private def encodeByteArray(bytes: Array[Byte], fieldSchema: Schema): String =
    Option(fieldSchema.getProp(encodingPropName)) match {
      case Some("BASE64") => base64Encoding.encode(bytes)
      case Some("HEX")    => hexEncoding.encode(bytes)
      case Some(encoding) =>
        throw AvroConversionException(s"Unsupported encoding $encoding")
      case None => base64Encoding.encode(bytes)
    }

  /**
   * Copies the buffer content from index 0 up to its limit into a new array. A read-only
   * view is used so that the position of the caller's buffer is left untouched.
   */
  private def toByteArray(buffer: ByteBuffer) = {
    val copy = buffer.asReadOnlyBuffer
    val bytes = new Array[Byte](copy.limit)
    copy.rewind
    copy.get(bytes)
    bytes
  }
}
Example 2
Source File: SCollectionSyntax.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery.syntax

import com.google.api.services.bigquery.model.TableReference
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.BigQueryTable.WriteParam
import com.spotify.scio.bigquery.{BigQueryTable, Table, TableRow}
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.util.ScioUtil
import com.spotify.scio.values.SCollection
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{CreateDisposition, WriteDisposition}

import scala.reflect.ClassTag

/** Implicit syntax that adds the Avro-to-BigQuery operations to an `SCollection`. */
trait SCollectionSyntax {

  /** Wraps an `SCollection` of Avro records in [[AvroToBigQuerySCollectionOps]]. */
  implicit def toAvroToBigQuerySCollection[T <: IndexedRecord: ClassTag](
    data: SCollection[T]
  ): AvroToBigQuerySCollectionOps[T] = {
    val ops = new AvroToBigQuerySCollectionOps[T](data)
    ops
  }
}

/** Operations for writing an `SCollection` of Avro records to a BigQuery table. */
final class AvroToBigQuerySCollectionOps[T <: IndexedRecord: ClassTag](
  private val self: SCollection[T]
) extends Serializable {
  import com.spotify.scio.extra.bigquery.AvroConverters._

  /**
   * Converts each Avro record to a [[TableRow]] and writes the result to `table`.
   *
   * The table schema is derived from `avroSchema`. When `avroSchema` is `null`, the schema is
   * obtained reflectively by calling the static `getClassSchema` method of `T`.
   *
   * @param table the destination table
   * @param avroSchema the Avro schema of the records, or `null` to derive it from `T`
   * @param writeDisposition forwarded to the write parameters
   * @param createDisposition forwarded to the write parameters
   * @param tableDescription forwarded to the write parameters
   * @throws AvroConversionException if no schema is given and none can be derived from `T`
   */
  @experimental
  def saveAvroAsBigQuery(
    table: TableReference,
    avroSchema: Schema = null,
    writeDisposition: WriteDisposition = null,
    createDisposition: CreateDisposition = null,
    tableDescription: String = null
  ): ClosedTap[TableRow] = {
    import scala.util.control.NonFatal

    val schema: Schema = Option(avroSchema)
      .getOrElse {
        val cls = ScioUtil.classOf[T]
        if (!classOf[IndexedRecord].isAssignableFrom(cls)) {
          throw AvroConversionException("Could not invoke $SCHEMA on provided Avro type")
        }
        // Types without a static `getClassSchema` (or whose call fails) would otherwise
        // surface as a raw reflective exception; report them as a conversion failure.
        try {
          cls.getMethod("getClassSchema").invoke(null).asInstanceOf[Schema]
        } catch {
          case NonFatal(e) =>
            throw AvroConversionException(
              s"Could not invoke getClassSchema on provided Avro type ${cls.getName}; " +
                "provide avroSchema explicitly",
              e
            )
        }
      }

    val params =
      WriteParam(toTableSchema(schema), writeDisposition, createDisposition, tableDescription)
    self
      .map(toTableRow(_))
      .write(BigQueryTable(Table.Ref(table)))(params)
  }
}
Example 3
Source File: AvroConverters.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery

import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord

import scala.jdk.CollectionConverters._

/** Conversions from Avro records and schemas to their BigQuery model counterparts. */
object AvroConverters extends ToTableRow with ToTableSchema {

  /**
   * Builds a [[TableRow]] from an Avro record. Every field of the record's schema is
   * converted with `toTableRowField`; fields whose value is `null` are left out of the row.
   */
  @experimental
  def toTableRow[T <: IndexedRecord](record: T): TableRow = {
    val result = new TableRow

    for {
      avroField <- record.getSchema.getFields.asScala
      value <- Option(record.get(avroField.pos))
    } result.set(avroField.name, toTableRowField(value, avroField))

    result
  }

  /** Builds a [[TableSchema]] whose fields are those returned by `getFieldSchemas`. */
  @experimental
  def toTableSchema(avroSchema: Schema): TableSchema =
    new TableSchema().setFields(getFieldSchemas(avroSchema).asJava)

  /** Raised when an Avro value or schema cannot be converted. */
  final case class AvroConversionException(
    private val message: String,
    private val cause: Throwable = null
  ) extends Exception(message, cause)
}
Example 4
Source File: AvroInstances.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.schemas.instances

import com.spotify.scio.schemas.{RawRecord, Schema}
import org.apache.avro.specific.SpecificRecord
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.beam.sdk.schemas.utils.AvroUtils
import org.apache.beam.sdk.schemas.{AvroRecordSchema, Schema => BSchema}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{Row, TypeDescriptor}

import scala.jdk.CollectionConverters._
import scala.reflect.{classTag, ClassTag}

/** `Schema` instances for Avro specific and generic records. */
trait AvroInstances {

  /**
   * Derives a `Schema` for an Avro specific record class. The Beam schema and the row-to-record
   * function come from `AvroRecordSchema`; the record-to-row function goes through
   * `AvroInstances.recordtoRow` instead.
   */
  implicit def avroSchema[T <: SpecificRecord: ClassTag]: Schema[T] = {
    // TODO: broken because of a bug upstream https://issues.apache.org/jira/browse/BEAM-6742
    // RawRecord[T](new AvroRecordSchema())
    import org.apache.avro.reflect.ReflectData
    val recordClass = classTag[T].runtimeClass.asInstanceOf[Class[T]]
    val schemaProvider = new AvroRecordSchema()
    val descriptor = TypeDescriptor.of(recordClass)
    val beamSchema = schemaProvider.schemaFor(descriptor)
    val reflectedSchema = ReflectData.get().getSchema(descriptor.getRawType)
    val serializableSchema = new AvroInstances.SerializableSchema(reflectedSchema)

    val toRow: SerializableFunction[T, Row] =
      new SerializableFunction[T, Row] {
        def apply(record: T): Row =
          AvroInstances.recordtoRow(beamSchema, serializableSchema, record)
      }

    RawRecord[T](beamSchema, schemaProvider.fromRowFunction(descriptor), toRow)
  }

  /**
   * Builds a `Schema` for generic records that conform to the given Avro schema, converting
   * to rows with `AvroInstances.recordtoRow` and back with `AvroUtils.toGenericRecord`.
   */
  def fromAvroSchema(schema: org.apache.avro.Schema): Schema[GenericRecord] = {
    val rowSchema = AvroUtils.toBeamSchema(schema)
    val serializableSchema = new AvroInstances.SerializableSchema(schema)

    val recordToRow = new SerializableFunction[GenericRecord, Row] {
      def apply(record: GenericRecord): Row =
        AvroInstances.recordtoRow[GenericRecord](rowSchema, serializableSchema, record)
    }

    val rowToRecord = new SerializableFunction[Row, GenericRecord] {
      def apply(row: Row): GenericRecord =
        AvroUtils.toGenericRecord(row, serializableSchema.get)
    }

    RawRecord[GenericRecord](rowSchema, rowToRecord, recordToRow)
  }
}

object AvroInstances {

  /**
   * Serializable holder for an Avro schema. The schema is carried as its JSON string; the
   * schema object passed to the constructor is transient and is not serialized.
   */
  private class SerializableSchema(@transient private val schema: org.apache.avro.Schema)
      extends Serializable {
    private[this] val stringSchema = schema.toString

    // Parsed on first use and cached. Being transient, the cached value is not serialized
    // and is rebuilt from `stringSchema` on first use after deserialization. This avoids
    // re-parsing the schema on every `get`, which callers invoke once per record.
    @transient private[this] lazy val parsedSchema: org.apache.avro.Schema =
      new org.apache.avro.Schema.Parser().parse(stringSchema)

    /** Returns the Avro schema parsed from the stored JSON string. */
    def get: org.apache.avro.Schema = parsedSchema
  }

  // Workaround BEAM-6742
  /**
   * Builds a Beam `Row` from an Avro record by pairing the Beam fields with the Avro fields
   * in order and converting the record value at each index with
   * `AvroUtils.convertAvroFieldStrict`.
   */
  private def recordtoRow[T <: IndexedRecord](
    schema: BSchema,
    avroSchema: SerializableSchema,
    t: T
  ): Row = {
    val row = Row.withSchema(schema)
    schema.getFields.asScala.zip(avroSchema.get.getFields.asScala).zipWithIndex.foreach {
      case ((f, a), i) =>
        val value = t.get(i)
        val v = AvroUtils.convertAvroFieldStrict(value, a.schema, f.getType)
        row.addValue(v)
    }
    row.build()
  }
}