org.apache.avro.generic.IndexedRecord Scala Example

Source File: ToTableRow.scala From scio with Apache License 2.0

5 votes

package com.spotify.scio.extra.bigquery

import com.spotify.scio.extra.bigquery.AvroConverters.AvroConversionException

import java.math.{BigDecimal => JBigDecimal}
import java.nio.ByteBuffer
import java.util

import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, IndexedRecord}
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.BaseEncoding
import org.joda.time.format.DateTimeFormat
import org.joda.time.{DateTime, LocalDate, LocalTime}

import scala.jdk.CollectionConverters._


private[bigquery] trait ToTableRow {
  private lazy val encodingPropName: String = "bigquery.bytes.encoder"
  private lazy val base64Encoding: BaseEncoding = BaseEncoding.base64()
  private lazy val hexEncoding: BaseEncoding = BaseEncoding.base16()

  // YYYY-[M]M-[D]D
  private[this] val localDateFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd").withZoneUTC()

  // YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.DDDDDD]]
  private[this] val localTimeFormatter =
    DateTimeFormat.forPattern("HH:mm:ss.SSSSSS")

  // YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.DDDDDD]][time zone]
  private[this] val timestampFormatter =
    DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS")

  private[bigquery] def toTableRowField(fieldValue: Any, field: Schema.Field): Any =
    fieldValue match {
      case x: CharSequence          => x.toString
      case x: Enum[_]               => x.name()
      case x: JBigDecimal           => x.toString
      case x: Number                => x
      case x: Boolean               => x
      case x: GenericFixed          => encodeByteArray(x.bytes(), field.schema())
      case x: ByteBuffer            => encodeByteArray(toByteArray(x), field.schema())
      case x: util.Map[_, _]        => toTableRowFromMap(x.asScala, field)
      case x: java.lang.Iterable[_] => toTableRowFromIterable(x.asScala, field)
      case x: IndexedRecord         => AvroConverters.toTableRow(x)
      case x: LocalDate             => localDateFormatter.print(x)
      case x: LocalTime             => localTimeFormatter.print(x)
      case x: DateTime              => timestampFormatter.print(x)
      case _ =>
        throw AvroConversionException(
          s"ToTableRow conversion failed:" +
            s"could not match ${fieldValue.getClass}"
        )
    }

  private def toTableRowFromIterable(iterable: Iterable[Any], field: Schema.Field): util.List[_] =
    iterable
      .map { item =>
        if (item.isInstanceOf[Iterable[_]] || item.isInstanceOf[Map[_, _]]) {
          throw AvroConversionException(
            s"ToTableRow conversion failed for item $item: " +
              s"iterable and map types not supported"
          )
        }
        toTableRowField(item, field)
      }
      .toList
      .asJava

  private def toTableRowFromMap(map: Iterable[Any], field: Schema.Field): util.List[_] =
    map
      .map {
        case (k, v) =>
          new TableRow()
            .set("key", toTableRowField(k, field))
            .set("value", toTableRowField(v, field))
      }
      .toList
      .asJava

  private def encodeByteArray(bytes: Array[Byte], fieldSchema: Schema): String =
    Option(fieldSchema.getProp(encodingPropName)) match {
      case Some("BASE64") => base64Encoding.encode(bytes)
      case Some("HEX")    => hexEncoding.encode(bytes)
      case Some(encoding) =>
        throw AvroConversionException(s"Unsupported encoding $encoding")
      case None => base64Encoding.encode(bytes)
    }

  private def toByteArray(buffer: ByteBuffer) = {
    val copy = buffer.asReadOnlyBuffer
    val bytes = new Array[Byte](copy.limit)
    copy.rewind
    copy.get(bytes)
    bytes
  }
}

Source File: SCollectionSyntax.scala From scio with Apache License 2.0

5 votes

package com.spotify.scio.extra.bigquery.syntax

import com.google.api.services.bigquery.model.TableReference
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.BigQueryTable.WriteParam
import com.spotify.scio.bigquery.{BigQueryTable, Table, TableRow}
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.util.ScioUtil
import com.spotify.scio.values.SCollection
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{CreateDisposition, WriteDisposition}

import scala.reflect.ClassTag

trait SCollectionSyntax {
  implicit def toAvroToBigQuerySCollection[T <: IndexedRecord: ClassTag](
    data: SCollection[T]
  ): AvroToBigQuerySCollectionOps[T] = new AvroToBigQuerySCollectionOps[T](data)
}

final class AvroToBigQuerySCollectionOps[T <: IndexedRecord: ClassTag](
  private val self: SCollection[T]
) extends Serializable {
  import com.spotify.scio.extra.bigquery.AvroConverters._

  
  @experimental
  def saveAvroAsBigQuery(
    table: TableReference,
    avroSchema: Schema = null,
    writeDisposition: WriteDisposition = null,
    createDisposition: CreateDisposition = null,
    tableDescription: String = null
  ): ClosedTap[TableRow] = {
    val schema: Schema = Option(avroSchema)
      .getOrElse {
        val cls = ScioUtil.classOf[T]
        if (classOf[IndexedRecord] isAssignableFrom cls) {
          cls.getMethod("getClassSchema").invoke(null).asInstanceOf[Schema]
        } else {
          throw AvroConversionException("Could not invoke $SCHEMA on provided Avro type")
        }
      }

    val params =
      WriteParam(toTableSchema(schema), writeDisposition, createDisposition, tableDescription)
    self
      .map(toTableRow(_))
      .write(BigQueryTable(Table.Ref(table)))(params)
  }
}

Source File: AvroConverters.scala From scio with Apache License 2.0

5 votes

package com.spotify.scio.extra.bigquery

import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.annotations.experimental
import com.spotify.scio.bigquery.TableRow
import org.apache.avro.Schema
import org.apache.avro.generic.IndexedRecord

import scala.jdk.CollectionConverters._

object AvroConverters extends ToTableRow with ToTableSchema {

  @experimental
  def toTableRow[T <: IndexedRecord](record: T): TableRow = {
    val row = new TableRow

    record.getSchema.getFields.asScala.foreach { field =>
      Option(record.get(field.pos)).foreach { fieldValue =>
        row.set(field.name, toTableRowField(fieldValue, field))
      }
    }

    row
  }

  
  @experimental
  def toTableSchema(avroSchema: Schema): TableSchema = {
    val fields = getFieldSchemas(avroSchema)

    new TableSchema().setFields(fields.asJava)
  }

  final case class AvroConversionException(
    private val message: String,
    private val cause: Throwable = null
  ) extends Exception(message, cause)
}

Source File: AvroInstances.scala From scio with Apache License 2.0

5 votes

package com.spotify.scio.schemas.instances

import com.spotify.scio.schemas.{RawRecord, Schema}
import org.apache.avro.specific.SpecificRecord
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.beam.sdk.schemas.utils.AvroUtils
import org.apache.beam.sdk.schemas.{AvroRecordSchema, Schema => BSchema}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{Row, TypeDescriptor}

import scala.jdk.CollectionConverters._
import scala.reflect.{classTag, ClassTag}

trait AvroInstances {
  implicit def avroSchema[T <: SpecificRecord: ClassTag]: Schema[T] = {
    // TODO: broken because of a bug upstream https://issues.apache.org/jira/browse/BEAM-6742
    // RawRecord[T](new AvroRecordSchema())
    import org.apache.avro.reflect.ReflectData
    val rc = classTag[T].runtimeClass.asInstanceOf[Class[T]]
    val provider = new AvroRecordSchema()
    val td = TypeDescriptor.of(rc)
    val schema = provider.schemaFor(td)
    val avroSchema =
      new AvroInstances.SerializableSchema(ReflectData.get().getSchema(td.getRawType))

    def fromRow = provider.fromRowFunction(td)

    val toRow: SerializableFunction[T, Row] =
      new SerializableFunction[T, Row] {
        def apply(t: T): Row =
          AvroInstances.recordtoRow(schema, avroSchema, t)
      }
    RawRecord[T](schema, fromRow, toRow)
  }

  def fromAvroSchema(schema: org.apache.avro.Schema): Schema[GenericRecord] = {
    val beamSchema = AvroUtils.toBeamSchema(schema)
    val avroSchema = new AvroInstances.SerializableSchema(schema)
    val toRow = new SerializableFunction[GenericRecord, Row] {
      def apply(t: GenericRecord): Row =
        AvroInstances.recordtoRow[GenericRecord](beamSchema, avroSchema, t)
    }

    val fromRow = new SerializableFunction[Row, GenericRecord] {
      def apply(t: Row): GenericRecord =
        AvroUtils.toGenericRecord(t, avroSchema.get)
    }

    RawRecord[GenericRecord](beamSchema, fromRow, toRow)
  }
}

object AvroInstances {
  private class SerializableSchema(@transient private val schema: org.apache.avro.Schema)
      extends Serializable {
    private[this] val stringSchema = schema.toString
    def get: org.apache.avro.Schema = new org.apache.avro.Schema.Parser().parse(stringSchema)
  }

  // Workaround BEAM-6742
  private def recordtoRow[T <: IndexedRecord](
    schema: BSchema,
    avroSchema: SerializableSchema,
    t: T
  ): Row = {
    val row = Row.withSchema(schema)
    schema.getFields.asScala.zip(avroSchema.get.getFields.asScala).zipWithIndex.foreach {
      case ((f, a), i) =>
        val value = t.get(i)
        val v = AvroUtils.convertAvroFieldStrict(value, a.schema, f.getType)
        row.addValue(v)
    }
    row.build()
  }
}

org.apache.avro.generic.IndexedRecord Scala Examples