org.apache.avro.file.DataFileWriter Scala Examples

The following examples show how to use org.apache.avro.file.DataFileWriter. Each example is taken from an open-source project; the originating project, source file, and license are noted above each example.
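Every example below follows the same core DataFileWriter lifecycle: construct a DataFileWriter around a DatumWriter, call create with a schema and a sink (a File or an OutputStream), append datums, and close. As a minimal self-contained sketch (the User schema and file name are illustrative, not taken from the projects below):

import java.io.File
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}

object MinimalDataFileWriterExample extends App {
  // Illustrative schema: a record with one string field.
  val schema: Schema =
    SchemaBuilder.record("User").fields().requiredString("name").endRecord()

  val record = new GenericData.Record(schema)
  record.put("name", "avro")

  val writer = new DataFileWriter[GenericRecord](new GenericDatumWriter[GenericRecord](schema))
  writer.create(schema, new File("users.avro")) // writes the container-file header, including the schema
  writer.append(record)                         // appends the datum to the current block
  writer.close()                                // flushes the last block and closes the sink
}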
Example 1
Source File: StandardTestUtil.scala    From sbt-avrohugger   with Apache License 2.0
package test

import java.io.File

import org.apache.avro.file.{ DataFileReader, DataFileWriter }
import org.apache.avro.generic.{
  GenericRecord,
  GenericDatumReader,
  GenericDatumWriter }


import org.specs2.mutable.Specification

object StandardTestUtil extends Specification {

  def write(file: File, records: List[GenericRecord]) = {
    val userDatumWriter = new GenericDatumWriter[GenericRecord]
    val dataFileWriter = new DataFileWriter[GenericRecord](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read(file: File, records: List[GenericRecord]) = {
    // Open the file once to read the writer's schema from the header, then close it.
    val schemaReader = new DataFileReader(file, new GenericDatumReader[GenericRecord])
    val schema = schemaReader.getSchema
    schemaReader.close()
    val userDatumReader = new GenericDatumReader[GenericRecord](schema)
    val dataFileReader = new DataFileReader[GenericRecord](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the GenericDatumReader.
    var record: GenericRecord = null.asInstanceOf[GenericRecord]
    var sameRecord: GenericRecord = null.asInstanceOf[GenericRecord]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next()
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead(records: List[GenericRecord]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

} 
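The read method above deliberately reuses one mutable record via dataFileReader.next(sameRecord), mirroring how org.apache.avro.mapred.AvroInputFormat consumes a file. When that reuse is not needed, DataFileReader also implements java.lang.Iterable, so a more idiomatic (but per-record allocating) loop is possible; a sketch:

import scala.jdk.CollectionConverters._
import org.apache.avro.file.DataFileReader
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}

def readAll(file: java.io.File): List[GenericRecord] = {
  val reader = new DataFileReader[GenericRecord](file, new GenericDatumReader[GenericRecord]())
  try reader.iterator().asScala.toList // next() without a reuse argument allocates a fresh record
  finally reader.close()
}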
Example 2
Source File: AvroDataOutputStream.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s

import java.io.OutputStream

import org.apache.avro.Schema
import org.apache.avro.file.{CodecFactory, DataFileWriter}
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}


case class AvroDataOutputStream[T](os: OutputStream,
                                   codec: CodecFactory)
                                  (implicit encoder: Encoder[T]) extends AvroOutputStream[T] {

  val resolved = encoder.resolveEncoder()

  // Top-level primitive schemas are appended directly; any other schema is
  // first encoded to a GenericRecord and then appended.
  val (writer, writeFn) = resolved.schema.getType match {
    case Schema.Type.DOUBLE | Schema.Type.LONG | Schema.Type.BOOLEAN | Schema.Type.STRING | Schema.Type.INT | Schema.Type.FLOAT =>
      val datumWriter = new GenericDatumWriter[T](resolved.schema)
      val dataFileWriter = new DataFileWriter[T](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => dataFileWriter.append(t))
    case _ =>
      val datumWriter = new GenericDatumWriter[GenericRecord](resolved.schema)
      val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => {
        val record = resolved.encode(t).asInstanceOf[GenericRecord]
        dataFileWriter.append(record)
      })
  }

  override def close(): Unit = {
    flush()
    writer.close()
  }

  override def write(t: T): Unit = {
    writeFn(t)
  }

  override def flush(): Unit = writer.flush()
  override def fSync(): Unit = writer.fSync()
} 
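A hypothetical use of AvroDataOutputStream (the Person case class and file name are assumptions; this presumes avro4s can derive the implicit Encoder[Person]):

import java.io.FileOutputStream
import org.apache.avro.file.CodecFactory

case class Person(name: String, age: Int)

val os = new FileOutputStream("people.avro")
val stream = AvroDataOutputStream[Person](os, CodecFactory.deflateCodec(6))
stream.write(Person("Ada", 36)) // encoded to a GenericRecord, then appended
stream.close()                  // flushes, then closes the underlying DataFileWriter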
Example 3
Source File: AvroIO.scala    From ratatool   with Apache License 2.0
package com.spotify.ratatool.io

import java.io.{File, InputStream, OutputStream}
import java.nio.ByteBuffer
import java.nio.channels.SeekableByteChannel

import com.google.common.io.ByteStreams
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DatumReader, DatumWriter}
import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.io.fs.MatchResult.Metadata

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag


object AvroIO {

  // Note: `createDatumWriter[T]` is defined elsewhere in this object; judging
  // from the imports above, it selects a specific, reflect, or generic
  // DatumWriter based on the runtime class of T.
  def writeToOutputStream[T: ClassTag](data: Iterable[T],
                                       schema: Schema,
                                       os: OutputStream): Unit = {
    val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os)
    data.foreach(fileWriter.append)
    fileWriter.close()
  }

  def getAvroSchemaFromFile(path: String): Schema = {
    require(FileStorage(path).exists, s"File `$path` does not exist!")
    val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro"))
    require(files.nonEmpty, s"File `$path` does not contain avro files")
    val reader = new GenericDatumReader[GenericRecord]()
    val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader)
    dfr.getSchema
  }

  private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput {
    require(meta.isReadSeekEfficient)
    private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel]
    override def read(b: Array[Byte], off: Int, len: Int): Int =
      in.read(ByteBuffer.wrap(b, off, len))
    override def tell(): Long = in.position()
    override def length(): Long = in.size()
    override def seek(p: Long): Unit = in.position(p)
    override def close(): Unit = in.close()
  }

} 
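A sketch of calling writeToOutputStream (the records and schema values are assumptions, e.g. built as in the other examples):

import java.io.ByteArrayOutputStream
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

def writeToBytes(records: Iterable[GenericRecord], schema: Schema): Array[Byte] = {
  val out = new ByteArrayOutputStream()
  AvroIO.writeToOutputStream[GenericRecord](records, schema, out) // a complete container file, in memory
  out.toByteArray
}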
Example 4
Source File: DefaultFrameWriter.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.avro

import java.io.ByteArrayOutputStream
import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic.{GenericData, GenericDatumWriter}
import SchemaConverter._
import ml.combust.mleap.runtime.frame.LeapFrame
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, FrameWriter}
import resource._

import scala.util.{Failure, Try}


class DefaultFrameWriter[LF <: LeapFrame[LF]](frame: LF) extends FrameWriter {
  val valueConverter = ValueConverter()

  override def toBytes(charset: Charset = BuiltinFormats.charset): Try[Array[Byte]] = {
    (for(out <- managed(new ByteArrayOutputStream())) yield {
      // One converter per field, mapping MLeap values to their Avro representation
      val fieldWriters = frame.schema.fields.map(_.dataType).map(valueConverter.mleapToAvro)
      val avroSchema = frame.schema: Schema
      val record = new GenericData.Record(avroSchema)
      val datumWriter = new GenericDatumWriter[GenericData.Record](avroSchema)
      val writer = new DataFileWriter[GenericData.Record](datumWriter)
      writer.create(avroSchema, out)

      for(row <- frame.collect()) {
        var i = 0
        for(fieldWriter <- fieldWriters) {
          record.put(i, fieldWriter(row.getRaw(i)))
          i = i + 1
        }

        Try(writer.append(record)) match {
          case Failure(error) => error.printStackTrace()
          case _ =>
        }
      }

      writer.close()

      out.toByteArray
    }).tried
  }
} 
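A sketch of using the writer (DefaultLeapFrame is MLeap's usual concrete LeapFrame; the frame value is assumed to be built elsewhere):

import scala.util.Try
import ml.combust.mleap.runtime.frame.DefaultLeapFrame

def frameToAvroBytes(frame: DefaultLeapFrame): Try[Array[Byte]] =
  new DefaultFrameWriter(frame).toBytes()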
Example 5
Source File: StandardDefaultValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0
import test._
import org.specs2.mutable.Specification
import com.sksamuel.avro4s.RecordFormat
import java.io.File
import shapeless.Inl

import org.apache.avro.file.{ DataFileReader, DataFileWriter }
import org.apache.avro.generic.{
  GenericRecord,
  GenericDatumReader,
  GenericDatumWriter}
  
class StandardDefaultValuesSpec extends Specification {
  skipAll
  "A case class with default values" should {
    "deserialize correctly" in {
    
      val format = RecordFormat[DefaultTest]
      val record = DefaultTest()
      val avro = format.to(record)
      val sameRecord = format.from(avro)
      sameRecord.suit === DefaultEnum.SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      // sameRecord.optionStringValue === Some("default")
      // sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
      // sameRecord.defaultEither === Left(2)
      // sameRecord.defaultCoproduct === Inl(3)
    }
  }

} 
Example 6
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    // Open the file once to read the writer's schema from the header, then close it.
    val schemaReader = new DataFileReader(file, new GenericDatumReader[GenericRecord])
    val schema = schemaReader.getSchema
    schemaReader.close()
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next()
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 7
Source File: AvroWriter.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.avro

import java.io.OutputStream
import java.util.concurrent.atomic.AtomicInteger

import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic
import org.apache.avro.generic.GenericRecord

class AvroWriter(structType: StructType, out: OutputStream) {
  
  private val schema = AvroSchemaFns.toAvroSchema(structType)
  private val datumWriter = new generic.GenericDatumWriter[GenericRecord](schema)
  private val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
  private val serializer = new RowSerializer(schema)
  private val _records = new AtomicInteger(0)

  dataFileWriter.create(schema, out)

  def write(row: Row): Unit = {
    val record = serializer.serialize(row)
    dataFileWriter.append(record)
    _records.incrementAndGet()
  }

  def records: Int = _records.get()

  def close(): Unit = {
    dataFileWriter.flush()
    dataFileWriter.close()
  }
} 
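A hypothetical use of AvroWriter (the structType and rows values are assumptions):

import java.io.FileOutputStream
import io.eels.Row
import io.eels.schema.StructType

def writeRows(structType: StructType, rows: Seq[Row]): Int = {
  val writer = new AvroWriter(structType, new FileOutputStream("rows.avro"))
  rows.foreach(writer.write)
  writer.close()
  writer.records // number of rows appended
}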