org.apache.parquet.avro.AvroParquetWriter Scala Examples

The following examples show how to use org.apache.parquet.avro.AvroParquetWriter. Each example is drawn from an open-source project; its source file, project, and license are noted in the header above it.
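As a quick orientation before the project-specific examples, here is a minimal, self-contained sketch of the builder API they all revolve around. The schema, field names, and output path are invented for illustration and do not come from any of the projects below.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter

// Illustrative Avro schema and output path.
val schema = SchemaBuilder.record("person").fields()
  .requiredString("name").requiredString("job").endRecord()

// Build a writer for GenericRecord values using that schema.
val writer = AvroParquetWriter.builder[GenericRecord](new Path("people.parquet"))
  .withSchema(schema)
  .build()

val record = new GenericData.Record(schema)
record.put("name", "clint eastwood")
record.put("job", "actor")
writer.write(record)
writer.close() // flushes the final row group and writes the Parquet footer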
Example 1
Source File: AvroParquetSourceTest.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        // both matched files contain the same two rows, so as a Set they collapse to two entries
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      // best-effort cleanup of leftovers from previous runs; missing files are fine
      try fs.delete(new Path("merge1.pq"), false) catch { case _: Throwable => }
      try fs.delete(new Path("merge2.pq"), false) catch { case _: Throwable => }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
} 
Example 2
Source File: AvroParquetWriterFn.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}


object AvroParquetWriterFn extends Logging {
  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
} 
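The object above centralizes writer construction so call sites stay small; note that ParquetFileWriter.Mode.CREATE makes build() fail fast if the target file already exists. A minimal call-site sketch, where the path, avroSchema, and records collection are assumptions invented for illustration:

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path

// Hypothetical call site; `avroSchema: Schema` and `records: Seq[GenericRecord]`
// are assumed to exist in the surrounding code.
val writer = AvroParquetWriterFn(new Path("people.pq"), avroSchema)
records.foreach(writer.write)
writer.close()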
Example 3
Source File: AvroParquetReaderFnTest.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import java.util.UUID

import io.eels.component.avro.AvroSchemaFns
import io.eels.component.parquet.avro.AvroParquetReaderFn
import io.eels.schema.{DoubleType, Field, LongType, StructType}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec}

class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(new Configuration())

  private val path = new Path(UUID.randomUUID().toString())

  override def afterAll(): Unit = {
    val fs = FileSystem.get(new Configuration())
    fs.delete(path, false)
  }

  private val avroSchema = SchemaBuilder.record("com.chuckle").fields()
    .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord()

  private val writer = AvroParquetWriter.builder[GenericRecord](path)
    .withSchema(avroSchema)
    .build()

  private val record = new GenericData.Record(avroSchema)
  record.put("str", "wibble")
  record.put("looong", 999L)
  record.put("dooble", 12.34)
  writer.write(record)
  writer.close()

  val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true))

  "AvroParquetReaderFn" should {
    "support projections on doubles" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong"))))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("dooble") shouldBe 12.34
    }
    "support projections on longs" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str"))))
      val record = reader.read()
      reader.close()

      record.get("looong") shouldBe 999L
    }
    "support full projections" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema)))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("looong") shouldBe 999L
      record.get("dooble") shouldBe 12.34

    }
    "support non projections" in {

      val reader = AvroParquetReaderFn(path, None, None)
      val group = reader.read()
      reader.close()

      group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      group.get("looong") shouldBe 999L
      group.get("dooble") shouldBe 12.34

    }
  }
} 
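AvroParquetReaderFn is an eel-sdk helper; plain parquet-avro offers the same column projection through AvroReadSupport. A minimal sketch mirroring the "projections on doubles" case above, reusing the `path` the test writes to and the same record schema (builder(Path) is the older API these examples target):

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}

// Request only "str" and "dooble"; the "looong" column is never materialized.
val projection = SchemaBuilder.record("com.chuckle").fields()
  .requiredString("str").requiredDouble("dooble").endRecord()

val readConf = new Configuration()
AvroReadSupport.setRequestedProjection(readConf, projection)

val reader = AvroParquetReader.builder[GenericRecord](path).withConf(readConf).build()
val rec = reader.read()
reader.close()
rec.get("str").asInstanceOf[Utf8].toString // "wibble"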
Example 4
Source File: ParquetWriterTask.scala    From gearpump-examples   with Apache License 2.0
package io.gearpump.examples.kafka_hdfs_pipeline

import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.examples.kafka_hdfs_pipeline.ParquetWriterTask._
import io.gearpump.streaming.task.{StartTime, Task, TaskContext}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.AvroParquetWriter

import scala.util.{Failure, Success, Try}

class ParquetWriterTask(taskContext : TaskContext, config: UserConfig) extends Task(taskContext, config) {
  val outputFileName = taskContext.appName + ".parquet"
  // resolve "<hdfs home>/gearpump" + configured output dir (default "/parquet") + file name,
  // deleting any previous file at that location before the writer is created
  val absolutePath = Option(getHdfs + config.getString(PARQUET_OUTPUT_DIRECTORY).getOrElse("/parquet") + "/" + outputFileName).map(deleteFile(_)).get
  val outputPath = new Path(absolutePath)
  var parquetWriter = new AvroParquetWriter[SpaceShuttleRecord](outputPath, SpaceShuttleRecord.SCHEMA$)

  def getYarnConf = new YarnConfiguration
  def getFs = FileSystem.get(getYarnConf)
  def getHdfs = new Path(getFs.getHomeDirectory, "gearpump")

  private def deleteFile(fileName: String): String = {
    val file = new Path(fileName)
    // remove any previous output so the writer can create the file fresh
    if (getFs.exists(file)) {
      getFs.delete(file, false)
    }
    fileName
  }

  override def onStart(startTime: StartTime): Unit = {
    LOG.info(s"ParquetWriter.onStart $absolutePath")
  }

  override def onNext(msg: Message): Unit = {
    Try({
      parquetWriter.write(msg.msg.asInstanceOf[SpaceShuttleRecord])
    }) match {
      case Success(ok) =>
      case Failure(throwable) =>
        LOG.error(s"failed ${throwable.getMessage}")
    }
  }

  override def onStop(): Unit = {
    LOG.info("ParquetWriter.onStop")
    parquetWriter.close()
  }
}

object ParquetWriterTask {
  val PARQUET_OUTPUT_DIRECTORY = "parquet.output.directory"
  val PARQUET_WRITER = "parquet.writer"
} 
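The task above uses the direct `new AvroParquetWriter(path, schema)` constructor, which the older parquet-avro releases these examples target still ship; newer releases deprecate it in favour of the builder. A rough builder equivalent, under that assumption:

// Builder-style equivalent of the constructor call in ParquetWriterTask.
val parquetWriter = AvroParquetWriter.builder[SpaceShuttleRecord](outputPath)
  .withSchema(SpaceShuttleRecord.SCHEMA$)
  .build()

One caveat: the builder returns the more general ParquetWriter[SpaceShuttleRecord], so code that declares the field as AvroParquetWriter (as the spec in the next example does when mocking) would need its type adjusted.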
Example 5
Source File: ParquetWriterTaskSpec.scala    From gearpump-examples   with Apache License 2.0
package io.gearpump.examples.kafka_hdfs_pipeline

import akka.actor.ActorSystem
import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.streaming.MockUtil
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter}
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.hadoop.api.ReadSupport
import org.mockito.Mockito
import org.mockito.Mockito._
import org.scalatest.prop.PropertyChecks
import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec}


class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll {

  implicit var system: ActorSystem = ActorSystem("PipeLineSpec")
  val context = MockUtil.mockTaskContext
  val appName = "KafkaHdfsPipeLine"
  when(context.appName).thenReturn(appName)
  val fs = FileSystem.get(new YarnConfiguration)
  val homeDir = fs.getHomeDirectory.toUri.getPath
  val parquetDir = new Path(homeDir, "gearpump")  + "/parquet/"
  val parquetPath = parquetDir + appName + ".parquet"
  val parquetCrc = parquetDir + "." + appName + ".parquet.crc"
  val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]])

  val anomaly = 0.252
  val now = System.currentTimeMillis

  val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet")

  override def afterAll(): Unit = {
    List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete)
    system.shutdown()
  }

  property("ParquetWriterTask should initialize with local parquet file opened for writing") {
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    val path = parquetWriterTask.absolutePath.stripPrefix("file:")
    assert(parquetPath.equals(path))
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should write records to a parquet file") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.parquetWriter = parquetWriter
    parquetWriterTask.onNext(message)
    verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord])
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should have verifiable written record") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.onNext(message)
    parquetWriterTask.onStop
    val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath))
    val record = reader.read()
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly)
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts)
  }
} 
Example 6
Source File: AvroToParquetWriter.scala    From etl-light   with MIT License
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // lazy initialization
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")
  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }

    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }
    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }

}
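Callers drive the lifecycle explicitly: write() as records arrive (the first call lazily creates the underlying AvroParquetWriter from the record's schema), then commit() once per batch to close the temp file and move it into place. A hypothetical driver, with paths and the record source invented for illustration:

import org.apache.avro.generic.GenericRecord

// Hypothetical batch driver; `events: Seq[GenericRecord]` is an assumed input.
val writer = new AvroToParquetWriter("file:/tmp/work/events-0001", "hdfs://nn/data/events/events-0001")
events.foreach(writer.write)
writer.commit() // closes the temp file, backs up any existing output, then copies/renames into place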