org.apache.parquet.hadoop.ParquetFileWriter Scala Examples
The following examples show how to use org.apache.parquet.hadoop.ParquetFileWriter.
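Across the examples, ParquetFileWriter appears in two ways: its Mode enum (CREATE or OVERWRITE) is passed to a ParquetWriter builder to control what happens when the target file already exists, and its static writeMetadataFile helper (together with the PARQUET_METADATA_FILE constant) writes or cleans up the _metadata summary file. The minimal sketch below illustrates the first usage; the schema and output path are invented for illustration, and parquet-avro is assumed to be on the classpath.

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.ParquetFileWriter

object MinimalWriteModeSketch extends App {
  // Hypothetical single-field schema, used only for this sketch.
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Example","fields":[{"name":"id","type":"int"}]}""")

  // OVERWRITE replaces an existing file; CREATE (the default) fails if it already exists.
  val writer = AvroParquetWriter.builder[GenericRecord](new Path("/tmp/example.parquet"))
    .withSchema(schema)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .build()

  val record = new GenericData.Record(schema)
  record.put("id", 1)
  writer.write(record)
  writer.close()
}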
Example 1
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

// Commits task output directly to the final output path (no temporary work directory);
// at job commit it optionally writes the _metadata summary file and the _SUCCESS marker.
class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)
    LOG.info("Using DirectParquetOutputCommitter to commit parquet files")

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("Could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("Could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("Could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 2
Source File: IOOps.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SecureIOUtils.AlreadyExistsException
import org.apache.parquet.hadoop.ParquetFileWriter
import org.slf4j.Logger

import scala.concurrent.{ExecutionContext, Future}
import scala.util.Try

trait IOOps {

  protected val logger: Logger

  protected def validateWritePath(path: Path, writeOptions: ParquetWriter.Options): Unit = {
    val fs = path.getFileSystem(writeOptions.hadoopConf)
    try {
      if (fs.exists(path)) {
        if (writeOptions.writeMode == ParquetFileWriter.Mode.CREATE)
          throw new AlreadyExistsException(s"File or directory already exists: $path")
        else {
          if (logger.isDebugEnabled) logger.debug(s"Deleting $path in order to override with new data.")
          fs.delete(path, true)
        }
      }
    } finally fs.close()
  }

  protected def filesAtPath(path: Path, writeOptions: ParquetWriter.Options)
                           (implicit ec: ExecutionContext): Future[List[String]] = Future {
    scala.concurrent.blocking {
      val fs = path.getFileSystem(writeOptions.hadoopConf)
      try {
        val iter = fs.listFiles(path, false)
        Stream
          .continually(Try(iter.next()))
          .takeWhile(_.isSuccess)
          .map(_.get)
          .map(_.getPath.getName)
          .toList
      } finally fs.close()
    }
  }

  protected def filesAtPath(path: String, writeOptions: ParquetWriter.Options)
                           (implicit ec: ExecutionContext): Future[List[String]] =
    filesAtPath(new Path(path), writeOptions)
}
Example 3
Source File: ParquetWriterItSpec.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import java.nio.file.Files

import org.apache.parquet.hadoop.ParquetFileWriter
import org.scalatest.{BeforeAndAfter, FreeSpec, Matchers}

import scala.util.Random

class ParquetWriterItSpec extends FreeSpec with Matchers with BeforeAndAfter {

  case class Record(i: Int, d: Double, s: String)

  object Record {
    def random(n: Int): Seq[Record] =
      (1 to n).map(_ => Record(Random.nextInt(), Random.nextDouble(), Random.nextString(10)))
  }

  private val tempDir = com.google.common.io.Files.createTempDir().toPath.toAbsolutePath
  private val writePath = tempDir.resolve("file.parquet")

  // Generate records and do a single batch write.
  private val records = Record.random(5000)

  private def readRecords: Seq[Record] = {
    val iter = ParquetReader.read[Record](writePath.toString)
    try iter.toSeq
    finally iter.close()
  }

  after {
    // Delete written files
    Files.deleteIfExists(writePath)
  }

  "Batch write should result in proper number of records in the file" in {
    ParquetWriter.writeAndClose(writePath.toString, records)
    readRecords should be(records)
  }

  "Multiple incremental writes produce same result as a single batch write" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    try records.grouped(5).foreach(w.write)
    finally w.close()
    readRecords shouldBe records
  }

  "Writing record by record works as well" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    try records.foreach(record => w.write(record))
    finally w.close()
    readRecords shouldBe records
  }

  "Incremental writes work with write mode OVERWRITE" in {
    val w = ParquetWriter.writer[Record](
      writePath.toString,
      ParquetWriter.Options(ParquetFileWriter.Mode.OVERWRITE))
    try records.grouped(5).foreach(w.write)
    finally w.close()
    readRecords shouldBe records
  }

  "Writing to closed writer throws an exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    w.close()
    an[IllegalStateException] should be thrownBy records
      .grouped(2)
      .foreach(w.write)
  }

  "Closing writer without writing anything to it throws no exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    noException should be thrownBy w.close()
  }

  "Closing writer twice throws no exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    noException should be thrownBy w.close()
    noException should be thrownBy w.close()
  }
}
Example 4
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

object AvroParquetWriterFn extends Logging {

  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
}
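A possible way to call this factory is sketched below; the path and schema are invented, and it assumes AvroParquetWriterFn and eel's ParquetWriterConfig defaults are on the classpath. Because the builder is fixed to ParquetFileWriter.Mode.CREATE, the call fails if the target file already exists.

import io.eels.component.parquet.avro.AvroParquetWriterFn
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path

object AvroParquetWriterFnUsageSketch extends App {
  // Hypothetical Avro schema, used only for this sketch.
  val schema = SchemaBuilder.record("person").fields()
    .requiredString("name")
    .requiredInt("age")
    .endRecord()

  // Mode.CREATE means this fails if /tmp/people.parquet is already present.
  val writer = AvroParquetWriterFn(new Path("/tmp/people.parquet"), schema)
  try {
    val record = new GenericData.Record(schema)
    record.put("name", "alice")
    record.put("age", 30)
    writer.write(record)
  } finally writer.close()
}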
Example 5
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode

object RowParquetWriterFn {

  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] =
      new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
}
Example 6
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 7
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 8
Source File: package.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

package object parquet {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = {
    if (fs.isDirectory(path)) {
      logger.debug(s"$path is a directory, reading constituent files")
      val remote = fs.listFiles(path, false)
      new Iterator[Path] {
        override def hasNext: Boolean = remote.hasNext
        override def next(): Path = remote.next().getPath
      }.toList
    } else {
      logger.debug(s"Reading $path as a single file")
      List(path)
    }
  }

  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    ParquetReader.builder(new StructReadSupport, file)
      .withConf(fs.getConf)
      .build()
  }

  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(if (config.overwrite) {
        ParquetFileWriter.Mode.OVERWRITE
      } else {
        ParquetFileWriter.Mode.CREATE
      }).build()
  }
}
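On the write side, the config.overwrite flag above is what selects between ParquetFileWriter.Mode.OVERWRITE and Mode.CREATE. The reader-side helpers can be exercised with a short sketch like the one below; the table path is invented, and it assumes this package object is importable and a local Hadoop FileSystem is available.

import com.landoop.streamreactor.connect.hive.parquet._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ParquetHelpersUsageSketch extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  // Expand a directory (or single file) into its parquet files,
  // then read every Struct back out of each file until read() returns null.
  listFiles(new Path("/tmp/hive/mytable")).foreach { file =>
    val reader = parquetReader(file)
    try Iterator.continually(reader.read()).takeWhile(_ != null).foreach(println)
    finally reader.close()
  }
}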