org.apache.parquet.hadoop.ParquetFileWriter Scala Examples
The following examples show how to use org.apache.parquet.hadoop.ParquetFileWriter.
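Across the examples, ParquetFileWriter appears in two ways: its Mode enum (CREATE or OVERWRITE) is passed to a ParquetWriter builder to control what happens when the target file already exists, and its static writeMetadataFile helper (together with the PARQUET_METADATA_FILE constant) writes or cleans up the _metadata summary file. The minimal sketch below illustrates the first usage; the schema and output path are invented for illustration, and parquet-avro is assumed to be on the classpath.

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.ParquetFileWriter

object MinimalWriteModeSketch extends App {
  // Hypothetical single-field schema, used only for this sketch.
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"Example","fields":[{"name":"id","type":"int"}]}""")

  // OVERWRITE replaces an existing file; CREATE (the default) fails if it already exists.
  val writer = AvroParquetWriter.builder[GenericRecord](new Path("/tmp/example.parquet"))
    .withSchema(schema)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .build()

  val record = new GenericData.Record(schema)
  record.put("id", 1)
  writer.write(record)
  writer.close()
}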
Example 1
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

// Commits task output directly to the final output path (no temporary work directory);
// at job commit it optionally writes the _metadata summary file and the _SUCCESS marker.
class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)
    LOG.info("Using DirectParquetOutputCommitter to commit parquet files")

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("Could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("Could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("Could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 2
Source File: IOOps.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SecureIOUtils.AlreadyExistsException
import org.apache.parquet.hadoop.ParquetFileWriter
import org.slf4j.Logger

import scala.concurrent.{ExecutionContext, Future}
import scala.util.Try

trait IOOps {

  protected val logger: Logger

  protected def validateWritePath(path: Path, writeOptions: ParquetWriter.Options): Unit = {
    val fs = path.getFileSystem(writeOptions.hadoopConf)
    try {
      if (fs.exists(path)) {
        if (writeOptions.writeMode == ParquetFileWriter.Mode.CREATE)
          throw new AlreadyExistsException(s"File or directory already exists: $path")
        else {
          if (logger.isDebugEnabled) logger.debug(s"Deleting $path in order to override with new data.")
          fs.delete(path, true)
        }
      }
    } finally fs.close()
  }

  protected def filesAtPath(path: Path, writeOptions: ParquetWriter.Options)
                           (implicit ec: ExecutionContext): Future[List[String]] = Future {
    scala.concurrent.blocking {
      val fs = path.getFileSystem(writeOptions.hadoopConf)
      try {
        val iter = fs.listFiles(path, false)
        Stream
          .continually(Try(iter.next()))
          .takeWhile(_.isSuccess)
          .map(_.get)
          .map(_.getPath.getName)
          .toList
      } finally fs.close()
    }
  }

  protected def filesAtPath(path: String, writeOptions: ParquetWriter.Options)
                           (implicit ec: ExecutionContext): Future[List[String]] =
    filesAtPath(new Path(path), writeOptions)
}
Example 3
Source File: ParquetWriterItSpec.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import java.nio.file.Files

import org.apache.parquet.hadoop.ParquetFileWriter
import org.scalatest.{BeforeAndAfter, FreeSpec, Matchers}

import scala.util.Random

class ParquetWriterItSpec extends FreeSpec with Matchers with BeforeAndAfter {

  case class Record(i: Int, d: Double, s: String)

  object Record {
    def random(n: Int): Seq[Record] =
      (1 to n).map(_ => Record(Random.nextInt(), Random.nextDouble(), Random.nextString(10)))
  }

  private val tempDir = com.google.common.io.Files.createTempDir().toPath.toAbsolutePath
  private val writePath = tempDir.resolve("file.parquet")

  // Generate records and do a single batch write.
  private val records = Record.random(5000)

  private def readRecords: Seq[Record] = {
    val iter = ParquetReader.read[Record](writePath.toString)
    try iter.toSeq
    finally iter.close()
  }

  after {
    // Delete written files
    Files.deleteIfExists(writePath)
  }

  "Batch write should result in proper number of records in the file" in {
    ParquetWriter.writeAndClose(writePath.toString, records)
    readRecords should be(records)
  }

  "Multiple incremental writes produce same result as a single batch write" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    try records.grouped(5).foreach(w.write)
    finally w.close()
    readRecords shouldBe records
  }

  "Writing record by record works as well" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    try records.foreach(record => w.write(record))
    finally w.close()
    readRecords shouldBe records
  }

  "Incremental writes work with write mode OVERWRITE" in {
    val w = ParquetWriter.writer[Record](
      writePath.toString,
      ParquetWriter.Options(ParquetFileWriter.Mode.OVERWRITE))
    try records.grouped(5).foreach(w.write)
    finally w.close()
    readRecords shouldBe records
  }

  "Writing to closed writer throws an exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    w.close()
    an[IllegalStateException] should be thrownBy records
      .grouped(2)
      .foreach(w.write)
  }

  "Closing writer without writing anything to it throws no exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    noException should be thrownBy w.close()
  }

  "Closing writer twice throws no exception" in {
    val w = ParquetWriter.writer[Record](writePath.toString)
    noException should be thrownBy w.close()
    noException should be thrownBy w.close()
  }
}
Example 4
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

object AvroParquetWriterFn extends Logging {

  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
}
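A possible way to call this factory is sketched below; the path and schema are invented, and it assumes AvroParquetWriterFn and eel's ParquetWriterConfig defaults are on the classpath. Because the builder is fixed to ParquetFileWriter.Mode.CREATE, the call fails if the target file already exists.

import io.eels.component.parquet.avro.AvroParquetWriterFn
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path

object AvroParquetWriterFnUsageSketch extends App {
  // Hypothetical Avro schema, used only for this sketch.
  val schema = SchemaBuilder.record("person").fields()
    .requiredString("name")
    .requiredInt("age")
    .endRecord()

  // Mode.CREATE means this fails if /tmp/people.parquet is already present.
  val writer = AvroParquetWriterFn(new Path("/tmp/people.parquet"), schema)
  try {
    val record = new GenericData.Record(schema)
    record.put("name", "alice")
    record.put("age", 30)
    writer.write(record)
  } finally writer.close()
}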
Example 5
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode

object RowParquetWriterFn {

  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] =
      new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
}
Example 6
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 7
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception =>
          LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 8
Source File: package.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

package object parquet {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = {
    if (fs.isDirectory(path)) {
      logger.debug(s"$path is a directory, reading constituent files")
      val remote = fs.listFiles(path, false)
      new Iterator[Path] {
        override def hasNext: Boolean = remote.hasNext
        override def next(): Path = remote.next().getPath
      }.toList
    } else {
      logger.debug(s"Reading $path as a single file")
      List(path)
    }
  }

  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    ParquetReader.builder(new StructReadSupport, file)
      .withConf(fs.getConf)
      .build()
  }

  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(if (config.overwrite) {
        ParquetFileWriter.Mode.OVERWRITE
      } else {
        ParquetFileWriter.Mode.CREATE
      }).build()
  }
}
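On the write side, the config.overwrite flag above is what selects between ParquetFileWriter.Mode.OVERWRITE and Mode.CREATE. The reader-side helpers can be exercised with a short sketch like the one below; the table path is invented, and it assumes this package object is importable and a local Hadoop FileSystem is available.

import com.landoop.streamreactor.connect.hive.parquet._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ParquetHelpersUsageSketch extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  // Expand a directory (or single file) into its parquet files,
  // then read every Struct back out of each file until read() returns null.
  listFiles(new Path("/tmp/hive/mytable")).foreach { file =>
    val reader = parquetReader(file)
    try Iterator.continually(reader.read()).takeWhile(_ != null).foreach(println)
    finally reader.close()
  }
}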