org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.
Example 1
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

// Commits parquet files directly to the final output location: the work path is the
// output path itself, so task-level setup, commit and abort are all no-ops.
class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  // Job commit only writes the optional parquet summary file and the _SUCCESS marker.
  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)
    LOG.info("Using DirectParquetOutputCommitter to commit parquet files")

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("Could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("Could not write success file for " + outputPath, e)
      }
    }
  }
}
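A minimal usage sketch, not part of the original source: it assumes a Spark build whose Parquet writer honours the spark.sql.parquet.output.committer.class setting, and the application name, master and output path are illustrative only.

import org.apache.spark.sql.SparkSession

// Hypothetical wiring: point Spark's Parquet writer at the direct committer above.
// Whether this config key is honoured depends on the Spark version in use.
val spark = SparkSession.builder()
  .appName("direct-committer-demo")   // illustrative name
  .master("local[*]")
  .config("spark.sql.parquet.output.committer.class",
    "com.indix.utils.spark.parquet.DirectParquetOutputCommitter")
  .getOrCreate()

// Any parquet write in this session would then commit through the direct committer.
spark.range(100).write.parquet("/tmp/direct-committer-demo")   // illustrative path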
Example 2
Source File: SQLHadoopMapReduceCommitProtocol.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
import org.apache.spark.sql.internal.SQLConf

class SQLHadoopMapReduceCommitProtocol(
    jobId: String,
    path: String,
    dynamicPartitionOverwrite: Boolean = false)
  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
    with Serializable with Logging {

  override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
    var committer = super.setupCommitter(context)

    val configuration = context.getConfiguration
    val clazz =
      configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter])

    if (clazz != null) {
      logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}")

      // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat
      // has an associated output committer. To override this output committer,
      // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS.
      // If a data source needs to override the output committer, it needs to set the
      // output committer in prepareForWrite method.
      if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) {
        // The specified output committer is a FileOutputCommitter.
        // So, we will use the FileOutputCommitter-specified constructor.
        val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
        committer = ctor.newInstance(new Path(path), context)
      } else {
        // The specified output committer is just an OutputCommitter.
        // So, we will use the no-argument constructor.
        val ctor = clazz.getDeclaredConstructor()
        committer = ctor.newInstance()
      }
    }
    logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
    committer
  }
}
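A minimal sketch, not part of the original source, of how a custom committer could be routed through this protocol. It assumes an existing SparkSession named spark; the key below mirrors SQLConf.OUTPUT_COMMITTER_CLASS, and the output path is illustrative.

// Assumes an existing SparkSession `spark`. For illustration the key is pointed at the
// stock FileOutputCommitter; in practice you would substitute your own subclass.
spark.conf.set(
  "spark.sql.sources.outputCommitterClass",   // mirrors SQLConf.OUTPUT_COMMITTER_CLASS.key
  classOf[org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter].getName)

// File-based writes then go through SQLHadoopMapReduceCommitProtocol, which instantiates
// the configured class via its (Path, TaskAttemptContext) constructor because the class
// extends FileOutputCommitter.
spark.range(10).write.mode("overwrite").parquet("/tmp/committer-demo")   // illustrative path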
Example 3
Source File: OapIndexCommitProtocolSuite.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)
    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)

    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
}
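The assertion encodes the staging layout the protocol is expected to use: with dir = None, the file returned by newTaskTempFile must sit directly under <outDir>/_temporary_<jobID>/0/_temporary_<jobID>/<attempt>, i.e. inside the per-attempt work directory nested under the job attempt directory.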
Example 4
Source File: DirectParquetOutputCommitter.scala From iolap with Apache License 2.0
package org.apache.spark.sql.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import parquet.Log
import parquet.hadoop.util.ContextUtil
import parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath(): Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception => {
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
          }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
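This variant targets the older Parquet artifacts that predate the move to the org.apache.parquet namespace, hence the parquet.Log, parquet.hadoop.util.ContextUtil and parquet.hadoop.* imports; apart from the package names and the missing info log line, the commit logic is the same as Example 1.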
Example 5
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
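A note on the design choice shared by these direct committers: because commitTask is a no-op and files are written straight to outputPath, the rename step of the standard FileOutputCommitter is avoided, but failed or speculative task attempts can leave partial files in the output directory. This pattern is generally only safe when speculation is disabled and task retries are not a concern.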
Example 6
Source File: SQLHadoopMapReduceCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
import org.apache.spark.sql.internal.SQLConf

class SQLHadoopMapReduceCommitProtocol(
    jobId: String,
    path: String,
    dynamicPartitionOverwrite: Boolean = false)
  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
    with Serializable with Logging {

  override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
    var committer = super.setupCommitter(context)

    val configuration = context.getConfiguration
    val clazz =
      configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter])

    if (clazz != null) {
      logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}")

      // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat
      // has an associated output committer. To override this output committer,
      // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS.
      // If a data source needs to override the output committer, it needs to set the
      // output committer in prepareForWrite method.
      if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) {
        // The specified output committer is a FileOutputCommitter.
        // So, we will use the FileOutputCommitter-specified constructor.
        val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
        committer = ctor.newInstance(new Path(path), context)
      } else {
        // The specified output committer is just an OutputCommitter.
        // So, we will use the no-argument constructor.
        val ctor = clazz.getDeclaredConstructor()
        committer = ctor.newInstance()
      }
    }
    logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
    committer
  }
}
Example 7
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 8
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
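A short note on the design: each task streams its rows into a .ply.<element> part file named after the task split and the write-job UUID, and on close it emits a companion .ply.header sidecar built from the row count accumulated during write and the schema with the fid/pid id columns stripped out, presumably so the header and payload can be assembled into a complete PLY file downstream.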