org.apache.hadoop.mapreduce.TaskAttemptContext Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.TaskAttemptContext.
You can go to the original project or source file by following the links above each example.
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
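A minimal, self-contained sketch of the same idea: look up the codec extension from the task configuration and append it when naming an output file. The object, method, and file names below are illustrative, not part of drizzle-spark.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

// Hypothetical helper: resolve the codec extension (".gz" for the default GzipCodec)
// and append it to a base file name, mirroring TextOutputFormat.getDefaultWorkFile.
object CompressionExtensionDemo {
  def compressedWorkFile(context: TaskAttemptContext, dir: String, base: String): Path = {
    val extension =
      if (FileOutputFormat.getCompressOutput(context)) {
        val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
        ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
      } else {
        ""
      }
    new Path(dir, base + extension) // e.g. part-00000.txt.gz when compression is enabled
  }
}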
Example 2
Source File: SparkHadoopMapReduceUtil.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int) = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
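A rough usage sketch of the trait above: build a TaskAttemptContext without hard-coding a Hadoop version. The object name and the package declaration (needed only because the trait is private[spark]) are illustrative additions, not part of the original source.

package org.apache.spark.mapreduce // required for access to the private[spark] trait

import org.apache.hadoop.conf.Configuration

// Illustrative driver: create a TaskAttemptID and TaskAttemptContext via the reflection helpers.
object TaskAttemptContextDemo extends SparkHadoopMapReduceUtil {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val attemptId = newTaskAttemptID("demo", jobId = 0, isMap = true, taskId = 0, attemptId = 0)
    val context = newTaskAttemptContext(conf, attemptId)
    // Prints something like attempt_demo_0000_m_000000_0
    println(context.getTaskAttemptID)
  }
}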
Example 3
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0F, 0F, 0F),
  scale: Array[Double] = Array(0.01F, 0.01F, 0.01F)
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdr to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
}
Example 4
Source File: WholeTextFileRecordReader.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 5
Source File: WholeTextFileInputFormat.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(
      totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
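A worked example of the split-size arithmetic above, with illustrative numbers: the maximum split size is the total input length divided by the requested minimum number of partitions, so CombineFileInputFormat ends up packing the files into at least that many splits.

// Illustrative values only; the real sizes come from listStatus(context).
val totalLen = 1000000000L // ~1 GB of whole text files
val minPartitions = 8
val maxSplitSize = Math.ceil(totalLen * 1.0 / minPartitions).toLong
// maxSplitSize == 125000000 bytes, i.e. roughly 8 combined splits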
Example 6
Source File: CarbonTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.tasklisteners

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.RecordReaderIterator
import org.apache.spark.util.TaskCompletionListener

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.memory.UnsafeMemoryManager
import org.apache.carbondata.core.util.{DataTypeUtil, ThreadLocalTaskInfo}
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable

trait CarbonCompactionTaskCompletionListener extends TaskCompletionListener

case class CarbonQueryTaskCompletionListenerImpl(iter: RecordReaderIterator[InternalRow],
    freeMemory: Boolean = false) extends CarbonQueryTaskCompletionListener {
  override def onTaskCompletion(context: TaskContext): Unit = {
    if (iter != null) {
      try {
        iter.close()
      } catch {
        case e: Exception =>
          LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e)
      }
    }
    if (freeMemory) {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
    }
    DataTypeUtil.clearFormatter()
  }
}

case class CarbonLoadTaskCompletionListenerImpl(
    recordWriter: RecordWriter[NullWritable, ObjectArrayWritable],
    taskAttemptContext: TaskAttemptContext) extends CarbonLoadTaskCompletionListener {

  override def onTaskCompletion(context: TaskContext): Unit = {
    try {
      recordWriter.close(taskAttemptContext)
    } finally {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
      DataTypeUtil.clearFormatter()
    }
  }
}
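These listeners are meant to be registered on the task that owns the reader or writer. A hedged sketch of what that registration could look like, assuming the iterator has already been created for the current split; the helper object and method names are illustrative, not part of CarbonData.

import org.apache.spark.TaskContext
import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonQueryTaskCompletionListenerImpl
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.RecordReaderIterator

// Hypothetical helper: attach the query listener to the running task so the reader is
// closed and Carbon off-heap memory is freed when the task finishes.
object CarbonListenerRegistration {
  def registerQueryCleanup(carbonIterator: RecordReaderIterator[InternalRow]): Unit = {
    val taskContext = TaskContext.get() // null when called outside a running task
    if (taskContext != null) {
      taskContext.addTaskCompletionListener(
        CarbonQueryTaskCompletionListenerImpl(carbonIterator, freeMemory = true))
    }
  }
}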
Example 7
Source File: CommitFailureTestSource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 8
Source File: ManifestFileCommitProtocol.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 9
Source File: WholeTextFileRecordReader.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 10
Source File: DirectParquetOutputCommitter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import parquet.Log
import parquet.hadoop.util.ContextUtil
import parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath(): Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception => {
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
          }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 11
Source File: SparkHadoopMapReduceUtil.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
Example 12
Source File: WholeTextFileRecordReader.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 13
Source File: WholeTextFileInputFormat.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(
      totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 14
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 15
Source File: SparkHadoopMapReduceUtil.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

import org.apache.spark.util.Utils

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Utils.classForName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Utils.classForName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Utils.classForName(first)
    } catch {
      case e: ClassNotFoundException =>
        Utils.classForName(second)
    }
  }
}
Example 16
Source File: WholeTextFileRecordReader.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 17
Source File: WholeTextFileInputFormat.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(
      totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 18
Source File: CommitFailureTestSource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, dataSchema, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: InternalRow): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 19
Source File: OrcOutputWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.orc.mapred.OrcStruct
import org.apache.orc.mapreduce.OrcOutputFormat

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types._

private[orc] class OrcOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext)
  extends OutputWriter {

  private[this] val serializer = new OrcSerializer(dataSchema)

  private val recordWriter = {
    new OrcOutputFormat[OrcStruct]() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        new Path(path)
      }
    }.getRecordWriter(context)
  }

  override def write(row: InternalRow): Unit = {
    recordWriter.write(NullWritable.get(), serializer.serialize(row))
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
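In the data source, one such writer is created per task by an OutputWriterFactory. The simplified factory below is an illustrative sketch of that wiring, not the actual Spark source; the class name is invented, and the package declaration is only there because OrcOutputWriter is private[orc].

package org.apache.spark.sql.execution.datasources.orc // for access to the private[orc] writer

import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

// Hypothetical, simplified factory: one OrcOutputWriter per task output file.
class OrcWriterFactorySketch extends OutputWriterFactory {
  override def getFileExtension(context: TaskAttemptContext): String = ".orc"

  override def newInstance(
      path: String,
      dataSchema: StructType,
      context: TaskAttemptContext): OutputWriter =
    new OrcOutputWriter(path, dataSchema, context)
}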
Example 20
Source File: SQLHadoopMapReduceCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
import org.apache.spark.sql.internal.SQLConf

class SQLHadoopMapReduceCommitProtocol(
    jobId: String,
    path: String,
    dynamicPartitionOverwrite: Boolean = false)
  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
    with Serializable with Logging {

  override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
    var committer = super.setupCommitter(context)

    val configuration = context.getConfiguration
    val clazz =
      configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter])

    if (clazz != null) {
      logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}")

      // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat
      // has an associated output committer. To override this output committer,
      // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS.
      // If a data source needs to override the output committer, it needs to set the
      // output committer in prepareForWrite method.
      if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) {
        // The specified output committer is a FileOutputCommitter.
        // So, we will use the FileOutputCommitter-specified constructor.
        val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
        committer = ctor.newInstance(new Path(path), context)
      } else {
        // The specified output committer is just an OutputCommitter.
        // So, we will use the no-argument constructor.
        val ctor = clazz.getDeclaredConstructor()
        committer = ctor.newInstance()
      }
    }
    logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
    committer
  }
}
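A hedged usage sketch: pointing the protocol at a custom committer through the SQL config it reads. The committer class name and output path are placeholders for illustration only.

import org.apache.spark.sql.SparkSession

// com.example.MyFileOutputCommitter is a placeholder for a user-supplied committer class.
val spark = SparkSession.builder()
  .appName("custom-committer-demo")
  .master("local[*]")
  // the property name behind SQLConf.OUTPUT_COMMITTER_CLASS
  .config("spark.sql.sources.outputCommitterClass", "com.example.MyFileOutputCommitter")
  .getOrCreate()

// File-based writes now go through SQLHadoopMapReduceCommitProtocol, whose setupCommitter
// instantiates the configured class instead of the default committer.
spark.range(10).write.mode("overwrite").parquet("/tmp/custom-committer-demo")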
Example 21
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 22
Source File: WholeTextFileRecordReader.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 23
Source File: SageMakerProtobufFileFormat.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
}
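Because the format registers the short name "sagemaker", it can be selected by name from the DataFrame writer. A hedged sketch under the assumption that the DataFrame has a Double label column and an ML Vector features column; the column names, bucket, and path are placeholders.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("sagemaker-protobuf-demo").getOrCreate()

// Toy DataFrame with the two columns the writer reads via its options.
val df = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(0.1, 0.2)),
  (0.0, Vectors.dense(0.4, 0.5))
)).toDF("label", "features")

df.write
  .format("sagemaker")                    // matches shortName() above
  .option("labelColumnName", "label")     // options forwarded to SageMakerProtobufWriter
  .option("featuresColumnName", "features")
  .save("s3://my-bucket/training-data")   // placeholder destination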
Example 24
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType

  def write(row: Row): Unit = {
    val labelColumnName = options.getOrElse("labelColumnName", "label")
    val featuresColumnName = options.getOrElse("featuresColumnName", "features")

    val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
    record.writeTo(byteArrayOutputStream)

    recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
    byteArrayOutputStream.reset()
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
Example 25
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter

class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream: FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)
    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)
    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)
    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)
    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)
    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)
    verify(mockFileSystem).create(mockPath, true)
  }
}
Example 26
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 27
Source File: SparkHadoopMapReduceUtil.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

import org.apache.spark.util.Utils

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.JobContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.JobContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
      "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl", // hadoop2, hadoop2-yarn
      "org.apache.hadoop.mapreduce.TaskAttemptContext")          // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Utils.classForName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Utils.classForName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Utils.classForName(first)
    } catch {
      case e: ClassNotFoundException =>
        Utils.classForName(second)
    }
  }
}
Example 28
Source File: WholeTextFileRecordReader.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 29
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
Example 30
Source File: CommitFailureTestSource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          stagingDir: String,
          fileNamePrefix: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override val path: String = new Path(stagingDir, fileNamePrefix).toString

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }
    }

  override def shortName(): String = "commit-failure-test"
}
Example 31
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel

import java.math.BigDecimal
import java.sql.Date
import java.sql.Timestamp
import java.text.DateFormat
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types._

import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO
import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration
import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil
import org.zuinnote.hadoop.office.format.mapreduce._

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log
import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration
import java.util.Locale
import java.text.DecimalFormat
import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO
import java.text.NumberFormat

// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
private[excel] class ExcelOutputWriter(
  path: String,
  dataSchema: StructType,
  context: TaskAttemptContext,
  options: Map[String, String]) extends OutputWriter {

  def write(row: Row): Unit = {
    // check useHeader
    if (useHeader) {
      val headers = row.schema.fieldNames
      var i = 0
      for (x <- headers) {
        val headerColumnSCD = new SpreadSheetCellDAO(x, "", "",
          MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName)
        recordWriter.write(NullWritable.get(), headerColumnSCD)
        i += 1
      }
      currentRowNum += 1
      useHeader = false
    }
    // for each value in the row
    if (row.size > 0) {
      var currentColumnNum = 0;
      val simpleObject = new Array[AnyRef](row.size)
      for (i <- 0 to row.size - 1) { // for each element of the row
        val obj = row.get(i)
        if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length == 5)) {
          val formattedValue = obj.asInstanceOf[Seq[String]](0)
          val comment = obj.asInstanceOf[Seq[String]](1)
          val formula = obj.asInstanceOf[Seq[String]](2)
          val address = obj.asInstanceOf[Seq[String]](3)
          val sheetName = obj.asInstanceOf[Seq[String]](4)
          simpleObject(i) = new SpreadSheetCellDAO(formattedValue, comment, formula, address, sheetName)
        } else {
          simpleObject(i) = obj.asInstanceOf[AnyRef]
        }
      }
      // convert row to spreadsheetcellDAO
      val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(
        simpleObject, defaultSheetName, currentRowNum)
      // write it
      for (x <- spreadSheetCellDAORow) {
        recordWriter.write(NullWritable.get(), x)
      }
    }
    currentRowNum += 1
  }

  override def close(): Unit = {
    recordWriter.close(context)
    currentRowNum = 0;
  }
}
Example 32
Source File: PortableDataStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.input import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import scala.collection.JavaConverters._ import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} def toArray(): Array[Byte] = { val stream = open() try { ByteStreams.toByteArray(stream) } finally { Closeables.close(stream, true) } } def getPath(): String = path }
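A PortableDataStream is normally obtained from SparkContext.binaryFiles, and toArray() materializes one whole file into memory. A minimal usage sketch (the input path and app name are placeholders):

import org.apache.spark.sql.SparkSession

object BinaryFilesExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("binary-files").getOrCreate()
    val sc = spark.sparkContext
    // binaryFiles returns RDD[(String, PortableDataStream)]
    val bytesPerFile = sc.binaryFiles("hdfs:///data/blobs").mapValues(_.toArray())
    bytesPerFile.take(1).foreach { case (path, bytes) =>
      println(s"$path -> ${bytes.length} bytes")
    }
    spark.stop()
  }
}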
Example 33
Source File: WholeTextFileRecordReader.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.input import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit} private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 34
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0 | 5 votes |
package de.valtech.foss import scala.io.Source import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable} import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.FileInputFormat object RosbagInputFormat { def getRosChunkIdx(context: JobContext): String = { context.getConfiguration.get("RosbagInputFormat.chunkIdx") } def getBlockSize(context: JobContext): Long = { context.getConfiguration.get("dfs.blocksize").toLong } } class RosbagBytesInputFormat extends FileInputFormat[LongWritable, BytesWritable] { private var rosChunkIdx = "" private var recordLength = -1L override def isSplitable(context: JobContext, filename: Path): Boolean = { rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context) recordLength = RosbagInputFormat.getBlockSize(context) true } override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = { val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize) defaultSize } override def createRecordReader(split: InputSplit, context: TaskAttemptContext) : RecordReader[LongWritable, BytesWritable] = { new RosbagBytesRecordReader } } class RosbagMapInputFormat extends FileInputFormat[LongWritable, MapWritable] { private var rosChunkIdx = "" private var recordLength = -1L override def isSplitable(context: JobContext, filename: Path): Boolean = { rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context) recordLength = RosbagInputFormat.getBlockSize(context) true } override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = { val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize) defaultSize } override def createRecordReader(split: InputSplit, context: TaskAttemptContext) : RecordReader[LongWritable, MapWritable] = { new RosbagMapRecordReader } }
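A hedged sketch of feeding this input format to Spark: the chunk index path and bag file below are placeholders, and the configuration key is the one read by RosbagInputFormat.getRosChunkIdx above.

import de.valtech.foss.RosbagBytesInputFormat
import org.apache.hadoop.io.{BytesWritable, LongWritable}
import org.apache.spark.{SparkConf, SparkContext}

object RosbagReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("rosbag-read"))
    // placeholder path to a pre-computed chunk index
    sc.hadoopConfiguration.set("RosbagInputFormat.chunkIdx", "/tmp/HMB_4.bag.idx.bin")
    val records = sc.newAPIHadoopFile(
      "hdfs:///data/HMB_4.bag",          // placeholder rosbag file
      classOf[RosbagBytesInputFormat],
      classOf[LongWritable],
      classOf[BytesWritable])
    println(records.count())
    sc.stop()
  }
}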
Example 35
Source File: TFRecordOutputFormat.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable] {
  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
      RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)
    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)
    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}
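A usage sketch for this output format: writing an RDD of raw TFRecord payloads via saveAsNewAPIHadoopFile (the output path and sample records are placeholders):

import com.intel.analytics.bigdl.utils.tf.TFRecordOutputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.{SparkConf, SparkContext}

object TFRecordWriteExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfrecord-write"))
    val payloads: Seq[Array[Byte]] = Seq("a".getBytes, "b".getBytes) // placeholder records
    sc.parallelize(payloads)
      .map(bytes => (new BytesWritable(bytes), NullWritable.get()))
      .saveAsNewAPIHadoopFile(
        "hdfs:///out/tfrecords",          // placeholder output directory
        classOf[BytesWritable],
        classOf[NullWritable],
        classOf[TFRecordOutputFormat])
    sc.stop()
  }
}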
Example 36
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.tf import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} import org.apache.hadoop.fs.FSDataInputStream class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] { override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext): RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] { private var inputStream: FSDataInputStream = null private var reader: TFRecordIterator = null private var length: Long = 0L private var begin: Long = 0L private var current: Array[Byte] = null override def getCurrentKey: BytesWritable = { new BytesWritable(current) } override def getProgress: Float = { (inputStream.getPos - begin) / (length + 1e-6f) } override def nextKeyValue(): Boolean = { if (reader.hasNext) { current = reader.next() true } else { false } } override def getCurrentValue: NullWritable = { NullWritable.get() } override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = { val conf = context.getConfiguration val fileSplit = split.asInstanceOf[FileSplit] length = fileSplit.getLength begin = fileSplit.getStart val file = fileSplit.getPath val fs = file.getFileSystem(conf) inputStream = fs.open(file, 4096) reader = new TFRecordIterator(inputStream) } override def close(): Unit = { inputStream.close() } } override protected def isSplitable(context: JobContext, filename: Path): Boolean = false }
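And the matching read path: a sketch that loads TFRecord files back as byte arrays through newAPIHadoopFile (the input path is a placeholder):

import com.intel.analytics.bigdl.utils.tf.TFRecordInputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.{SparkConf, SparkContext}

object TFRecordReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfrecord-read"))
    val records = sc.newAPIHadoopFile(
      "hdfs:///out/tfrecords",            // placeholder input directory
      classOf[TFRecordInputFormat],
      classOf[BytesWritable],
      classOf[NullWritable])
      .map { case (key, _) => key.copyBytes() } // copy the bytes out of the Writable
    println(records.count())
    sc.stop()
  }
}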
Example 37
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {
  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] =
    new FileLocalityRecordReader
}
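A sketch of using this format to get one record per file, each carrying the file path as a Text value, so that tasks are scheduled with HDFS locality for the file they will open (the glob is a placeholder):

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import org.archive.archivespark.sparkling.util.FileLocalityInputFormat

object FileLocalityExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-locality"))
    val paths = sc.newAPIHadoopFile(
      "hdfs:///data/warcs/*.warc.gz",     // placeholder glob
      classOf[FileLocalityInputFormat],
      classOf[NullWritable],
      classOf[Text])
      .map(_._2.toString)
    paths.take(5).foreach(println)
    sc.stop()
  }
}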
Example 38
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.parquet import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.parquet.Log import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat} class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { val LOG = Log.getLog(classOf[ParquetOutputCommitter]) override def getWorkPath: Path = outputPath override def abortTask(taskContext: TaskAttemptContext): Unit = {} override def commitTask(taskContext: TaskAttemptContext): Unit = {} override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true override def setupJob(jobContext: JobContext): Unit = {} override def setupTask(taskContext: TaskAttemptContext): Unit = {} override def commitJob(jobContext: JobContext) { val configuration = ContextUtil.getConfiguration(jobContext) val fileSystem = outputPath.getFileSystem(configuration) LOG.info("Using DirectParquetOutputCommitter to commit parquet files") if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) { try { val outputStatus = fileSystem.getFileStatus(outputPath) val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus) try { ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers) } catch { case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e) val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE) if (fileSystem.exists(metadataPath)) { fileSystem.delete(metadataPath, true) } } } catch { case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e) } } if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) { try { val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) fileSystem.create(successPath).close() } catch { case e: Exception => LOG.warn("Could not write success file for " + outputPath, e) } } } }
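A hedged sketch of wiring a committer like this into a Spark job through the Parquet committer setting; the key spark.sql.parquet.output.committer.class exists in Spark, but whether a direct committer is safe for your storage and Spark version is an assumption you should verify:

import org.apache.spark.sql.SparkSession

object DirectCommitterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("direct-parquet-committer")
      .config("spark.sql.parquet.output.committer.class",
        "com.indix.utils.spark.parquet.DirectParquetOutputCommitter")
      .getOrCreate()

    spark.range(10).write.mode("overwrite").parquet("s3a://bucket/table") // placeholder path
    spark.stop()
  }
}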
Example 39
Source File: S3PointCloudInputFormat.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.s3 import geotrellis.spark.store.s3._ import geotrellis.pointcloud.spark.store.hadoop.formats._ import geotrellis.pointcloud.util.Filesystem import io.pdal._ import io.circe.Json import io.circe.syntax._ import cats.syntax.either._ import org.apache.hadoop.mapreduce.{InputSplit, TaskAttemptContext} import org.apache.commons.io.FileUtils import java.io.{File, InputStream} import java.net.URI import scala.collection.JavaConverters._ mode match { case "s3" => new S3URIRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) { def read(key: String, uri: URI): (S3PointCloudHeader, List[PointCloud]) = { val s3Pipeline = pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => uri.toString.asJson) .top.fold(pipeline)(identity) executePipeline(context)(key, s3Pipeline) } } case _ => val tmpDir = { val dir = PointCloudInputFormat.getTmpDir(context) if (dir == null) Filesystem.createDirectory() else Filesystem.createDirectory(dir) } new S3StreamRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) { def read(key: String, is: InputStream): (S3PointCloudHeader, List[PointCloud]) = { // copy remote file into local tmp dir tmpDir.mkdirs() // to be sure that dirs created val localPath = new File(tmpDir, key.replace("/", "_")) FileUtils.copyInputStreamToFile(is, localPath) is.close() // use local filename path if it's present in json val localPipeline = pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => localPath.getAbsolutePath.asJson) .top.fold(pipeline)(identity) try executePipeline(context)(key, localPipeline) finally { localPath.delete() tmpDir.delete() } } } } } }
Example 40
Source File: ConfOnlyTAC.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{ Counter, TaskAttemptID, Job, TaskAttemptContext }

// This exists just because of a quirk of the record reader api.
case class ConfOnlyTAC(_conf: Configuration) extends Job with TaskAttemptContext {
  // JobContextImpl and JobContext
  override def getConfiguration: Configuration = _conf

  // TaskAttemptContext
  override def getTaskAttemptID: TaskAttemptID = sys.error("not implemented")
  override def setStatus(msg: String): Unit = sys.error("not implemented")
  override def getStatus = sys.error("not implemented")
  override def getProgress: Float = sys.error("not implemented")
  override def getCounter(counterName: Enum[_]): Counter = sys.error("not implemented")
  override def getCounter(groupName: String, counterName: String): Counter = sys.error("not implemented")

  // Progressable
  override def progress(): Unit = sys.error("not implemented")
}
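A minimal sketch of the quirk this stub works around: RecordReader.initialize wants a TaskAttemptContext even when only the Configuration matters, so a ConfOnlyTAC can stand in (the option name and reader call are illustrative only):

import org.apache.hadoop.conf.Configuration
import com.twosigma.flint.hadoop.ConfOnlyTAC

object ConfOnlyTacExample {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    conf.set("my.reader.option", "42")   // placeholder option
    val context = ConfOnlyTAC(conf)
    // any record reader that only touches context.getConfiguration can be
    // initialized with it, e.g. reader.initialize(split, context)
    println(context.getConfiguration.get("my.reader.option"))
  }
}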
Example 41
Source File: CommitFailureTestSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.TaskContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} import org.apache.spark.sql.types.StructType class CommitFailureTestSource extends SimpleTextSource { override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = new OutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new SimpleTextOutputWriter(path, dataSchema, context) { var failed = false TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) => failed = true SimpleTextRelation.callbackCalled = true } override def write(row: InternalRow): Unit = { if (SimpleTextRelation.failWriter) { sys.error("Intentional task writer failure for testing purpose.") } super.write(row) } override def close(): Unit = { super.close() sys.error("Intentional task commitment failure for testing purpose.") } } } override def getFileExtension(context: TaskAttemptContext): String = "" } override def shortName(): String = "commit-failure-test" }
Example 42
Source File: OrcOutputWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.orc import org.apache.hadoop.fs.Path import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce.OrcOutputFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ private[orc] class OrcOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { private[this] val serializer = new OrcSerializer(dataSchema) private val recordWriter = { new OrcOutputFormat[OrcStruct]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { new Path(path) } }.getRecordWriter(context) } override def write(row: InternalRow): Unit = { recordWriter.write(NullWritable.get(), serializer.serialize(row)) } override def close(): Unit = { recordWriter.close(context) } }
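For context, writers like this are created by an OutputWriterFactory returned from a FileFormat's prepareWrite. A simplified, hedged sketch of that wiring (the real OrcFileFormat also sets the ORC schema on the job and derives the extension from the compression codec):

package org.apache.spark.sql.execution.datasources.orc   // needed because OrcOutputWriter is private[orc]

import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

// minimal factory sketch; extension handling is simplified to a fixed ".orc"
class SimpleOrcOutputWriterFactory extends OutputWriterFactory {
  override def getFileExtension(context: TaskAttemptContext): String = ".orc"

  override def newInstance(
      path: String,
      dataSchema: StructType,
      context: TaskAttemptContext): OutputWriter =
    new OrcOutputWriter(path, dataSchema, context)
}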
Example 43
Source File: SQLHadoopMapReduceCommitProtocol.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) if (clazz != null) { logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat // has an associated output committer. To override this output committer, // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. // If a data source needs to override the output committer, it needs to set the // output committer in prepareForWrite method. if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) committer = ctor.newInstance(new Path(path), context) } else { // The specified output committer is just an OutputCommitter. // So, we will use the no-argument constructor. val ctor = clazz.getDeclaredConstructor() committer = ctor.newInstance() } } logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } }
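The user-defined committer described in the comments is selected through a SQL conf; a hedged sketch of setting it, assuming spark.sql.sources.outputCommitterClass is the key behind SQLConf.OUTPUT_COMMITTER_CLASS:

import org.apache.spark.sql.SparkSession

object CustomCommitterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("custom-output-committer")
      .config("spark.sql.sources.outputCommitterClass",
        "org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter")
      .getOrCreate()

    spark.range(10).write.mode("overwrite").parquet("/tmp/out") // placeholder path
    spark.stop()
  }
}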
Example 44
Source File: ManifestFileCommitProtocol.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Example 45
Source File: WholeTextFileRecordReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.input import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit} private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 46
Source File: ExcelOutputWriterFactory.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.sql.execution.datasources.{ OutputWriter, OutputWriterFactory } import org.apache.spark.sql.types.StructType import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileOutputFormat import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration private[excel] class ExcelOutputWriterFactory(options: Map[String, String]) extends OutputWriterFactory { def newInstance( path: String, bucketId: Option[Int], dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new ExcelOutputWriter(path, dataSchema, context, options) } def getFileExtension(context: TaskAttemptContext): String = { val conf = context.getConfiguration(); val defaultConf = conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, ExcelFileOutputFormat.DEFAULT_MIMETYPE); conf.set(HadoopOfficeWriteConfiguration.CONF_MIMETYPE, defaultConf); ExcelFileOutputFormat.getSuffix(conf.get(HadoopOfficeWriteConfiguration.CONF_MIMETYPE)) } }
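A hedged sketch of the DataFrame-level write this factory ultimately serves; the format name follows the package of the data source, and the locale option name is taken from the HadoopOffice documentation as an assumption (data and path are placeholders):

import org.apache.spark.sql.SparkSession

object ExcelWriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-write").getOrCreate()
    import spark.implicits._

    val df = Seq(("Q1", 1000), ("Q2", 1500)).toDF("quarter", "revenue") // placeholder data
    df.write
      .format("org.zuinnote.spark.office.excel")   // assumption: package name used as format
      .option("write.locale.bcp47", "en")          // assumption: option name from HadoopOffice docs
      .save("/tmp/excel-out")
    spark.stop()
  }
}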
Example 47
Source File: ShapeInputFormat.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import com.google.common.base.Stopwatch import magellan.io.{ShapeKey, ShapeWritable} import org.apache.commons.logging.LogFactory import org.apache.hadoop.fs.{LocatedFileStatus, Path} import org.apache.hadoop.mapreduce.lib.input._ import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext} import scala.collection.JavaConversions._ import scala.collection.mutable.ListBuffer private[magellan] class ShapeInputFormat extends FileInputFormat[ShapeKey, ShapeWritable] { private val log = LogFactory.getLog(classOf[ShapeInputFormat]) override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = { new ShapefileReader } override def isSplitable(context: JobContext, filename: Path): Boolean = true override def getSplits(job: JobContext): java.util.List[InputSplit] = { val splitInfos = SplitInfos.SPLIT_INFO_MAP.get() computeSplits(job, splitInfos) } private def computeSplits( job: JobContext, splitInfos: scala.collection.Map[String, Array[Long]]) = { val sw = new Stopwatch().start val splits = ListBuffer[InputSplit]() val files = listStatus(job) for (file <- files) { val path = file.getPath val length = file.getLen val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) { file.asInstanceOf[LocatedFileStatus].getBlockLocations } else { val fs = path.getFileSystem(job.getConfiguration) fs.getFileBlockLocations(file, 0, length) } val key = path.getName.split("\\.shp$")(0) if (splitInfos == null || !splitInfos.containsKey(key)) { val blkIndex = getBlockIndex(blkLocations, 0) splits.+= (makeSplit(path, 0, length, blkLocations(blkIndex).getHosts, blkLocations(blkIndex).getCachedHosts)) } else { val s = splitInfos(key).toSeq val start = s val end = s.drop(1) ++ Seq(length) start.zip(end).foreach { case (startOffset: Long, endOffset: Long) => val blkIndex = getBlockIndex(blkLocations, startOffset) splits.+=(makeSplit(path, startOffset, endOffset - startOffset, blkLocations(blkIndex).getHosts, blkLocations(blkIndex).getCachedHosts)) } } } sw.stop if (log.isDebugEnabled) { log.debug("Total # of splits generated by getSplits: " + splits.size + ", TimeTaken: " + sw.elapsedMillis) } splits } } object SplitInfos { // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader? val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]] }
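ShapeInputFormat is private[magellan] and is normally exercised through magellan's Spark SQL data source rather than called directly; a hedged sketch of that higher-level path (the format name "magellan" and the path are assumptions based on the project's documented usage):

import org.apache.spark.sql.SparkSession

object ShapefileLoadExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("magellan-shapefile").getOrCreate()
    // the directory is expected to contain the .shp (and companion) files
    val shapes = spark.read.format("magellan").load("hdfs:///data/shapes/")
    shapes.printSchema()
    spark.stop()
  }
}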
Example 48
Source File: OsmRecordReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import magellan.io.{OsmKey, OsmShape, OsmNode, OsmWay, OsmRelation} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} import scala.xml.{XML, Elem, Node} private[magellan] class OsmRecordReader extends RecordReader[OsmKey, OsmShape] { val definedNodeLabels = Set("node", "way", "relation") var nodes : Seq[Node] = _ var current : Int = 0 lazy val total = nodes.length override def initialize(genericSplit: InputSplit, context: TaskAttemptContext) : Unit = { val split: FileSplit = genericSplit.asInstanceOf[FileSplit] val job = MapReduceUtils.getConfigurationFromContext(context) val file = split.getPath() val fs = file.getFileSystem(job) val fileIn = fs.open(file) val doc = XML.load(fileIn) fileIn.close() nodes = doc.child.filter(n => definedNodeLabels contains n.label) } override def nextKeyValue() : Boolean = { if (!nodes.isEmpty) { if (current != 0) nodes = nodes.tail current += 1 } !nodes.isEmpty } override def getCurrentKey() : OsmKey = { val current = nodes.head new OsmKey(current.label, (current \ "@id").text) } def getTags(shape: Node) = { (shape \ "tag").map(t => (t \ "@k").text -> (t \ "@v").text).toMap } def getOsmNode(shape: Node) = { new OsmNode( (shape \ "@id").text, (shape \ "@lat").text.toDouble, (shape \ "@lon").text.toDouble, getTags(shape)) } def getOsmWay(shape: Node) = { new OsmWay((shape \ "@id").text, (shape \ "nd").map(w => (w \ "@ref").text), getTags(shape)) } def getOsmRelation(shape: Node) = { new OsmRelation( (shape \ "@id").text, (shape \ "member").map(r => (r \ "@ref").text), getTags(shape) ) } override def getCurrentValue() : OsmShape = { val current = nodes.head current.label match { case "node" => getOsmNode(current) case "way" => getOsmWay(current) case "relation" => getOsmRelation(current) } } override def getProgress() : Float = { current.toFloat / total } override def close() : Unit = { } }
Example 49
Source File: ShapefileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.DataInputStream import org.apache.commons.io.EndianUtils import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} import magellan.io.{ShapeKey, ShapeWritable} private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWritable] { private val key: ShapeKey = new ShapeKey() private var value: ShapeWritable = _ private var dis: DataInputStream = _ private var remaining: BigInt = _ override def getProgress: Float = 0 override def nextKeyValue(): Boolean = { if (remaining <= 0) { false } else { // record header has fixed length of 8 bytes // byte 0 = record #, byte 4 = content length val recordNumber = dis.readInt() // record numbers begin at 1 require(recordNumber > 0) val contentLength = 2 * (dis.readInt() + 4) value.readFields(dis) remaining -= contentLength key.setRecordIndex(key.getRecordIndex() + 1) true } } override def getCurrentValue: ShapeWritable = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) { val split = inputSplit.asInstanceOf[FileSplit] val job = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) val path = split.getPath() val fs = path.getFileSystem(job) val is = fs.open(path) val (start, end) = { val v = split.getStart if (v == 0) { is.seek(24) (100L, 2 * is.readInt().toLong) } else { (v, v + split.getLength) } } is.seek(start) dis = new DataInputStream(is) key.setFileNamePrefix(split.getPath.getName.split("\\.")(0)) value = new ShapeWritable() remaining = (end - start) } override def getCurrentKey: ShapeKey = key override def close(): Unit = dis.close() }
Example 50
Source File: DBInputFormat.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.util import scala.collection.JavaConversions.seqAsJavaList import org.apache.hadoop.fs.Path import org.apache.hadoop.io.MapWritable import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext} import magellan.io.ShapeKey private[magellan] class DBInputFormat extends FileInputFormat[ShapeKey, MapWritable] { override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = { new DBReader } override def isSplitable(context: JobContext, filename: Path): Boolean = false override def getSplits(job: JobContext): util.List[InputSplit] = { try { super.getSplits(job) }catch { case e: Exception => seqAsJavaList(List[InputSplit]()) } } }
Example 51
Source File: WholeFileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.InputStream import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} class WholeFileReader extends RecordReader[NullWritable, Text] { private val key = NullWritable.get() private val value = new Text() private var split: FileSplit = _ private var conf: Configuration = _ private var path: Path = _ private var done: Boolean = false override def getProgress: Float = ??? override def nextKeyValue(): Boolean = { if (done){ false } else { val fs = path.getFileSystem(conf) var is: FSDataInputStream = null var in: InputStream = null var decompressor: Decompressor = null try { is = fs.open(split.getPath) val codec = new CompressionCodecFactory(conf).getCodec(path) if (codec != null) { decompressor = CodecPool.getDecompressor(codec) in = codec.createInputStream(is, decompressor) } else { in = is } val result = IOUtils.toByteArray(in) value.clear() value.set(result) done = true true } finally { if (in != null) { IOUtils.closeQuietly(in) } if (decompressor != null) { CodecPool.returnDecompressor(decompressor) } } } } override def getCurrentValue: Text = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): Unit = { this.split = inputSplit.asInstanceOf[FileSplit] this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) this.path = this.split.getPath } override def getCurrentKey: NullWritable = key override def close() {} }
Example 52
Source File: OapOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OutputWriter, WriteResult} import org.apache.spark.sql.execution.datasources.oap.io.OapDataWriter import org.apache.spark.sql.types.StructType private[oap] class OapOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { private var rowCount = 0 private var partitionString: String = "" override def setPartitionString(ps: String): Unit = { partitionString = ps } private val writer: OapDataWriter = { val isCompressed = FileOutputFormat.getCompressOutput(context) val conf = context.getConfiguration val file: Path = new Path(path) val fs = file.getFileSystem(conf) val fileOut = fs.create(file, false) new OapDataWriter(isCompressed, fileOut, dataSchema, conf) } override def write(row: InternalRow): Unit = { rowCount += 1 writer.write(row) } override def close(): Unit = { writer.close() } override def writeStatus(): WriteResult = { OapWriteResult(dataFileName, rowCount, partitionString) } def dataFileName: String = new Path(path).getName }
Example 53
Source File: OapIndexCommitProtocol.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.OutputCommitter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.oap.OapConf @transient private var committer: OapIndexFileOutputCommitter = _ override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val algorithmVersion = SparkEnv.get.conf.get(OapConf.OAP_INDEXFILEOUTPUTCOMMITTER_ALGORITHM_VERSION) val tempDirName = s"_temporary_$jobId" committer = new OapIndexFileOutputCommitter( new Path(path), context, tempDirName, algorithmVersion) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val filename = getFilename(taskContext, ext) val stagingDir = new Path(Option(committer.getWorkPath).map(_.toString).getOrElse(path)) dir.map { d => new Path(new Path(stagingDir, d), filename).toString }.getOrElse { new Path(stagingDir, filename).toString } } override protected def getFilename(taskContext: TaskAttemptContext, ext: String): String = { // The file name looks like part-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003-c000.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId f"part-$split%05d-$jobId$ext" } }
Example 54
Source File: OapIndexOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter // TODO: parameter name "path" is ambiguous private[index] class OapIndexOutputWriter( path: String, context: TaskAttemptContext ) extends OutputWriter { private val outputFormat = new OapIndexOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val outputPath = FileOutputFormat.getOutputPath(context) val configuration = ContextUtil.getConfiguration(context) IndexUtils.generateTempIndexFilePath( configuration, inputFileName, outputPath, path, extension) } } private var recordWriter: RecordWriter[Void, InternalRow] = _ private var inputFileName: String = _ private var rowCount: Long = 0 override def write(row: InternalRow): Unit = { checkStartOfNewFile() recordWriter.write(null, row) rowCount += 1 } override def close(): Unit = { closeWriter() } private def initWriter(): Unit = { inputFileName = InputFileNameHolderAdapter.getInputFileName().toString recordWriter = outputFormat.getRecordWriter(context) rowCount = 0 } private def closeWriter(): Unit = { if (recordWriter != null) { recordWriter.close(context) recordWriter = null } } private def checkStartOfNewFile(): Unit = { if (inputFileName != InputFileNameHolderAdapter.getInputFileName().toString) { closeWriter() initWriter() } } }
Example 55
Source File: OapIndexOutputFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.format.CompressionCodec import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.types.StructType private[index] class OapIndexOutputFormat extends FileOutputFormat[Void, InternalRow] { private val BTREE_WRITER_VERSION = OapConf.OAP_INDEX_BTREE_WRITER_VERSION.key private def getCodec(taskAttemptContext: TaskAttemptContext): CompressionCodec = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) CompressionCodec.valueOf( configuration.get( OapConf.OAP_INDEX_BTREE_COMPRESSION.key, OapConf.OAP_INDEX_BTREE_COMPRESSION.defaultValueString).toUpperCase) } private def getWriterVersion(taskAttemptContext: TaskAttemptContext) = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) val indexVersion = configuration.get(BTREE_WRITER_VERSION, OapIndexProperties.DEFAULT_WRITER_VERSION.toString) IndexVersion.fromString(indexVersion) } override def getRecordWriter( taskAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) def canBeSkipped(file: Path): Boolean = { val isAppend = configuration.get(OapIndexFileFormat.IS_APPEND).toBoolean if (isAppend) { val target = new Path(FileOutputFormat.getOutputPath(taskAttemptContext), file.getName) target.getFileSystem(configuration).exists(target) } else { false } } val codec = getCodec(taskAttemptContext) val writerVersion = getWriterVersion(taskAttemptContext) val extension = "." + configuration.get(OapIndexFileFormat.INDEX_TIME) + "." + configuration.get(OapIndexFileFormat.INDEX_NAME) + ".index" val file = getDefaultWorkFile(taskAttemptContext, extension) val schema = StructType.fromString(configuration.get(OapIndexFileFormat.ROW_SCHEMA)) val indexType = configuration.get(OapIndexFileFormat.INDEX_TYPE, "") if (canBeSkipped(file)) { new DummyIndexRecordWriter() } else if (indexType == "BTREE") { BTreeIndexRecordWriter(configuration, file, schema, codec, writerVersion) } else if (indexType == "BITMAP") { val writer = file.getFileSystem(configuration).create(file, true) new BitmapIndexRecordWriter(configuration, writer, schema) } else { throw new OapException("Unknown Index Type: " + indexType) } } }
Example 56
Source File: OapIndexFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.oap.OapFileFormat import org.apache.spark.sql.types.StructType private[index] class OapIndexFileFormat extends FileFormat with Logging with Serializable { override def inferSchema( sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = None override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { val configuration = ContextUtil.getConfiguration(job) configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json) configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType")) configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName")) configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime")) configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend")) new OutputWriterFactory { override def getFileExtension(context: TaskAttemptContext): String = OapFileFormat.OAP_INDEX_EXTENSION override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) = new OapIndexOutputWriter(path, context) } } } private[index] object OapIndexFileFormat { val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes" val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type" val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name" val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time" val IS_APPEND: String = "org.apache.spark.sql.oap.index.append" } case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String)
Example 57
Source File: CommitFailureTestSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.TaskContext import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} import org.apache.spark.sql.types.StructType class CommitFailureTestSource extends SimpleTextSource { override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = new OutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { new SimpleTextOutputWriter(path, context) { var failed = false TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) => failed = true SimpleTextRelation.callbackCalled = true } override def write(row: Row): Unit = { if (SimpleTextRelation.failWriter) { sys.error("Intentional task writer failure for testing purpose.") } super.write(row) } override def close(): Unit = { super.close() sys.error("Intentional task commitment failure for testing purpose.") } } } override def getFileExtension(context: TaskAttemptContext): String = "" } override def shortName(): String = "commit-failure-test" }
Example 58
Source File: ManifestFileCommitProtocol.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }