org.apache.hadoop.mapreduce.JobContext Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.JobContext.
Each example notes its original project and source file in the heading above the listing.
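Most of the examples below follow the same pattern: a custom Hadoop InputFormat, OutputCommitter, or commit protocol reads job-level settings from the JobContext's Configuration. The following is a minimal, hypothetical sketch of that pattern; the class name and the "example.splittable" property are illustrative and not taken from any project below.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, LineRecordReader}

class ConfigurableTextInputFormat extends FileInputFormat[LongWritable, Text] {

  // JobContext exposes the job's Configuration during split planning.
  override def isSplitable(context: JobContext, filename: Path): Boolean =
    context.getConfiguration.getBoolean("example.splittable", true)

  // TaskAttemptContext extends JobContext, so the same configuration is
  // available on the task side as well.
  override def createRecordReader(
      split: InputSplit,
      context: TaskAttemptContext): RecordReader[LongWritable, Text] =
    new LineRecordReader()
}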
Example 1
Source File: WholeTextFileInputFormat.scala From SparkCore with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

// The enclosing WholeTextFileInputFormat class declaration is elided in this listing;
// the final brace below closes it.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 2
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
    name: String,
    context: TaskAttemptContext,
    dataSchema: StructType,
    element: String,
    littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close
    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
Example 3
Source File: SparkHadoopMapReduceUtil.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

import org.apache.spark.util.Utils

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.JobContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.JobContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.TaskAttemptContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Utils.classForName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Utils.classForName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Utils.classForName(first)
    } catch {
      case e: ClassNotFoundException =>
        Utils.classForName(second)
    }
  }
}
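A hedged usage sketch, not part of BigDatalog: because the trait is private[spark], a caller would have to live under the same package, and the identifiers below are purely illustrative.

package org.apache.spark.mapreduce

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.JobID

object MapReduceUtilExample extends SparkHadoopMapReduceUtil {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Build a JobContext and a TaskAttemptContext without referencing the
    // hadoop1/hadoop2 implementation classes directly.
    val jobContext = newJobContext(conf, new JobID("202401010000", 0))
    val attemptId =
      newTaskAttemptID("202401010000", jobId = 0, isMap = true, taskId = 0, attemptId = 0)
    val taskContext = newTaskAttemptContext(conf, attemptId)

    println(s"${jobContext.getJobID} -> ${taskContext.getTaskAttemptID}")
  }
}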
Example 4
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch { case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
          val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
          if (fileSystem.exists(metadataPath)) {
            fileSystem.delete(metadataPath, true)
          }
        }
      } catch { case e: Exception =>
        LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch { case e: Exception =>
        LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
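In the Spark 1.x line, a committer like this was typically wired in through a SQL setting rather than constructed directly. The following is a hedged sketch: the configuration key is the Spark 1.x era name and may not exist in your Spark version.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object DirectCommitterConfigExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("direct-committer").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    // Hedged: swap in the committer by class name; the key below is the
    // Spark 1.x setting and should be verified against your release.
    sqlContext.setConf(
      "spark.sql.parquet.output.committer.class",
      "org.apache.spark.sql.execution.datasources.parquet.DirectParquetOutputCommitter")
  }
}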
Example 5
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

// The enclosing ManifestFileCommitProtocol class declaration and its fields
// (path, fileLog, batchId, addedFiles) are elided in this listing.
  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 6
Source File: CodecStreams.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  // getCompressionCodec (used below) and the object's other members are elided in this listing.
  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
}
Example 7
Source File: WholeTextFileInputFormat.scala From spark1.52 with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

// The enclosing WholeTextFileInputFormat class declaration is elided in this listing;
// the final brace below closes it.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 8
Source File: SparkHadoopMapReduceUtil.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

import org.apache.spark.util.Utils

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.JobContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.JobContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.TaskAttemptContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Utils.classForName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Utils.classForName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Utils.classForName(first)
    } catch {
      case e: ClassNotFoundException =>
        Utils.classForName(second)
    }
  }
}
Example 9
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch { case e: Exception =>
          LOG.warn("could not write summary file for " + outputPath, e)
          val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
          if (fileSystem.exists(metadataPath)) {
            fileSystem.delete(metadataPath, true)
          }
        }
      } catch { case e: Exception =>
        LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch { case e: Exception =>
        LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 10
Source File: WholeTextFileInputFormat.scala From iolap with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

// The enclosing WholeTextFileInputFormat class declaration is elided in this listing;
// the final brace below closes it.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 11
Source File: SparkHadoopMapReduceUtil.scala From iolap with Apache License 2.0
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.JobContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.JobContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.TaskAttemptContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int): TaskAttemptID = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
Example 12
Source File: DirectParquetOutputCommitter.scala From iolap with Apache License 2.0
package org.apache.spark.sql.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import parquet.Log
import parquet.hadoop.util.ContextUtil
import parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath(): Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch { case e: Exception => {
          LOG.warn("could not write summary file for " + outputPath, e)
          val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
          if (fileSystem.exists(metadataPath)) {
            fileSystem.delete(metadataPath, true)
          }
        }
        }
      } catch { case e: Exception =>
        LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch { case e: Exception =>
        LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 13
Source File: ManifestFileCommitProtocol.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

// The enclosing ManifestFileCommitProtocol class declaration and its fields
// (path, fileLog, batchId, addedFiles) are elided in this listing.
  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 14
Source File: PortableDataStream.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}

// The enclosing PortableDataStream class declaration and its other members
// (including the open() method and path field used below) are elided in this listing.
  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}
Example 15
Source File: SparkHadoopMapReduceUtil.scala From SparkCore with Apache License 2.0
package org.apache.spark.mapreduce

import java.lang.{Boolean => JBoolean, Integer => JInteger}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}

private[spark] trait SparkHadoopMapReduceUtil {
  def newJobContext(conf: Configuration, jobId: JobID): JobContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.JobContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.JobContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[JobID])
    ctor.newInstance(conf, jobId).asInstanceOf[JobContext]
  }

  def newTaskAttemptContext(conf: Configuration, attemptId: TaskAttemptID): TaskAttemptContext = {
    val klass = firstAvailableClass(
        "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl",  // hadoop2, hadoop2-yarn
        "org.apache.hadoop.mapreduce.TaskAttemptContext")           // hadoop1
    val ctor = klass.getDeclaredConstructor(classOf[Configuration], classOf[TaskAttemptID])
    ctor.newInstance(conf, attemptId).asInstanceOf[TaskAttemptContext]
  }

  def newTaskAttemptID(
      jtIdentifier: String,
      jobId: Int,
      isMap: Boolean,
      taskId: Int,
      attemptId: Int) = {
    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
    try {
      // First, attempt to use the old-style constructor that takes a boolean isMap
      // (not available in YARN)
      val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], classOf[Boolean],
        classOf[Int], classOf[Int])
      ctor.newInstance(jtIdentifier, new JInteger(jobId), new JBoolean(isMap), new JInteger(taskId),
        new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
    } catch {
      case exc: NoSuchMethodException => {
        // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
          .asInstanceOf[Class[Enum[_]]]
        val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
          taskTypeClass, if (isMap) "MAP" else "REDUCE")
        val ctor = klass.getDeclaredConstructor(classOf[String], classOf[Int], taskTypeClass,
          classOf[Int], classOf[Int])
        ctor.newInstance(jtIdentifier, new JInteger(jobId), taskType, new JInteger(taskId),
          new JInteger(attemptId)).asInstanceOf[TaskAttemptID]
      }
    }
  }

  private def firstAvailableClass(first: String, second: String): Class[_] = {
    try {
      Class.forName(first)
    } catch {
      case e: ClassNotFoundException =>
        Class.forName(second)
    }
  }
}
Example 16
Source File: ManifestFileCommitProtocol.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

// The enclosing ManifestFileCommitProtocol class declaration and its fields
// (path, fileLog, batchId, addedFiles) are elided in this listing.
  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 17
Source File: DBInputFormat.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.util

import scala.collection.JavaConversions.seqAsJavaList

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.MapWritable
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import magellan.io.ShapeKey

private[magellan] class DBInputFormat extends FileInputFormat[ShapeKey, MapWritable] {

  override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = {
    new DBReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def getSplits(job: JobContext): util.List[InputSplit] = {
    try {
      super.getSplits(job)
    } catch {
      case e: Exception => seqAsJavaList(List[InputSplit]())
    }
  }
}
Example 18
Source File: ShapeInputFormat.scala From magellan with Apache License 2.0
package magellan.mapreduce

import com.google.common.base.Stopwatch
import magellan.io.{ShapeKey, ShapeWritable}
import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.{LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.lib.input._
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

private[magellan] class ShapeInputFormat extends FileInputFormat[ShapeKey, ShapeWritable] {

  private val log = LogFactory.getLog(classOf[ShapeInputFormat])

  override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = {
    new ShapefileReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = true

  override def getSplits(job: JobContext): java.util.List[InputSplit] = {
    val splitInfos = SplitInfos.SPLIT_INFO_MAP.get()
    computeSplits(job, splitInfos)
  }

  private def computeSplits(
      job: JobContext,
      splitInfos: scala.collection.Map[String, Array[Long]]) = {

    val sw = new Stopwatch().start
    val splits = ListBuffer[InputSplit]()
    val files = listStatus(job)
    for (file <- files) {
      val path = file.getPath
      val length = file.getLen
      val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) {
        file.asInstanceOf[LocatedFileStatus].getBlockLocations
      } else {
        val fs = path.getFileSystem(job.getConfiguration)
        fs.getFileBlockLocations(file, 0, length)
      }
      val key = path.getName.split("\\.shp$")(0)
      if (splitInfos == null || !splitInfos.containsKey(key)) {
        val blkIndex = getBlockIndex(blkLocations, 0)
        splits += makeSplit(path, 0, length, blkLocations(blkIndex).getHosts,
          blkLocations(blkIndex).getCachedHosts)
      } else {
        val s = splitInfos(key).toSeq
        val start = s
        val end = s.drop(1) ++ Seq(length)
        start.zip(end).foreach { case (startOffset: Long, endOffset: Long) =>
          val blkIndex = getBlockIndex(blkLocations, startOffset)
          splits += makeSplit(path, startOffset, endOffset - startOffset,
            blkLocations(blkIndex).getHosts, blkLocations(blkIndex).getCachedHosts)
        }
      }
    }
    sw.stop
    if (log.isDebugEnabled) {
      log.debug("Total # of splits generated by getSplits: " + splits.size +
        ", TimeTaken: " + sw.elapsedMillis)
    }
    splits
  }
}

object SplitInfos {
  // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader?
  val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]]
}
Example 19
Source File: UnsplittableSequenceFileInputFormat.scala From spark-util with Apache License 2.0
package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._

// The enclosing UnsplittableSequenceFileInputFormat class declaration is elided in this listing;
// the final brace below closes it.
  override def listStatus(job: JobContext): util.List[FileStatus] =
    super
      .listStatus(job)
      .asScala
      .sortBy {
        _.getPath.getName match {
          case PartFileBasename(idx) ⇒
            idx
          case basename ⇒
            throw new IllegalArgumentException(s"Bad partition file: $basename")
        }
      }
      .asJava
}
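The PartFileBasename extractor referenced above is defined elsewhere in spark-util. The following is a hedged sketch of what such an extractor can look like, assuming Hadoop-style part-file names; it is not the project's actual implementation.

object PartFileBasename {
  // Matches names like "part-00000" or "part-00042.snappy" and yields the numeric index.
  private val re = """part-(\d+).*""".r

  def unapply(basename: String): Option[Int] =
    basename match {
      case re(idx) ⇒ Some(idx.toInt)
      case _       ⇒ None
    }
}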
Example 20
Source File: ManifestFileCommitProtocol.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

// The enclosing ManifestFileCommitProtocol class declaration and its fields
// (path, fileLog, batchId, addedFiles) are elided in this listing.
  def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = {
    this.fileLog = fileLog
    this.batchId = batchId
  }

  override def setupJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray

    if (fileLog.add(batchId, fileStatuses)) {
      logInfo(s"Committed batch $batchId")
    } else {
      throw new IllegalStateException(s"Race while writing batch $batchId")
    }
  }

  override def abortJob(jobContext: JobContext): Unit = {
    require(fileLog != null, "setupManifestOptions must be called before this function")
    // Do nothing
  }

  override def setupTask(taskContext: TaskAttemptContext): Unit = {
    addedFiles = new ArrayBuffer[String]
  }

  override def newTaskTempFile(
      taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
    // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
    // the file name is fine and won't overflow.
    val split = taskContext.getTaskAttemptID.getTaskID.getId
    val uuid = UUID.randomUUID.toString
    val filename = f"part-$split%05d-$uuid$ext"

    val file = dir.map { d =>
      new Path(new Path(path, d), filename).toString
    }.getOrElse {
      new Path(path, filename).toString
    }

    addedFiles += file
    file
  }

  override def newTaskTempFileAbsPath(
      taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = {
    throw new UnsupportedOperationException(
      s"$this does not support adding files with an absolute path")
  }

  override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = {
    if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
      val statuses: Seq[SinkFileStatus] =
        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
      new TaskCommitMessage(statuses)
    } else {
      new TaskCommitMessage(Seq.empty[SinkFileStatus])
    }
  }

  override def abortTask(taskContext: TaskAttemptContext): Unit = {
    // Do nothing
    // TODO: we can also try delete the addedFiles as a best-effort cleanup.
  }
}
Example 21
Source File: CodecStreams.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  // getCompressionCodec (used below) and the object's other members are elided in this listing.
  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
}
Example 22
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {
  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath
  override def abortTask(taskContext: TaskAttemptContext): Unit = {}
  override def commitTask(taskContext: TaskAttemptContext): Unit = {}
  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true
  override def setupJob(jobContext: JobContext): Unit = {}
  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)
    LOG.info("Using DirectParquetOutputCommitter to commit parquet files")

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch { case e: Exception =>
          LOG.warn("Could not write summary file for " + outputPath, e)
          val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
          if (fileSystem.exists(metadataPath)) {
            fileSystem.delete(metadataPath, true)
          }
        }
      } catch { case e: Exception =>
        LOG.warn("Could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch { case e: Exception =>
        LOG.warn("Could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 23
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License
package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {
  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] =
    new FileLocalityRecordReader
}
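A hedged usage sketch, not part of ArchiveSpark: loading one record per file (the file path) with locality-aware scheduling. The SparkContext setup and the input glob are illustrative.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}

object FileLocalityExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-locality").setMaster("local[*]"))

    // One (NullWritable, Text) pair per input file; the Text value is the file's path.
    val paths = sc.newAPIHadoopFile(
      "hdfs:///data/archives/*.warc.gz",  // illustrative glob
      classOf[FileLocalityInputFormat],
      classOf[NullWritable],
      classOf[Text]
    ).map(_._2.toString)

    paths.take(10).foreach(println)
  }
}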
Example 24
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
      RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
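A hedged usage sketch, not part of BigDL: reading TFRecord files into an RDD of raw byte arrays. The SparkContext setup and input path are illustrative.

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.{SparkConf, SparkContext}

object TFRecordReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfrecord-read").setMaster("local[*]"))

    // Each record is the raw serialized payload of one TFRecord entry.
    val records = sc.newAPIHadoopFile(
      "hdfs:///data/train.tfrecord",  // illustrative path
      classOf[TFRecordInputFormat],
      classOf[BytesWritable],
      classOf[NullWritable]
    ).map { case (bytes, _) => bytes.copyBytes() }

    println(s"record count: ${records.count()}")
  }
}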
Example 25
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}

class RosbagMapInputFormat extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
}
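A hedged usage sketch, not part of ros_hadoop: the "RosbagInputFormat.chunkIdx" key matches the code above, but the index-file value, bag path, and overall workflow are illustrative and should be checked against the project's documentation.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{BytesWritable, LongWritable}
import org.apache.spark.{SparkConf, SparkContext}

object RosbagReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("rosbag-read").setMaster("local[*]"))

    val conf = new Configuration(sc.hadoopConfiguration)
    // Hand the (assumed, separately generated) chunk index to the input format via the job configuration.
    conf.set("RosbagInputFormat.chunkIdx", "/data/recording.bag.idx.bin")  // illustrative value

    val chunks = sc.newAPIHadoopFile(
      "hdfs:///data/recording.bag",  // illustrative path
      classOf[RosbagBytesInputFormat],
      classOf[LongWritable],
      classOf[BytesWritable],
      conf)

    println(s"chunk count: ${chunks.count()}")
  }
}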