org.apache.hadoop.mapreduce.lib.output.FileOutputFormat Scala Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.FileOutputFormat in Scala. Each example is taken from an open-source project; the source file, project, and license are noted in the heading above each snippet.
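Most of the snippets below revolve around a handful of static helpers on FileOutputFormat: setOutputPath/getOutputPath to declare and look up the job's output directory, and setCompressOutput/getCompressOutput plus setOutputCompressorClass/getOutputCompressorClass to control output compression. As a quick orientation, here is a minimal setup-side sketch; the object name and output path are illustrative and not taken from any of the projects below, which mostly show the read-back side of the same API at write time.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}

object FileOutputFormatSetupSketch {
  def main(args: Array[String]): Unit = {
    val job = Job.getInstance()

    // Pick the OutputFormat and tell it where to write.
    job.setOutputFormatClass(classOf[TextOutputFormat[Text, NullWritable]])
    FileOutputFormat.setOutputPath(job, new Path("/tmp/example-output")) // illustrative path

    // Enable output compression; at write time the format reads these settings
    // back through getCompressOutput and getOutputCompressorClass (see Example 1).
    FileOutputFormat.setCompressOutput(job, true)
    FileOutputFormat.setOutputCompressorClass(job, classOf[GzipCodec])
  }
}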
Example 1
Source File: TextFileFormat.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


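  // Note: this excerpt omits the enclosing class from TextFileFormat.scala;
  // only the compression-extension helper is shown.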
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
} 
Example 2
Source File: TFRecordOutputFormat.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable]{
  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
  RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)

    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
} 
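TFRecordOutputFormat only covers the write path; a common way to drive it from Spark is saveAsNewAPIHadoopFile on a pair RDD. The following is a hedged sketch: the SparkSession setup, the byte payloads, and the output directory are invented for illustration and are not taken from BigDL.

import com.intel.analytics.bigdl.utils.tf.TFRecordOutputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.sql.SparkSession

object TFRecordWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("tfrecord-sketch").getOrCreate()

    // Each Array[Byte] is assumed to be an already-serialized record payload.
    val payloads: Seq[Array[Byte]] = Seq(Array[Byte](1, 2, 3), Array[Byte](4, 5, 6))

    spark.sparkContext
      .parallelize(payloads)
      .map(bytes => (new BytesWritable(bytes), NullWritable.get()))
      .saveAsNewAPIHadoopFile(
        "/tmp/tfrecords",            // illustrative output directory
        classOf[BytesWritable],
        classOf[NullWritable],
        classOf[TFRecordOutputFormat])

    spark.stop()
  }
}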
Example 3
Source File: CodecStreams.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  
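  // getCompressionCodec(context) is defined elsewhere in CodecStreams.scala; this excerpt omits it.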
  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
} 
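Because createInputStream resolves the codec from the file name, callers do not need to know up front whether a file is compressed. Below is a hedged usage sketch; the file path is illustrative, and it assumes the code compiles against Spark's internal org.apache.spark.sql.execution.datasources package where CodecStreams lives.

import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.CodecStreams

import scala.io.Source

object CodecStreamsReadSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val file = new Path("/tmp/data/part-00000.gz") // illustrative; any extension known to CompressionCodecFactory works

    // The stream is transparently wrapped in a decompressor when the extension
    // maps to a registered codec, and returned unchanged otherwise.
    val in = CodecStreams.createInputStream(conf, file)
    try {
      Source.fromInputStream(in, StandardCharsets.UTF_8.name()).getLines().take(5).foreach(println)
    } finally {
      in.close()
    }
  }
}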
Example 4
Source File: MapreduceTransformation.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.dsl.transformations

import java.net.URI

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, MRJobConfig}
import org.schedoscope.Schedoscope
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}
import org.schedoscope.scheduler.service.ViewTransformationStatus


case class MapreduceTransformation(v: View,
                                   createJob: (Map[String, Any]) => Job,
                                   cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState,
                                   dirsToDelete: List[String] = List(),
                                   deleteViewPath: Boolean = true) extends MapreduceBaseTransformation {

  lazy val job = createJob(configuration.toMap)

  var directoriesToDelete = dirsToDelete ++ (if (deleteViewPath) List(v.fullPath) else List())

  description = StringUtils.abbreviate(v.urlPath, 100)
}

trait MapreduceBaseTransformation extends Transformation {

  def name = "mapreduce"

  val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation]

  val v: View

  val job: Job

  var directoriesToDelete: List[String]

  override def fileResourcesToChecksum = {
    val jarName = try {
      job.getConfiguration().get(MRJobConfig.JAR).split("/").last
    } catch {
      case _: Throwable => null
    }

    Schedoscope.settings
      .getDriverSettings("mapreduce")
      .libJarsHdfs
      .filter(lj => jarName == null || lj.contains(jarName))
  }

  override def viewTransformationStatus = ViewTransformationStatus(
    name,
    Some(Map(
      "input" -> job.getConfiguration().get(FileInputFormat.INPUT_DIR),
      "output" -> job.getConfiguration().get(FileOutputFormat.OUTDIR))))

  def configure() {
    // if job jar hasn't been registered, add all mapreduce libjars
    // to distributed cache
    if (job.getConfiguration().get(MRJobConfig.JAR) == null) {
      fileResourcesToChecksum.foreach(r => {
        try {
          job.addCacheFile(new URI(r))
        } catch {
          case _: Throwable => Unit
        }
      })
    }
    configuration.foreach { case (k, v) => if (v == null) job.getConfiguration.unset(k) else job.getConfiguration.set(k, v.toString) }
  }
} 
Example 5
Source File: MapreduceDriverTest.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.scheduler.driver

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.scalatest.{FlatSpec, Matchers}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.transformations.{FailingMapper, MapreduceTransformation}
import org.schedoscope.test.resources.LocalTestResources
import org.schedoscope.test.resources.TestDriverRunCompletionHandlerCallCounter._

class MapreduceDriverTest extends FlatSpec with Matchers with TestFolder {
  lazy val driver = new LocalTestResources().driverFor[MapreduceTransformation]("mapreduce")

  def invalidJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => Job.getInstance

  def failingJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    job.setMapperClass(classOf[FailingMapper])
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  def identityJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  case class DummyView() extends View

  def writeData() {
    Files.write(Paths.get(s"${inputPath("")}/file.txt"), "some data".getBytes(StandardCharsets.UTF_8))
  }

  "MapreduceDriver" should "have transformation name Mapreduce" in {
    driver.transformationName shouldBe "mapreduce"
  }

  it should "execute Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute another Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    runWasAsynchronous shouldBe true
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations and return errors when running asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), failingJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    // runWasAsynchronous shouldBe true FIXME: isn't asynchronous, why?
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunFailed[_]]
  }

  it should "call its DriverRunCompletitionHandlers' driverRunCompleted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    while (driver.getDriverRunState(runHandle).isInstanceOf[DriverRunOngoing[_]]) {}

    driver.driverRunCompleted(runHandle)

    driverRunCompletedCalled(runHandle, driver.getDriverRunState(runHandle)) shouldBe true
  }

  it should "call its DriverRunCompletitionHandlers' driverRunStarted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    driver.driverRunStarted(runHandle)

    driverRunStartedCalled(runHandle) shouldBe true
  }
} 
Example 6
Source File: NodesWithGeohash.scala    From schedoscope   with Apache License 2.0
package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {

  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration();
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
} 
Example 7
Source File: OapOutputWriter.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, WriteResult}
import org.apache.spark.sql.execution.datasources.oap.io.OapDataWriter
import org.apache.spark.sql.types.StructType


private[oap] class OapOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext) extends OutputWriter {
  private var rowCount = 0
  private var partitionString: String = ""

  override def setPartitionString(ps: String): Unit = {
    partitionString = ps
  }

  private val writer: OapDataWriter = {
    val isCompressed = FileOutputFormat.getCompressOutput(context)
    val conf = context.getConfiguration
    val file: Path = new Path(path)
    val fs = file.getFileSystem(conf)
    val fileOut = fs.create(file, false)

    new OapDataWriter(isCompressed, fileOut, dataSchema, conf)
  }

  override def write(row: InternalRow): Unit = {
    rowCount += 1
    writer.write(row)
  }

  override def close(): Unit = {
    writer.close()
  }

  override def writeStatus(): WriteResult = {
    OapWriteResult(dataFileName, rowCount, partitionString)
  }

  def dataFileName: String = new Path(path).getName
} 
Example 8
Source File: OapIndexOutputWriter.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter

// TODO: parameter name "path" is ambiguous
private[index] class OapIndexOutputWriter(
    path: String,
    context: TaskAttemptContext
) extends OutputWriter {

  private val outputFormat = new OapIndexOutputFormat() {
    override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
      val outputPath = FileOutputFormat.getOutputPath(context)
      val configuration = ContextUtil.getConfiguration(context)
      IndexUtils.generateTempIndexFilePath(
        configuration, inputFileName, outputPath, path, extension)
    }
  }

  private var recordWriter: RecordWriter[Void, InternalRow] = _

  private var inputFileName: String = _

  private var rowCount: Long = 0

  override def write(row: InternalRow): Unit = {
    checkStartOfNewFile()
    recordWriter.write(null, row)
    rowCount += 1
  }

  override def close(): Unit = {
    closeWriter()
  }

  private def initWriter(): Unit = {
    inputFileName = InputFileNameHolderAdapter.getInputFileName().toString
    recordWriter = outputFormat.getRecordWriter(context)
    rowCount = 0
  }

  private def closeWriter(): Unit = {
    if (recordWriter != null) {
      recordWriter.close(context)
      recordWriter = null
    }
  }

  private def checkStartOfNewFile(): Unit = {
    if (inputFileName != InputFileNameHolderAdapter.getInputFileName().toString) {
      closeWriter()
      initWriter()
    }
  }
} 
Example 9
Source File: OapIndexOutputFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.parquet.format.CompressionCodec
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion
import org.apache.spark.sql.internal.oap.OapConf
import org.apache.spark.sql.types.StructType

private[index] class OapIndexOutputFormat extends FileOutputFormat[Void, InternalRow] {

  private val BTREE_WRITER_VERSION = OapConf.OAP_INDEX_BTREE_WRITER_VERSION.key

  private def getCodec(taskAttemptContext: TaskAttemptContext): CompressionCodec = {
    val configuration = ContextUtil.getConfiguration(taskAttemptContext)
    CompressionCodec.valueOf(
      configuration.get(
        OapConf.OAP_INDEX_BTREE_COMPRESSION.key,
        OapConf.OAP_INDEX_BTREE_COMPRESSION.defaultValueString).toUpperCase)
  }

  private def getWriterVersion(taskAttemptContext: TaskAttemptContext) = {
    val configuration = ContextUtil.getConfiguration(taskAttemptContext)
    val indexVersion =
      configuration.get(BTREE_WRITER_VERSION, OapIndexProperties.DEFAULT_WRITER_VERSION.toString)
    IndexVersion.fromString(indexVersion)
  }

  override def getRecordWriter(
      taskAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = {

    val configuration = ContextUtil.getConfiguration(taskAttemptContext)

    def canBeSkipped(file: Path): Boolean = {
      val isAppend = configuration.get(OapIndexFileFormat.IS_APPEND).toBoolean
      if (isAppend) {
        val target = new Path(FileOutputFormat.getOutputPath(taskAttemptContext), file.getName)
        target.getFileSystem(configuration).exists(target)
      } else {
        false
      }
    }

    val codec = getCodec(taskAttemptContext)
    val writerVersion = getWriterVersion(taskAttemptContext)

    val extension = "." + configuration.get(OapIndexFileFormat.INDEX_TIME) +
        "." + configuration.get(OapIndexFileFormat.INDEX_NAME) +
        ".index"

    val file = getDefaultWorkFile(taskAttemptContext, extension)

    val schema = StructType.fromString(configuration.get(OapIndexFileFormat.ROW_SCHEMA))

    val indexType = configuration.get(OapIndexFileFormat.INDEX_TYPE, "")

    if (canBeSkipped(file)) {
      new DummyIndexRecordWriter()
    } else if (indexType == "BTREE") {
      BTreeIndexRecordWriter(configuration, file, schema, codec, writerVersion)
    } else if (indexType == "BITMAP") {
      val writer = file.getFileSystem(configuration).create(file, true)
      new BitmapIndexRecordWriter(configuration, writer, schema)
    } else {
      throw new OapException("Unknown Index Type: " + indexType)
    }
  }
} 
Example 10
Source File: OapIndexCommitProtocolSuite.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)
    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)
    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
} 
Example 11
Source File: CodecStreams.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  
  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
} 
Example 12
Source File: PlyOutputWriter.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
} 
Example 13
Source File: LasOutputWriter.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0F, 0F, 0F),
  scale: Array[Double] = Array(0.01F, 0.01F, 0.01F)
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdf to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
}