org.apache.hadoop.mapreduce.lib.output.FileOutputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.
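Before the project-specific examples, here is a minimal sketch of the static helpers most of them build on. The output path, codec, and output format class are illustrative choices for this sketch, not taken from any of the projects below.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}

object FileOutputFormatSketch {
  def main(args: Array[String]): Unit = {
    val job = Job.getInstance()
    // Pick a concrete FileOutputFormat subclass and tell it where its files go.
    job.setOutputFormatClass(classOf[TextOutputFormat[Text, NullWritable]])
    FileOutputFormat.setOutputPath(job, new Path("/tmp/fileoutputformat-demo"))
    // Optionally compress every output file; getCompressOutput and
    // getOutputCompressorClass (used in several examples below) read
    // these same settings back at write time.
    FileOutputFormat.setCompressOutput(job, true)
    FileOutputFormat.setOutputCompressorClass(job, classOf[GzipCodec])
  }
}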
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: TFRecordOutputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable] {

  override def getRecordWriter(taskAttemptContext: TaskAttemptContext)
  : RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)
    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}
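A hedged sketch of how a custom FileOutputFormat like this might be plugged into a Spark job. The RDD of already-serialized TFRecord payloads and the target path are assumptions for illustration, not part of BigDL's public API.

import com.intel.analytics.bigdl.utils.tf.TFRecordOutputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD

object TFRecordSaveSketch {
  // `records` is assumed to already hold encoded TFRecord payloads.
  def save(records: RDD[Array[Byte]], path: String): Unit = {
    records
      .map(bytes => (new BytesWritable(bytes), NullWritable.get()))
      .saveAsNewAPIHadoopFile[TFRecordOutputFormat](path)
  }
}

saveAsNewAPIHadoopFile sets the output path on the underlying Hadoop job, so the format's getDefaultWorkFile resolves to per-task files under that path.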
Example 3
Source File: CodecStreams.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
}
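A short sketch of reading a possibly-compressed file back through this object. The configuration and path are placeholders, and note that the excerpt elides the private getCompressionCodec helper that getCompressionExtension calls.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.CodecStreams

import scala.io.Source

object CodecStreamsReadSketch {
  def headLines(pathStr: String, n: Int): Seq[String] = {
    val conf = new Configuration()
    // createInputStream picks a decompression codec from the file extension
    // (for example .gz) and falls back to the raw stream for plain files.
    val in = CodecStreams.createInputStream(conf, new Path(pathStr))
    try Source.fromInputStream(in, "UTF-8").getLines().take(n).toList
    finally in.close()
  }
}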
Example 4
Source File: MapreduceTransformation.scala From schedoscope with Apache License 2.0
package org.schedoscope.dsl.transformations

import java.net.URI

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, MRJobConfig}
import org.schedoscope.Schedoscope
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}
import org.schedoscope.scheduler.service.ViewTransformationStatus

case class MapreduceTransformation(v: View,
                                   createJob: (Map[String, Any]) => Job,
                                   cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState,
                                   dirsToDelete: List[String] = List(),
                                   deleteViewPath: Boolean = true) extends MapreduceBaseTransformation {

  lazy val job = createJob(configuration.toMap)

  var directoriesToDelete = dirsToDelete ++ (if (deleteViewPath) List(v.fullPath) else List())

  description = StringUtils.abbreviate(v.urlPath, 100)
}

trait MapreduceBaseTransformation extends Transformation {

  def name = "mapreduce"

  val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation]

  val v: View

  val job: Job

  var directoriesToDelete: List[String]

  override def fileResourcesToChecksum = {
    val jarName = try {
      job.getConfiguration().get(MRJobConfig.JAR).split("/").last
    } catch {
      case _: Throwable => null
    }

    Schedoscope.settings
      .getDriverSettings("mapreduce")
      .libJarsHdfs
      .filter(lj => jarName == null || lj.contains(jarName))
  }

  override def viewTransformationStatus = ViewTransformationStatus(
    name,
    Some(Map(
      "input" -> job.getConfiguration().get(FileInputFormat.INPUT_DIR),
      "output" -> job.getConfiguration().get(FileOutputFormat.OUTDIR))))

  def configure() {
    // if job jar hasn't been registered, add all mapreduce libjars
    // to distributed cache
    if (job.getConfiguration().get(MRJobConfig.JAR) == null) {
      fileResourcesToChecksum.foreach(r => {
        try {
          job.addCacheFile(new URI(r))
        } catch {
          case _: Throwable => Unit
        }
      })
    }

    configuration.foreach {
      case (k, v) =>
        if (v == null) job.getConfiguration.unset(k) else job.getConfiguration.set(k, v.toString)
    }
  }
}
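A minimal createJob function that could be handed to MapreduceTransformation, sketched under the assumption that the configuration map carries input_path and output_path entries; Example 6 below shows a complete production version. The object name and its shape are hypothetical.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

object MinimalJobFactory {
  // Wires only the input and output paths; mapper, reducer and jar setup are left out.
  val createJob: Map[String, Any] => Job = conf => {
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, conf("input_path").toString)
    FileOutputFormat.setOutputPath(job, new Path(conf("output_path").toString))
    job
  }
}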
Example 5
Source File: MapreduceDriverTest.scala From schedoscope with Apache License 2.0
package org.schedoscope.scheduler.driver

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.scalatest.{FlatSpec, Matchers}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.transformations.{FailingMapper, MapreduceTransformation}
import org.schedoscope.test.resources.LocalTestResources
import org.schedoscope.test.resources.TestDriverRunCompletionHandlerCallCounter._

class MapreduceDriverTest extends FlatSpec with Matchers with TestFolder {

  lazy val driver = new LocalTestResources().driverFor[MapreduceTransformation]("mapreduce")

  def invalidJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => Job.getInstance

  def failingJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    job.setMapperClass(classOf[FailingMapper])
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  def identityJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  case class DummyView() extends View

  def writeData() {
    Files.write(Paths.get(s"${inputPath("")}/file.txt"), "some data".getBytes(StandardCharsets.UTF_8))
  }

  "MapreduceDriver" should "have transformation name Mapreduce" in {
    driver.transformationName shouldBe "mapreduce"
  }

  it should "execute Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute another Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    runWasAsynchronous shouldBe true
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations and return errors when running asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), failingJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    // runWasAsynchronous shouldBe true FIXME: isn't asynchronous, why?
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunFailed[_]]
  }

  it should "call its DriverRunCompletitionHandlers' driverRunCompleted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    while (driver.getDriverRunState(runHandle).isInstanceOf[DriverRunOngoing[_]]) {}

    driver.driverRunCompleted(runHandle)

    driverRunCompletedCalled(runHandle, driver.getDriverRunState(runHandle)) shouldBe true
  }

  it should "call its DriverRunCompletitionHandlers' driverRunStarted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    driver.driverRunStarted(runHandle)

    driverRunStartedCalled(runHandle) shouldBe true
  }
}
Example 6
Source File: NodesWithGeohash.scala From schedoscope with Apache License 2.0
package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {
  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration();
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
}
Example 7
Source File: OapOutputWriter.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, WriteResult}
import org.apache.spark.sql.execution.datasources.oap.io.OapDataWriter
import org.apache.spark.sql.types.StructType

private[oap] class OapOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext) extends OutputWriter {

  private var rowCount = 0
  private var partitionString: String = ""

  override def setPartitionString(ps: String): Unit = {
    partitionString = ps
  }

  private val writer: OapDataWriter = {
    val isCompressed = FileOutputFormat.getCompressOutput(context)
    val conf = context.getConfiguration
    val file: Path = new Path(path)
    val fs = file.getFileSystem(conf)
    val fileOut = fs.create(file, false)

    new OapDataWriter(isCompressed, fileOut, dataSchema, conf)
  }

  override def write(row: InternalRow): Unit = {
    rowCount += 1
    writer.write(row)
  }

  override def close(): Unit = {
    writer.close()
  }

  override def writeStatus(): WriteResult = {
    OapWriteResult(dataFileName, rowCount, partitionString)
  }

  def dataFileName: String = new Path(path).getName
}
Example 8
Source File: OapIndexOutputWriter.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter

// TODO: parameter name "path" is ambiguous
private[index] class OapIndexOutputWriter(
    path: String,
    context: TaskAttemptContext
) extends OutputWriter {

  private val outputFormat = new OapIndexOutputFormat() {
    override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
      val outputPath = FileOutputFormat.getOutputPath(context)
      val configuration = ContextUtil.getConfiguration(context)
      IndexUtils.generateTempIndexFilePath(
        configuration, inputFileName, outputPath, path, extension)
    }
  }

  private var recordWriter: RecordWriter[Void, InternalRow] = _

  private var inputFileName: String = _

  private var rowCount: Long = 0

  override def write(row: InternalRow): Unit = {
    checkStartOfNewFile()
    recordWriter.write(null, row)
    rowCount += 1
  }

  override def close(): Unit = {
    closeWriter()
  }

  private def initWriter(): Unit = {
    inputFileName = InputFileNameHolderAdapter.getInputFileName().toString
    recordWriter = outputFormat.getRecordWriter(context)
    rowCount = 0
  }

  private def closeWriter(): Unit = {
    if (recordWriter != null) {
      recordWriter.close(context)
      recordWriter = null
    }
  }

  private def checkStartOfNewFile(): Unit = {
    if (inputFileName != InputFileNameHolderAdapter.getInputFileName().toString) {
      closeWriter()
      initWriter()
    }
  }
}
Example 9
Source File: OapIndexOutputFormat.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.parquet.format.CompressionCodec
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion
import org.apache.spark.sql.internal.oap.OapConf
import org.apache.spark.sql.types.StructType

private[index] class OapIndexOutputFormat extends FileOutputFormat[Void, InternalRow] {

  private val BTREE_WRITER_VERSION = OapConf.OAP_INDEX_BTREE_WRITER_VERSION.key

  private def getCodec(taskAttemptContext: TaskAttemptContext): CompressionCodec = {
    val configuration = ContextUtil.getConfiguration(taskAttemptContext)
    CompressionCodec.valueOf(
      configuration.get(
        OapConf.OAP_INDEX_BTREE_COMPRESSION.key,
        OapConf.OAP_INDEX_BTREE_COMPRESSION.defaultValueString).toUpperCase)
  }

  private def getWriterVersion(taskAttemptContext: TaskAttemptContext) = {
    val configuration = ContextUtil.getConfiguration(taskAttemptContext)
    val indexVersion =
      configuration.get(BTREE_WRITER_VERSION, OapIndexProperties.DEFAULT_WRITER_VERSION.toString)
    IndexVersion.fromString(indexVersion)
  }

  override def getRecordWriter(
      taskAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = {

    val configuration = ContextUtil.getConfiguration(taskAttemptContext)

    def canBeSkipped(file: Path): Boolean = {
      val isAppend = configuration.get(OapIndexFileFormat.IS_APPEND).toBoolean
      if (isAppend) {
        val target = new Path(FileOutputFormat.getOutputPath(taskAttemptContext), file.getName)
        target.getFileSystem(configuration).exists(target)
      } else {
        false
      }
    }

    val codec = getCodec(taskAttemptContext)
    val writerVersion = getWriterVersion(taskAttemptContext)

    val extension = "." + configuration.get(OapIndexFileFormat.INDEX_TIME) +
      "." + configuration.get(OapIndexFileFormat.INDEX_NAME) +
      ".index"

    val file = getDefaultWorkFile(taskAttemptContext, extension)

    val schema = StructType.fromString(configuration.get(OapIndexFileFormat.ROW_SCHEMA))

    val indexType = configuration.get(OapIndexFileFormat.INDEX_TYPE, "")

    if (canBeSkipped(file)) {
      new DummyIndexRecordWriter()
    } else if (indexType == "BTREE") {
      BTreeIndexRecordWriter(configuration, file, schema, codec, writerVersion)
    } else if (indexType == "BITMAP") {
      val writer = file.getFileSystem(configuration).create(file, true)
      new BitmapIndexRecordWriter(configuration, writer, schema)
    } else {
      throw new OapException("Unknown Index Type: " + indexType)
    }
  }
}
Example 10
Source File: OapIndexCommitProtocolSuite.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)

    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)

    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)

    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)

    assert(tempFile.getParent == expectedTaskWorkPath)
  }
}
Example 11
Source File: CodecStreams.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.{InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress._
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext

object CodecStreams {
  private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = {
    val compressionCodecs = new CompressionCodecFactory(config)
    Option(compressionCodecs.getCodec(file))
  }

  def createInputStream(config: Configuration, file: Path): InputStream = {
    val fs = file.getFileSystem(config)
    val inputStream: InputStream = fs.open(file)

    getDecompressionCodec(config, file)
      .map(codec => codec.createInputStream(inputStream))
      .getOrElse(inputStream)
  }

  def getCompressionExtension(context: JobContext): String = {
    getCompressionCodec(context)
      .map(_.getDefaultExtension)
      .getOrElse("")
  }
}
Example 12
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
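As an aside, FileOutputFormat itself offers a helper for deriving task-unique file names. Below is a hedged sketch of what the manual writeJobUUID scheme above might look like when delegated to that helper; the "part" base name is an arbitrary choice for illustration.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

object PlyWorkFileSketch {
  // getUniqueFile builds names like part-m-00000.ply.vertex from the task attempt id,
  // which is roughly what getDefaultWorkFile above assembles by hand.
  def workFile(dir: String, context: TaskAttemptContext, element: String): Path =
    new Path(dir, FileOutputFormat.getUniqueFile(context, "part", s".ply.$element"))
}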
Example 13
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0F, 0F, 0F),
  scale: Array[Double] = Array(0.01F, 0.01F, 0.01F)
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)

  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)

  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdf to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
}