org.apache.hadoop.mapreduce.lib.input.FileSplit Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit.
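Most of the examples below follow the same pattern: construct a FileSplit covering a byte range of a file, create a TaskAttemptContext, and hand both to a RecordReader. The minimal sketch below illustrates that pattern outside of any of the listed projects; the file path /tmp/example.txt and the bare Configuration are assumptions for illustration only.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

object FileSplitSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val path = new Path("/tmp/example.txt") // hypothetical input file
    val length = path.getFileSystem(conf).getFileStatus(path).getLen

    // A split covering the whole file; the empty array means "no locality hints".
    val split = new FileSplit(path, 0, length, Array.empty[String])

    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val context = new TaskAttemptContextImpl(conf, attemptId)

    // Read the split line by line; the key is the byte offset, the value the line text.
    val reader = new LineRecordReader()
    reader.initialize(split, context)
    while (reader.nextKeyValue()) {
      println(s"${reader.getCurrentKey}\t${reader.getCurrentValue}")
    }
    reader.close()
  }
}

Each project example specializes this pattern: some wrap the reader in an Iterator, others disable splitting so that one split always covers a whole file.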
Example 1
Source File: HadoopFileLinesReader.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 2
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
    RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
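A hedged usage sketch for the format above: assuming Spark is on the classpath and given a hypothetical TFRecord file path and local master (none of which are part of the BigDL source), the format can be plugged into SparkContext.newAPIHadoopFile to obtain one byte array per serialized record.

import com.intel.analytics.bigdl.utils.tf.TFRecordInputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.{SparkConf, SparkContext}

object TFRecordReadSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfrecord-read").setMaster("local[*]"))
    // Each key is the raw serialized record; copy the bytes because Hadoop
    // RecordReaders are generally allowed to reuse Writable instances.
    val records = sc
      .newAPIHadoopFile[BytesWritable, NullWritable, TFRecordInputFormat]("/data/train.tfrecord")
      .map { case (bytes, _) => bytes.copyBytes() }
    println(s"records: ${records.count()}")
    sc.stop()
  }
}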
Example 3
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License
package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {
  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext):
    RecordReader[NullWritable, Text] = new FileLocalityRecordReader
}
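A hedged usage sketch for FileLocalityInputFormat: with a hypothetical local SparkContext and input glob (both assumptions, not from the ArchiveSpark source), the format yields exactly one (NullWritable, Text) record per file, carrying just that file's path, so downstream per-file work can benefit from the split's locality hints.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import org.archive.archivespark.sparkling.util.FileLocalityInputFormat

object FileLocalitySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-locality").setMaster("local[*]"))
    // One record per input file; the value is the file's path as Text.
    val paths = sc
      .newAPIHadoopFile[NullWritable, Text, FileLocalityInputFormat]("/data/input/*")
      .map { case (_, path) => path.toString }
    paths.collect().foreach(println)
    sc.stop()
  }
}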
Example 4
Source File: InputFormatConf.scala From flint with Apache License 2.0
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF

  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()

  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()

    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) {
        case WriSer(head) =>
          val rr = createRecordReader(hadoopConf, head)
          require(rr.nextKeyValue, "csv has no header, first line was empty")
          val afterHeader = rr.getCurrentKey.get
          require(rr.nextKeyValue, "first split is empty")
          WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
            splits.tail
      }
    }
  }
}
Example 5
Source File: HadoopFileLinesReader.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration) extends Iterator[Text] with Closeable {

  def this(file: PartitionedFile, conf: Configuration) = this(file, None, conf)

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)

    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }

    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 6
Source File: HadoopFileExcelReader.scala From spark-hadoopoffice-ds with Apache License 2.0
package org.zuinnote.spark.office.excel

import java.io.Closeable
import java.net.URI

import org.apache.spark.sql.execution.datasources._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{ FileSplit, LineRecordReader }
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.execution.datasources.RecordReaderIterator
import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileInputFormat
import org.zuinnote.hadoop.office.format.mapreduce.ExcelRecordReader
import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log

class HadoopFileExcelReader(
  file: PartitionedFile, conf: Configuration) extends Iterator[ArrayWritable] with Closeable {
  val LOG = LogFactory.getLog(classOf[HadoopFileExcelReader])

  private var reader: RecordReader[Text, ArrayWritable] = null

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      Array.empty) // todo: implement locality (replace Array.empty with the locations)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val inputFormat = new ExcelFileInputFormat()
    reader = inputFormat.createRecordReader(fileSplit, hadoopAttemptContext)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  def getReader: RecordReader[Text, ArrayWritable] = reader

  override def hasNext: Boolean = iterator.hasNext

  override def next(): ArrayWritable = iterator.next()

  override def close(): Unit = {
    if (reader != null) {
      reader.close()
    }
  }
}
Example 7
Source File: OsmRecordReader.scala From magellan with Apache License 2.0
package magellan.mapreduce

import magellan.io.{OsmKey, OsmShape, OsmNode, OsmWay, OsmRelation}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

import scala.xml.{XML, Elem, Node}

private[magellan] class OsmRecordReader extends RecordReader[OsmKey, OsmShape] {
  val definedNodeLabels = Set("node", "way", "relation")
  var nodes : Seq[Node] = _
  var current : Int = 0
  lazy val total = nodes.length

  override def initialize(genericSplit: InputSplit, context: TaskAttemptContext) : Unit = {
    val split: FileSplit = genericSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(context)

    val file = split.getPath()
    val fs = file.getFileSystem(job)
    val fileIn = fs.open(file)

    val doc = XML.load(fileIn)
    fileIn.close()
    nodes = doc.child.filter(n => definedNodeLabels contains n.label)
  }

  override def nextKeyValue() : Boolean = {
    if (!nodes.isEmpty) {
      if (current != 0) nodes = nodes.tail
      current += 1
    }
    !nodes.isEmpty
  }

  override def getCurrentKey() : OsmKey = {
    val current = nodes.head
    new OsmKey(current.label, (current \ "@id").text)
  }

  def getTags(shape: Node) = {
    (shape \ "tag").map(t => (t \ "@k").text -> (t \ "@v").text).toMap
  }

  def getOsmNode(shape: Node) = {
    new OsmNode(
      (shape \ "@id").text,
      (shape \ "@lat").text.toDouble,
      (shape \ "@lon").text.toDouble,
      getTags(shape))
  }

  def getOsmWay(shape: Node) = {
    new OsmWay((shape \ "@id").text, (shape \ "nd").map(w => (w \ "@ref").text), getTags(shape))
  }

  def getOsmRelation(shape: Node) = {
    new OsmRelation(
      (shape \ "@id").text,
      (shape \ "member").map(r => (r \ "@ref").text),
      getTags(shape)
    )
  }

  override def getCurrentValue() : OsmShape = {
    val current = nodes.head
    current.label match {
      case "node" => getOsmNode(current)
      case "way" => getOsmWay(current)
      case "relation" => getOsmRelation(current)
    }
  }

  override def getProgress() : Float = {
    current.toFloat / total
  }

  override def close() : Unit = { }
}
Example 8
Source File: ShapefileReader.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.io.DataInputStream

import org.apache.commons.io.EndianUtils
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

import magellan.io.{ShapeKey, ShapeWritable}

private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWritable] {

  private val key: ShapeKey = new ShapeKey()

  private var value: ShapeWritable = _

  private var dis: DataInputStream = _

  private var remaining: BigInt = _

  override def getProgress: Float = 0

  override def nextKeyValue(): Boolean = {
    if (remaining <= 0) {
      false
    } else {
      // record header has fixed length of 8 bytes
      // byte 0 = record #, byte 4 = content length
      val recordNumber = dis.readInt()
      // record numbers begin at 1
      require(recordNumber > 0)
      val contentLength = 2 * (dis.readInt() + 4)
      value.readFields(dis)
      remaining -= contentLength
      key.setRecordIndex(key.getRecordIndex() + 1)
      true
    }
  }

  override def getCurrentValue: ShapeWritable = value

  override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) {
    val split = inputSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)

    val path = split.getPath()
    val fs = path.getFileSystem(job)
    val is = fs.open(path)

    val (start, end) = {
      val v = split.getStart
      if (v == 0) {
        is.seek(24)
        (100L, 2 * is.readInt().toLong)
      } else {
        (v, v + split.getLength)
      }
    }

    is.seek(start)
    dis = new DataInputStream(is)
    key.setFileNamePrefix(split.getPath.getName.split("\\.")(0))
    value = new ShapeWritable()
    remaining = (end - start)
  }

  override def getCurrentKey: ShapeKey = key

  override def close(): Unit = dis.close()
}
Example 9
Source File: WholeFileReader.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done) {
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
}
Example 10
Source File: HadoopFileLinesReader.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 11
Source File: HadoopFileLinesReader.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 12
Source File: HadoopFileLinesReader.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 13
Source File: HadoopLineIterator.scala From glow with Apache License 2.0
package io.projectglow.sql.util

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.execution.datasources.RecordReaderIterator

import io.projectglow.common.GlowLogging

class HadoopLineIterator(
    path: String,
    start: Long,
    length: Long,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration)
  extends Iterator[Text]
  with Closeable
  with GlowLogging {

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(path)),
      start,
      length,
      // TODO: Implement Locality
      Array.empty
    )
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = {
    iterator.hasNext
  }

  override def next(): Text = {
    iterator.next()
  }

  override def close(): Unit = {
    iterator.close()
  }
}