org.apache.hadoop.mapreduce.InputSplit Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.InputSplit.
Each example notes its source file, the project it comes from, and that project's license.
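Most of the listings below either implement an InputFormat, whose getSplits method produces the InputSplit instances, or a RecordReader, which is initialized with a single InputSplit and iterates over its records. For orientation, here is a minimal sketch (not taken from any of the projects below) of how such a format is consumed from Spark through the new Hadoop API; it uses only stock Hadoop and Spark classes (TextInputFormat, LongWritable, Text), and the input path is a hypothetical placeholder.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object NewApiHadoopExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("input-split-demo").setMaster("local[*]"))

    // Each partition of the resulting RDD corresponds to one InputSplit
    // returned by TextInputFormat.getSplits().
    val rdd = sc.newAPIHadoopFile(
      "hdfs:///tmp/input.txt",   // hypothetical path
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      new Configuration())

    println(rdd.map(_._2.toString).count())
    sc.stop()
  }
}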
Example 1
Source File: WholeTextFileRecordReader.scala From spark1.52 with Apache License 2.0

package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 2
Source File: WholeTextFileInputFormat.scala From SparkCore with Apache License 2.0

package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  // Excerpt: the enclosing WholeTextFileInputFormat class declaration is omitted in this listing.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 3
Source File: DistributedCountRDD.scala From carbondata with Apache License 2.0

package org.apache.carbondata.indexserver

import java.util.concurrent.Executors

import scala.collection.JavaConverters._
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration.Duration

import org.apache.hadoop.mapred.TaskAttemptID
import org.apache.hadoop.mapreduce.{InputSplit, TaskType}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager}
import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper
import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory}
import org.apache.carbondata.spark.rdd.CarbonRDD

class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat)
  extends CarbonRDD[(String, String)](ss, Nil) {

  @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD]
    .getName)

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[(String, String)] = {
    val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId,
      id, TaskType.MAP, split.index, 0)
    val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId)
    val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit
    val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning
    val service = Executors
      .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true))
    implicit val ec: ExecutionContextExecutor = ExecutionContext
      .fromExecutor(service)
    if (indexInputFormat.ifAsyncCall()) {
      // to clear cache of invalid segments during pre-priming in index server
      IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable,
        indexInputFormat.getInvalidSegments)
    }
    val futures = if (inputSplits.length <= numOfThreads) {
      inputSplits.map {
        split => generateFuture(Seq(split))
      }
    } else {
      DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map {
        splits => generateFuture(splits)
      }
    }
    // scalastyle:off awaitresult
    val results = Await.result(Future.sequence(futures), Duration.Inf).flatten
    // scalastyle:on awaitresult
    val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
      SparkEnv.get.blockManager.blockManagerId.executorId }"
    val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) {
      CacheProvider.getInstance().getCarbonCache.getCurrentSize
    } else {
      0L
    }
    Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString))
  }

  override protected def internalGetPartitions: Array[Partition] = {
    new DistributedPruneRDD(ss, indexInputFormat).partitions
  }

  private def generateFuture(split: Seq[InputSplit])
    (implicit executionContext: ExecutionContext) = {
    Future {
      val segments = split.map { inputSplit =>
        val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper]
        distributable.getDistributable.getSegment
          .setReadCommittedScope(indexInputFormat.getReadCommittedScope)
        distributable.getDistributable.getSegment
      }
      val defaultIndex = IndexStoreManager.getInstance
        .getIndex(indexInputFormat.getCarbonTable, split.head
          .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema)
      defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat
        .getPartitions).asScala
    }
  }
}
Example 4
Source File: JavaNewHadoopRDD.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 5
Source File: WholeTextFileRecordReader.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 6
Source File: WholeTextFileRDD.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 7
Source File: JavaNewHadoopRDD.scala From iolap with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 8
Source File: WholeTextFileRecordReader.scala From iolap with Apache License 2.0

package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 9
Source File: WholeTextFileInputFormat.scala From iolap with Apache License 2.0

package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  // Excerpt: the enclosing WholeTextFileInputFormat class declaration is omitted in this listing.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 10
Source File: JavaNewHadoopRDD.scala From spark1.52 with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 11
Source File: WholeTextFileRecordReader.scala From SparkCore with Apache License 2.0

package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 12
Source File: WholeTextFileInputFormat.scala From spark1.52 with Apache License 2.0

package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext

  // Excerpt: the enclosing WholeTextFileInputFormat class declaration is omitted in this listing.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
}
Example 13
Source File: JavaNewHadoopRDD.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 14
Source File: WholeTextFileRecordReader.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 15
Source File: WholeTextFileRDD.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val conf = getConf
    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
    // traversing a large number of directories and files. Parallelize it.
    conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
      Runtime.getRuntime.availableProcessors().toString)
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 16
Source File: JavaNewHadoopRDD.scala From BigDatalog with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 17
Source File: WholeTextFileRecordReader.scala From BigDatalog with Apache License 2.0

package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 18
Source File: WholeTextFileRDD.scala From BigDatalog with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 19
Source File: SplitRDD.scala From spark-bam with Apache License 2.0

package org.hammerlab.bam.spark.load

import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input
import org.apache.spark.{ Partition, SparkContext, TaskContext }
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

case class FileSplitPartition(index: Int,
                              start: Long,
                              end: Long,
                              locations: Array[String])
  extends Partition

class SplitRDD private(@transient override val getPartitions: Array[Partition])(implicit sc: SparkContext)
  extends RDD[(Long, Long)](sc, Nil) {

  override def compute(split: Partition, context: TaskContext) =
    Iterator(
      split.asInstanceOf[FileSplitPartition]
    )
    .map(
      fs ⇒
        fs.start → fs.end
    )

  override protected def getPreferredLocations(split: Partition) =
    split
      .asInstanceOf[FileSplitPartition]
      .locations
}

object SplitRDD {
  def apply(splits: java.util.List[InputSplit])(implicit sc: SparkContext): SplitRDD =
    new SplitRDD(
      splits
        .iterator()
        .asScala
        .map(_.asInstanceOf[input.FileSplit])
        .zipWithIndex
        .map {
          case (fs, idx) ⇒
            FileSplitPartition(
              idx,
              fs.getStart,
              fs.getStart + fs.getLength,
              fs.getLocations
            )
        }
        .toArray
    )
}
Example 20
Source File: ShapeInputFormat.scala From magellan with Apache License 2.0

package magellan.mapreduce

import com.google.common.base.Stopwatch
import magellan.io.{ShapeKey, ShapeWritable}
import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.{LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.lib.input._
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

private[magellan] class ShapeInputFormat
  extends FileInputFormat[ShapeKey, ShapeWritable] {

  private val log = LogFactory.getLog(classOf[ShapeInputFormat])

  override def createRecordReader(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext) = {
    new ShapefileReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = true

  override def getSplits(job: JobContext): java.util.List[InputSplit] = {
    val splitInfos = SplitInfos.SPLIT_INFO_MAP.get()
    computeSplits(job, splitInfos)
  }

  private def computeSplits(
      job: JobContext,
      splitInfos: scala.collection.Map[String, Array[Long]]) = {

    val sw = new Stopwatch().start
    val splits = ListBuffer[InputSplit]()
    val files = listStatus(job)
    for (file <- files) {
      val path = file.getPath
      val length = file.getLen
      val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) {
        file.asInstanceOf[LocatedFileStatus].getBlockLocations
      } else {
        val fs = path.getFileSystem(job.getConfiguration)
        fs.getFileBlockLocations(file, 0, length)
      }
      val key = path.getName.split("\\.shp$")(0)
      if (splitInfos == null || !splitInfos.containsKey(key)) {
        val blkIndex = getBlockIndex(blkLocations, 0)
        splits.+=(makeSplit(path, 0, length, blkLocations(blkIndex).getHosts,
          blkLocations(blkIndex).getCachedHosts))
      } else {
        val s = splitInfos(key).toSeq
        val start = s
        val end = s.drop(1) ++ Seq(length)
        start.zip(end).foreach { case (startOffset: Long, endOffset: Long) =>
          val blkIndex = getBlockIndex(blkLocations, startOffset)
          splits.+=(makeSplit(path, startOffset, endOffset - startOffset,
            blkLocations(blkIndex).getHosts,
            blkLocations(blkIndex).getCachedHosts))
        }
      }
    }
    sw.stop
    if (log.isDebugEnabled) {
      log.debug("Total # of splits generated by getSplits: " + splits.size +
        ", TimeTaken: " + sw.elapsedMillis)
    }
    splits
  }
}

object SplitInfos {
  // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader?
  val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]]
}
Example 21
Source File: PortableDataStream.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}

  // Excerpt: the enclosing PortableDataStream class declaration is omitted in this listing.
  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}
Example 22
Source File: WholeTextFileRecordReader.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 23
Source File: WholeTextFileRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 24
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0

package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}

class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
}
Example 25
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0

package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
Example 26
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License

package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {
  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] =
    new FileLocalityRecordReader
}
Example 27
Source File: S3PointCloudInputFormat.scala From geotrellis-pointcloud with Apache License 2.0

package geotrellis.pointcloud.spark.store.s3

import geotrellis.spark.store.s3._
import geotrellis.pointcloud.spark.store.hadoop.formats._
import geotrellis.pointcloud.util.Filesystem

import io.pdal._
import io.circe.Json
import io.circe.syntax._
import cats.syntax.either._
import org.apache.hadoop.mapreduce.{InputSplit, TaskAttemptContext}
import org.apache.commons.io.FileUtils

import java.io.{File, InputStream}
import java.net.URI

import scala.collection.JavaConverters._

    // Excerpt: the enclosing S3PointCloudInputFormat class and the surrounding
    // createRecordReader declaration are omitted in this listing.
    mode match {
      case "s3" =>
        new S3URIRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) {
          def read(key: String, uri: URI): (S3PointCloudHeader, List[PointCloud]) = {
            val s3Pipeline =
              pipeline
                .hcursor
                .downField("pipeline").downArray
                .downField("filename").withFocus(_ => uri.toString.asJson)
                .top.fold(pipeline)(identity)

            executePipeline(context)(key, s3Pipeline)
          }
        }

      case _ =>
        val tmpDir = {
          val dir = PointCloudInputFormat.getTmpDir(context)
          if (dir == null) Filesystem.createDirectory()
          else Filesystem.createDirectory(dir)
        }

        new S3StreamRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) {
          def read(key: String, is: InputStream): (S3PointCloudHeader, List[PointCloud]) = {
            // copy remote file into local tmp dir
            tmpDir.mkdirs() // to be sure that dirs created
            val localPath = new File(tmpDir, key.replace("/", "_"))
            FileUtils.copyInputStreamToFile(is, localPath)
            is.close()

            // use local filename path if it's present in json
            val localPipeline =
              pipeline
                .hcursor
                .downField("pipeline").downArray
                .downField("filename").withFocus(_ => localPath.getAbsolutePath.asJson)
                .top.fold(pipeline)(identity)

            try executePipeline(context)(key, localPipeline) finally {
              localPath.delete()
              tmpDir.delete()
            }
          }
        }
    }
  }
}
Example 28
Source File: InputFormatConf.scala From flint with Apache License 2.0

package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF
  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()

  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()

    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) =>
        val rr = createRecordReader(hadoopConf, head)
        require(rr.nextKeyValue, "csv has no header, first line was empty")
        val afterHeader = rr.getCurrentKey.get
        require(rr.nextKeyValue, "first split is empty")
        WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
          splits.tail
      }
    }
  }
}
Example 29
Source File: JavaNewHadoopRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 30
Source File: OsmRecordReader.scala From magellan with Apache License 2.0

package magellan.mapreduce

import magellan.io.{OsmKey, OsmShape, OsmNode, OsmWay, OsmRelation}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

import scala.xml.{XML, Elem, Node}

private[magellan] class OsmRecordReader extends RecordReader[OsmKey, OsmShape] {

  val definedNodeLabels = Set("node", "way", "relation")
  var nodes : Seq[Node] = _
  var current : Int = 0
  lazy val total = nodes.length

  override def initialize(genericSplit: InputSplit, context: TaskAttemptContext) : Unit = {
    val split: FileSplit = genericSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(context)

    val file = split.getPath()
    val fs = file.getFileSystem(job)
    val fileIn = fs.open(file)

    val doc = XML.load(fileIn)
    fileIn.close()
    nodes = doc.child.filter(n => definedNodeLabels contains n.label)
  }

  override def nextKeyValue() : Boolean = {
    if (!nodes.isEmpty) {
      if (current != 0) nodes = nodes.tail
      current += 1
    }
    !nodes.isEmpty
  }

  override def getCurrentKey() : OsmKey = {
    val current = nodes.head
    new OsmKey(current.label, (current \ "@id").text)
  }

  def getTags(shape: Node) = {
    (shape \ "tag").map(t => (t \ "@k").text -> (t \ "@v").text).toMap
  }

  def getOsmNode(shape: Node) = {
    new OsmNode(
      (shape \ "@id").text,
      (shape \ "@lat").text.toDouble,
      (shape \ "@lon").text.toDouble,
      getTags(shape))
  }

  def getOsmWay(shape: Node) = {
    new OsmWay((shape \ "@id").text, (shape \ "nd").map(w => (w \ "@ref").text), getTags(shape))
  }

  def getOsmRelation(shape: Node) = {
    new OsmRelation(
      (shape \ "@id").text,
      (shape \ "member").map(r => (r \ "@ref").text),
      getTags(shape)
    )
  }

  override def getCurrentValue() : OsmShape = {
    val current = nodes.head
    current.label match {
      case "node" => getOsmNode(current)
      case "way" => getOsmWay(current)
      case "relation" => getOsmRelation(current)
    }
  }

  override def getProgress() : Float = {
    current.toFloat / total
  }

  override def close() : Unit = { }
}
Example 31
Source File: ShapefileReader.scala From magellan with Apache License 2.0

package magellan.mapreduce

import java.io.DataInputStream

import org.apache.commons.io.EndianUtils
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

import magellan.io.{ShapeKey, ShapeWritable}

private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWritable] {

  private val key: ShapeKey = new ShapeKey()

  private var value: ShapeWritable = _

  private var dis: DataInputStream = _

  private var remaining: BigInt = _

  override def getProgress: Float = 0

  override def nextKeyValue(): Boolean = {
    if (remaining <= 0) {
      false
    } else {
      // record header has fixed length of 8 bytes
      // byte 0 = record #, byte 4 = content length
      val recordNumber = dis.readInt()
      // record numbers begin at 1
      require(recordNumber > 0)
      val contentLength = 2 * (dis.readInt() + 4)
      value.readFields(dis)
      remaining -= contentLength
      key.setRecordIndex(key.getRecordIndex() + 1)
      true
    }
  }

  override def getCurrentValue: ShapeWritable = value

  override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) {
    val split = inputSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)

    val path = split.getPath()
    val fs = path.getFileSystem(job)
    val is = fs.open(path)

    val (start, end) = {
      val v = split.getStart
      if (v == 0) {
        is.seek(24)
        (100L, 2 * is.readInt().toLong)
      } else {
        (v, v + split.getLength)
      }
    }
    is.seek(start)
    dis = new DataInputStream(is)
    key.setFileNamePrefix(split.getPath.getName.split("\\.")(0))
    value = new ShapeWritable()
    remaining = (end - start)
  }

  override def getCurrentKey: ShapeKey = key

  override def close(): Unit = dis.close()
}
Example 32
Source File: DBInputFormat.scala From magellan with Apache License 2.0

package magellan.mapreduce

import java.util

import scala.collection.JavaConversions.seqAsJavaList

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.MapWritable
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import magellan.io.ShapeKey

private[magellan] class DBInputFormat extends FileInputFormat[ShapeKey, MapWritable] {

  override def createRecordReader(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext) = {
    new DBReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def getSplits(job: JobContext): util.List[InputSplit] = {
    try {
      super.getSplits(job)
    } catch {
      case e: Exception => seqAsJavaList(List[InputSplit]())
    }
  }
}
Example 33
Source File: WholeFileReader.scala From magellan with Apache License 2.0

package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done) {
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
}
Example 34
Source File: JavaNewHadoopRDD.scala From sparkoscope with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 35
Source File: WholeTextFileRecordReader.scala From sparkoscope with Apache License 2.0

package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 36
Source File: WholeTextFileRDD.scala From sparkoscope with Apache License 2.0

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 37
Source File: JavaNewHadoopRDD.scala From SparkCore with Apache License 2.0

package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
      implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}