org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in Scala. Each example notes the project and license of its original source file.
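All of the examples share one idea: Hadoop's RecordReader API expects a TaskAttemptContext, so code that reads files outside of a real MapReduce job builds a TaskAttemptContextImpl from a plain Configuration plus a synthetic TaskAttemptID. The sketch below distills that pattern into a self-contained helper; the object and method names are illustrative and are not taken from any of the projects referenced below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

object TaskAttemptContextSketch {
  // Read the lines of one byte range of a file using a LineRecordReader,
  // driven by a synthetic TaskAttemptContextImpl instead of a real MR task.
  def readLines(path: String, start: Long, length: Long, conf: Configuration): Seq[String] = {
    val split = new FileSplit(new Path(path), start, length, Array.empty[String])
    // A dummy attempt id is enough: no real task tracking is involved.
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val context = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(split, context)
    val lines = Iterator
      .continually(reader.nextKeyValue())   // advance the reader
      .takeWhile(identity)                   // stop when no more records
      .map(_ => reader.getCurrentValue.toString)
      .toList
    reader.close()
    lines
  }
}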
Example 1
Source File: HadoopFileLinesReader.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 2
Source File: HadoopFileLinesReader.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration) extends Iterator[Text] with Closeable {

  def this(file: PartitionedFile, conf: Configuration) = this(file, None, conf)

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
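The extra lineSeparator parameter lets a caller override the default line handling (when it is None, the reader treats \r, \r\n and \n as line endings). A hypothetical call site, assuming a PartitionedFile named file and a Hadoop Configuration named conf are already in scope:

import java.nio.charset.StandardCharsets

// Read records delimited by ASCII 0x01 instead of newlines (illustrative only).
val reader = new HadoopFileLinesReader(
  file,                                             // a PartitionedFile
  Some("\u0001".getBytes(StandardCharsets.UTF_8)),  // custom record separator
  conf)                                             // Hadoop Configuration
try {
  reader.foreach(line => println(line.toString))
} finally {
  reader.close()
}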
Example 3
Source File: HadoopFileWholeTextReader.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.input.WholeTextFileRecordReader

class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration)
  extends Iterator[Text] with Closeable {

  private val iterator = {
    val fileSplit = new CombineFileSplit(
      Array(new Path(new URI(file.filePath))),
      Array(file.start),
      Array(file.length),
      // TODO: Implement Locality
      Array.empty[String])
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
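Unlike the line readers, this reader yields a single Text record containing the entire file contents. A hypothetical call site, again assuming file (a PartitionedFile) and conf (a Configuration) are already in scope:

// Pull the whole file into memory as one string (illustrative only).
val wholeText = new HadoopFileWholeTextReader(file, conf)
try {
  val contents: String = if (wholeText.hasNext) wholeText.next().toString else ""
  println(s"read ${contents.length} characters")
} finally {
  wholeText.close()
}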
Example 4
Source File: HadoopFileExcelReader.scala From spark-hadoopoffice-ds with Apache License 2.0
package org.zuinnote.spark.office.excel

import java.io.Closeable
import java.net.URI

import org.apache.spark.sql.execution.datasources._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{ FileSplit, LineRecordReader }
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.execution.datasources.RecordReaderIterator
import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileInputFormat
import org.zuinnote.hadoop.office.format.mapreduce.ExcelRecordReader
import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log

class HadoopFileExcelReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[ArrayWritable] with Closeable {
  val LOG = LogFactory.getLog(classOf[HadoopFileExcelReader])

  private var reader: RecordReader[Text, ArrayWritable] = null

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      Array.empty) // todo: implement locality (replace Array.empty with the locations)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val inputFormat = new ExcelFileInputFormat()
    reader = inputFormat.createRecordReader(fileSplit, hadoopAttemptContext)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  def getReader: RecordReader[Text, ArrayWritable] = reader

  override def hasNext: Boolean = iterator.hasNext

  override def next(): ArrayWritable = iterator.next()

  override def close(): Unit = {
    if (reader != null) {
      reader.close()
    }
  }
}
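A hypothetical way to consume the reader: each ArrayWritable it returns represents one spreadsheet row, and get() exposes the individual cell values. The toString rendering of cells below is illustrative only; file and conf are assumed to be in scope.

val excelReader = new HadoopFileExcelReader(file, conf)  // file: PartitionedFile, conf: Configuration
try {
  excelReader.foreach { row =>
    // Each element of the ArrayWritable is one cell of the row; empty cells may be null.
    val cells = row.get().map(cell => if (cell == null) "" else cell.toString)
    println(cells.mkString("\t"))
  }
} finally {
  excelReader.close()
}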
Example 5
Source File: VectorizedFilePartitionReaderHandler.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import com.intel.sparkColumnarPlugin.datasource.VectorizedParquetArrowReader

import java.net.URI
import java.time.ZoneId

import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, PartitionReader}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.PartitionedFileReader
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory
import org.apache.spark.sql.execution.datasources.v2.FilePartitionReader
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

object VectorizedFilePartitionReaderHandler {
  def get(
      inputPartition: InputPartition,
      parquetReaderFactory: ParquetPartitionReaderFactory): FilePartitionReader[ColumnarBatch] = {
    val iter: Iterator[PartitionedFileReader[ColumnarBatch]] =
      inputPartition.asInstanceOf[FilePartition].files.toIterator.map { file =>
        val filePath = new Path(new URI(file.filePath))
        val split = new org.apache.parquet.hadoop.ParquetInputSplit(
          filePath,
          file.start,
          file.start + file.length,
          file.length,
          Array.empty,
          null)
        //val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion
        val capacity = 4096
        //partitionReaderFactory.createColumnarReader(inputPartition)
        val dataSchema = parquetReaderFactory.dataSchema
        val readDataSchema = parquetReaderFactory.readDataSchema

        val conf = parquetReaderFactory.broadcastedConf.value.value

        val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
        val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
        val vectorizedReader = new VectorizedParquetArrowReader(
          split.getPath().toString(), null, false, capacity, dataSchema, readDataSchema)
        vectorizedReader.initialize(split, hadoopAttemptContext)
        val partitionReader = new PartitionReader[ColumnarBatch] {
          override def next(): Boolean = vectorizedReader.nextKeyValue()
          override def get(): ColumnarBatch =
            vectorizedReader.getCurrentValue.asInstanceOf[ColumnarBatch]
          override def close(): Unit = vectorizedReader.close()
        }
        PartitionedFileReader(file, partitionReader)
      }
    new FilePartitionReader[ColumnarBatch](iter)
  }
}
Example 6
Source File: OapIndexCommitProtocolSuite.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)
    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)

    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
}
Example 7
Source File: HadoopFileLinesReader.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 8
Source File: DistributedCountRDD.scala From carbondata with Apache License 2.0
package org.apache.carbondata.indexserver

import java.util.concurrent.Executors

import scala.collection.JavaConverters._
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration.Duration

import org.apache.hadoop.mapred.TaskAttemptID
import org.apache.hadoop.mapreduce.{InputSplit, TaskType}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager}
import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper
import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory}
import org.apache.carbondata.spark.rdd.CarbonRDD

class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat)
  extends CarbonRDD[(String, String)](ss, Nil) {

  @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD]
    .getName)

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[(String, String)] = {
    val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId,
      id, TaskType.MAP, split.index, 0)
    val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId)
    val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit
    val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning
    val service = Executors
      .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true))
    implicit val ec: ExecutionContextExecutor = ExecutionContext
      .fromExecutor(service)
    if (indexInputFormat.ifAsyncCall()) {
      // to clear cache of invalid segments during pre-priming in index server
      IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable,
        indexInputFormat.getInvalidSegments)
    }
    val futures = if (inputSplits.length <= numOfThreads) {
      inputSplits.map {
        split => generateFuture(Seq(split))
      }
    } else {
      DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map {
        splits => generateFuture(splits)
      }
    }
    // scalastyle:off awaitresult
    val results = Await.result(Future.sequence(futures), Duration.Inf).flatten
    // scalastyle:on awaitresult
    val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
      SparkEnv.get.blockManager.blockManagerId.executorId }"
    val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) {
      CacheProvider.getInstance().getCarbonCache.getCurrentSize
    } else {
      0L
    }
    Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString))
  }

  override protected def internalGetPartitions: Array[Partition] = {
    new DistributedPruneRDD(ss, indexInputFormat).partitions
  }

  private def generateFuture(split: Seq[InputSplit])
    (implicit executionContext: ExecutionContext) = {
    Future {
      val segments = split.map { inputSplit =>
        val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper]
        distributable.getDistributable.getSegment
          .setReadCommittedScope(indexInputFormat.getReadCommittedScope)
        distributable.getDistributable.getSegment
      }
      val defaultIndex = IndexStoreManager.getInstance
        .getIndex(indexInputFormat.getCarbonTable, split.head
          .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema)
      defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat
        .getPartitions).asScala
    }
  }
}
Example 9
Source File: HadoopFileLinesReader.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 10
Source File: HadoopFileLinesReader.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 11
Source File: HadoopFileWholeTextReader.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.input.WholeTextFileRecordReader

class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration)
  extends Iterator[Text] with Closeable {

  private val iterator = {
    val fileSplit = new CombineFileSplit(
      Array(new Path(new URI(file.filePath))),
      Array(file.start),
      Array(file.length),
      // TODO: Implement Locality
      Array.empty[String])
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 12
Source File: HadoopLineIterator.scala From glow with Apache License 2.0
package io.projectglow.sql.util

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.execution.datasources.RecordReaderIterator

import io.projectglow.common.GlowLogging

class HadoopLineIterator(
    path: String,
    start: Long,
    length: Long,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration)
  extends Iterator[Text]
  with Closeable
  with GlowLogging {

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(path)),
      start,
      length,
      // TODO: Implement Locality
      Array.empty
    )
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = {
    iterator.hasNext
  }

  override def next(): Text = {
    iterator.next()
  }

  override def close(): Unit = {
    iterator.close()
  }
}
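A hypothetical local test of the iterator, streaming the first kilobyte of a file with the default line separators; the path and byte range here are illustrative only.

import org.apache.hadoop.conf.Configuration

// Iterate the lines in the first 1024 bytes of a local file.
val it = new HadoopLineIterator(
  "file:///tmp/example.txt",  // path (any Hadoop-readable URI)
  0L,                         // start offset
  1024L,                      // number of bytes to read
  None,                       // default separators: \r, \r\n, \n
  new Configuration())
try {
  it.foreach(line => println(line.toString))
} finally {
  it.close()
}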