org.apache.hadoop.mapreduce.RecordReader Scala Examples

The following examples show how to use org.apache.hadoop.mapreduce.RecordReader in Scala. Each example is excerpted from an open-source project; the source file, project, and license are noted above the code.
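Before the project-specific examples, the snippet below is a minimal sketch (not taken from any of the listed projects) of the standard RecordReader life cycle in the new Hadoop mapreduce API: an InputFormat creates the reader, initialize() binds it to a split, nextKeyValue()/getCurrentKey/getCurrentValue iterate the records, and close() releases it. The input path is an assumption for illustration.

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{Job, TaskAttemptID}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, TextInputFormat}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

object RecordReaderDemo {
  def main(args: Array[String]): Unit = {
    val job = Job.getInstance(new Configuration())
    FileInputFormat.setInputPaths(job, "file:///tmp/example.txt") // assumed input path

    val inputFormat = new TextInputFormat()
    val context = new TaskAttemptContextImpl(job.getConfiguration, new TaskAttemptID())

    inputFormat.getSplits(job).asScala.foreach { split =>
      val reader = inputFormat.createRecordReader(split, context)
      reader.initialize(split, context)
      try {
        while (reader.nextKeyValue()) {
          println(s"${reader.getCurrentKey}\t${reader.getCurrentValue}") // byte offset and line
        }
      } finally {
        reader.close()
      }
    }
  }
}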
Example 1
Source File: WholeTextFileRecordReader.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil



private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
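The class above mixes in a Configurable trait and calls getConf; both come from a small package-local helper that the full WholeTextFileRecordReader.scala source file defines alongside it, but which is not included in these excerpts. A minimal sketch of that trait, based on the Spark source, looks roughly like this:

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}

// Package-local helper (sketch): stores a Hadoop Configuration so the combine reader
// can hand it to each per-file record reader it creates via setConf/getConf.
private[spark] trait Configurable extends HConfigurable {
  private var conf: Configuration = _
  def setConf(c: Configuration): Unit = {
    conf = c
  }
  def getConf: Configuration = conf
}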
Example 2
Source File: WholeTextFileRecordReader.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil



private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 3
Source File: WholeTextFileRecordReader.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}


private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 4
Source File: RecordReaderIterator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable

import org.apache.hadoop.mapreduce.RecordReader

import org.apache.spark.sql.catalyst.InternalRow


class RecordReaderIterator[T](
    private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): T = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    rowReader.getCurrentValue
  }

  override def close(): Unit = {
    if (rowReader != null) {
      try {
        rowReader.close()
      } finally {
        rowReader = null
      }
    }
  }
} 
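A hypothetical usage sketch for RecordReaderIterator: it can wrap any Hadoop RecordReader, so the example below feeds it a plain LineRecordReader and drains a text file as an Iterator[Text]. The file path, split, and TaskAttemptContext wiring are assumptions made for illustration only.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

val conf = new Configuration()
val path = new Path("file:///tmp/example.txt")                 // assumed input file
val len = path.getFileSystem(conf).getFileStatus(path).getLen
val split = new FileSplit(path, 0, len, Array.empty[String])
val context = new TaskAttemptContextImpl(conf, new TaskAttemptID())

val reader = new LineRecordReader()                            // RecordReader[LongWritable, Text]
reader.initialize(split, context)

val lines = new RecordReaderIterator[Text](reader)
try {
  lines.foreach(println)                                       // hasNext/next drive nextKeyValue
} finally {
  lines.close()                                                // releases the underlying reader
}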
Example 5
Source File: WholeTextFileInputFormat.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext


private[spark] class WholeTextFileInputFormat
  extends CombineFileInputFormat[Text, Text] with Configurable {

  // `Configurable` is the package-local trait defined in WholeTextFileRecordReader.scala.
  // isSplitable and createRecordReader exist in the full source file; only the
  // split-sizing helper is shown in this excerpt.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
} 
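For a sense of the arithmetic in setMinPartitions: with totalLen = 1 GiB (1,073,741,824 bytes) and minPartitions = 8, maxSplitSize = ceil(1,073,741,824 / 8) = 134,217,728 bytes (128 MiB), so CombineFileInputFormat packs whole files into splits of at most 128 MiB and ends up with roughly eight partitions.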
Example 6
Source File: WholeTextFileRecordReader.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil



private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 7
Source File: WholeTextFileInputFormat.scala    From iolap   with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext


private[spark] class WholeTextFileInputFormat
  extends CombineFileInputFormat[Text, Text] with Configurable {

  // `Configurable` is the package-local trait defined in WholeTextFileRecordReader.scala.
  // isSplitable and createRecordReader exist in the full source file; only the
  // split-sizing helper is shown in this excerpt.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
} 
Example 8
Source File: WholeTextFileRecordReader.scala    From iolap   with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil



private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 9
Source File: WholeTextFileRecordReader.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}


private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 10
Source File: RecordReaderIterator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable

import org.apache.hadoop.mapreduce.RecordReader

import org.apache.spark.sql.catalyst.InternalRow


class RecordReaderIterator[T](
    private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): T = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    rowReader.getCurrentValue
  }

  override def close(): Unit = {
    if (rowReader != null) {
      try {
        // Close the reader and release it. Note: it's very important that we don't close the
        // reader more than once, since that exposes us to MAPREDUCE-5918 when running against
        // older Hadoop 2.x releases. That bug can lead to non-deterministic corruption issues
        // when reading compressed input.
        rowReader.close()
      } finally {
        rowReader = null
      }
    }
  }
} 
Example 11
Source File: KeyValueReaderIterator.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.binary

import java.io.Closeable
import org.apache.hadoop.mapreduce.RecordReader

// Based on:
// https://github.com/apache/spark/blob/master/sql/core/src/main/scala/
//   org/apache/spark/sql/execution/datasources/RecordReaderIterator.scala


private class KeyValueReaderIterator[K, V] (
  private[this] var rowReader: RecordReader[K, V]) extends Iterator[(K, V)] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): (K, V) = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    (rowReader.getCurrentKey, rowReader.getCurrentValue)
  }

  override def close(): Unit = {
    if (rowReader != null) { //scalastyle:ignore null
      try {
        rowReader.close()
      } finally {
        rowReader = null //scalastyle:ignore null
      }
    }
  }

} 
Example 12
Source File: QueryTaskCompletionListener.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._

import org.apache.hadoop.mapreduce.RecordReader
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonQueryTaskCompletionListener
import org.apache.spark.sql.profiler.{Profiler, QueryTaskEnd}

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.memory.UnsafeMemoryManager
import org.apache.carbondata.core.stats.{QueryStatistic, QueryStatisticsConstants, QueryStatisticsRecorder}
import org.apache.carbondata.core.util.{DataTypeUtil, TaskMetricsMap, ThreadLocalTaskInfo}
import org.apache.carbondata.spark.InitInputMetrics

class QueryTaskCompletionListener(freeMemory: Boolean,
    var reader: RecordReader[Void, Object],
    inputMetricsStats: InitInputMetrics, executionId: String, taskId: Int, queryStartTime: Long,
    queryStatisticsRecorder: QueryStatisticsRecorder, split: Partition, queryId: String)
  extends CarbonQueryTaskCompletionListener {
  override def onTaskCompletion(context: TaskContext): Unit = {
    if (reader != null) {
      try {
        reader.close()
      } catch {
        case e: Exception =>
          LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e)
      }
      reader = null
    }
    TaskMetricsMap.getInstance().updateReadBytes(Thread.currentThread().getId)
    inputMetricsStats.updateAndClose()
    logStatistics(executionId, taskId, queryStartTime, queryStatisticsRecorder, split)
    if (freeMemory) {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
      DataTypeUtil.clearFormatter()
    }
  }

  def logStatistics(
      executionId: String,
      taskId: Long,
      queryStartTime: Long,
      recorder: QueryStatisticsRecorder,
      split: Partition
  ): Unit = {
    if (null != recorder) {
      val queryStatistic = new QueryStatistic()
      queryStatistic.addFixedTimeStatistic(QueryStatisticsConstants.EXECUTOR_PART,
        System.currentTimeMillis - queryStartTime)
      recorder.recordStatistics(queryStatistic)
      // print executor query statistics for each task_id
      val statistics = recorder.statisticsForTask(taskId, queryStartTime)
      if (statistics != null && executionId != null) {
        Profiler.invokeIfEnable {
          val inputSplit = split.asInstanceOf[CarbonSparkPartition].split.value
          inputSplit.calculateLength()
          val size = inputSplit.getLength
          val files = inputSplit.getAllSplits.asScala.map { s =>
            s.getSegmentId + "/" + s.getPath.getName
          }.toArray[String]
          Profiler.send(
            QueryTaskEnd(
              executionId.toLong,
              queryId,
              statistics.getValues,
              size,
              files
            )
          )
        }
      }
      recorder.logStatisticsForTask(statistics)
    }
  }
} 
Example 13
Source File: WholeTextFileInputFormat.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.input

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext


private[spark] class WholeTextFileInputFormat
  extends CombineFileInputFormat[Text, Text] with Configurable {

  // `Configurable` is the package-local trait defined in WholeTextFileRecordReader.scala.
  // isSplitable and createRecordReader exist in the full source file; only the
  // split-sizing helper is shown in this excerpt.
  def setMinPartitions(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum
    val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong
    super.setMaxSplitSize(maxSplitSize)
  }
} 
Example 14
Source File: RecordReaderIterator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable

import org.apache.hadoop.mapreduce.RecordReader

import org.apache.spark.sql.catalyst.InternalRow


class RecordReaderIterator[T](
    private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): T = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    rowReader.getCurrentValue
  }

  override def close(): Unit = {
    if (rowReader != null) {
      try {
        // Close the reader and release it. Note: it's very important that we don't close the
        // reader more than once, since that exposes us to MAPREDUCE-5918 when running against
        // older Hadoop 2.x releases. That bug can lead to non-deterministic corruption issues
        // when reading compressed input.
        rowReader.close()
      } finally {
        rowReader = null
      }
    }
  }
} 
Example 15
Source File: WholeTextFileRecordReader.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}


private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 16
Source File: RecordReaderIterator.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable

import org.apache.hadoop.mapreduce.RecordReader

import org.apache.spark.sql.catalyst.InternalRow


class RecordReaderIterator[T](
    private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): T = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    rowReader.getCurrentValue
  }

  override def close(): Unit = {
    if (rowReader != null) {
      try {
        // Close the reader and release it. Note: it's very important that we don't close the
        // reader more than once, since that exposes us to MAPREDUCE-5918 when running against
        // older Hadoop 2.x releases. That bug can lead to non-deterministic corruption issues
        // when reading compressed input.
        rowReader.close()
      } finally {
        rowReader = null
      }
    }
  }
} 
Example 17
Source File: WholeFileReader.scala    From magellan   with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = if (done) 1.0f else 0.0f

  override def nextKeyValue(): Boolean = {
    if (done){
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
} 
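WholeFileReader is only the RecordReader half of the pair; to use it from MapReduce or Spark, an InputFormat has to create it and mark files as unsplittable. The class below is a minimal sketch of such a wrapper and is not part of magellan itself:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

// Minimal sketch (not from the magellan project): each split is one whole file, and
// WholeFileReader turns that file into a single (NullWritable, Text) record.
class WholeFileInputFormat extends FileInputFormat[NullWritable, Text] {
  override protected def isSplitable(context: JobContext, file: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[NullWritable, Text] = new WholeFileReader()
}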
Example 18
Source File: ShapefileReader.scala    From magellan   with Apache License 2.0
package magellan.mapreduce

import java.io.DataInputStream

import org.apache.commons.io.EndianUtils
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

import magellan.io.{ShapeKey, ShapeWritable}

private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWritable] {

  private val key: ShapeKey = new ShapeKey()

  private var value: ShapeWritable = _

  private var dis: DataInputStream = _

  private var remaining: BigInt = _

  override def getProgress: Float = 0

  override def nextKeyValue(): Boolean = {
    if (remaining <= 0) {
      false
    } else {
      // record header has fixed length of 8 bytes
      // byte 0 = record #, byte 4 = content length
      val recordNumber = dis.readInt()
      // record numbers begin at 1
      require(recordNumber > 0)
      val contentLength = 2 * (dis.readInt() + 4)
      value.readFields(dis)
      remaining -= contentLength
      key.setRecordIndex(key.getRecordIndex() + 1)
      true
    }
  }

  override def getCurrentValue: ShapeWritable = value

  override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) {
    val split = inputSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)

    val path = split.getPath()
    val fs = path.getFileSystem(job)
    val is = fs.open(path)

    val (start, end) = {
      val v = split.getStart
      if (v == 0) {
        is.seek(24)
        (100L, 2 * is.readInt().toLong)
      } else {
        (v, v + split.getLength)
      }
    }

    is.seek(start)
    dis = new DataInputStream(is)
    key.setFileNamePrefix(split.getPath.getName.split("\\.")(0))
    value = new ShapeWritable()
    remaining = (end - start)
  }

  override def getCurrentKey: ShapeKey = key

  override def close(): Unit = dis.close()

} 
Example 19
Source File: OsmRecordReader.scala    From magellan   with Apache License 2.0
package magellan.mapreduce

import magellan.io.{OsmKey, OsmShape, OsmNode, OsmWay, OsmRelation}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
import scala.xml.{XML, Elem, Node}

private[magellan] class OsmRecordReader
  extends RecordReader[OsmKey, OsmShape] {
 
  val definedNodeLabels = Set("node", "way", "relation")
  var nodes : Seq[Node] = _
  
  var current : Int = 0
  lazy val total = nodes.length
  
  override def initialize(genericSplit: InputSplit, context: TaskAttemptContext) : Unit = {
    val split: FileSplit = genericSplit.asInstanceOf[FileSplit]
    val job = MapReduceUtils.getConfigurationFromContext(context)
    
    val file = split.getPath()
    val fs = file.getFileSystem(job)
    val fileIn = fs.open(file)
    
    val doc = XML.load(fileIn)
    fileIn.close()
    nodes = doc.child.filter(n => definedNodeLabels contains n.label)
  }
  
  override def nextKeyValue() : Boolean = {
    if (!nodes.isEmpty) { 
      if (current != 0) nodes = nodes.tail
      current += 1
    }
    !nodes.isEmpty
  }
  
  override def getCurrentKey() : OsmKey = {
    val current = nodes.head
    new OsmKey(current.label, (current \ "@id").text)
  }
  
  def getTags(shape: Node) = {
    (shape \ "tag").map(t => (t \ "@k").text -> (t \ "@v").text).toMap
  }
  
  def getOsmNode(shape: Node) = {
    new OsmNode(
        (shape \ "@id").text,
        (shape \ "@lat").text.toDouble,
        (shape \ "@lon").text.toDouble,
        getTags(shape))
  }
  
  def getOsmWay(shape: Node) = {
    new OsmWay((shape \ "@id").text, (shape \ "nd").map(w => (w \ "@ref").text), getTags(shape))
  }
  
  def getOsmRelation(shape: Node) = {
    new OsmRelation(
        (shape \ "@id").text,
        (shape \ "member").map(r => (r \ "@ref").text), getTags(shape)
    )
  }
  
  override def getCurrentValue() : OsmShape = {
    val current = nodes.head
    current.label match {
      case "node" => getOsmNode(current)
      case "way" => getOsmWay(current)
      case "relation" => getOsmRelation(current)
    }
  }
  
  override def getProgress() : Float = {
    current.toFloat / total
  }
  
  override def close() : Unit = { }
} 
Example 20
Source File: RecordReaderIterator.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable

import org.apache.hadoop.mapreduce.RecordReader

import org.apache.spark.sql.catalyst.InternalRow


class RecordReaderIterator[T](
    private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable {
  private[this] var havePair = false
  private[this] var finished = false

  override def hasNext: Boolean = {
    if (!finished && !havePair) {
      finished = !rowReader.nextKeyValue
      if (finished) {
        // Close and release the reader here; close() will also be called when the task
        // completes, but for tasks that read from many files, it helps to release the
        // resources early.
        close()
      }
      havePair = !finished
    }
    !finished
  }

  override def next(): T = {
    if (!hasNext) {
      throw new java.util.NoSuchElementException("End of stream")
    }
    havePair = false
    rowReader.getCurrentValue
  }

  override def close(): Unit = {
    if (rowReader != null) {
      try {
        rowReader.close()
      } finally {
        rowReader = null
      }
    }
  }
} 
Example 21
Source File: InputFormatConf.scala    From flint   with Apache License 2.0
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF

  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()
  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()
    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) {
        case WriSer(head) =>
          val rr = createRecordReader(hadoopConf, head)
          require(rr.nextKeyValue, "csv has no header, first line was empty")
          val afterHeader = rr.getCurrentKey.get
          require(rr.nextKeyValue, "first split is empty")
          WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
            splits.tail
      }
    }
  }
} 
Example 22
Source File: FileLocalityInputFormat.scala    From ArchiveSpark   with MIT License
package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {
  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false
  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] = new FileLocalityRecordReader
} 
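A hypothetical usage sketch: because FileLocalityInputFormat emits exactly one (NullWritable, Text) record per file, it can be passed to SparkContext.newAPIHadoopFile to build an RDD of file paths whose tasks are scheduled with the files' block locality. The SparkContext and input directory are assumptions.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// One record per file: the RDD ends up holding each file's path, computed on a node
// that holds the file's blocks where possible.
def filePathsWithLocality(sc: SparkContext, dir: String): RDD[String] =
  sc.newAPIHadoopFile(
    dir,                                    // assumed input directory or glob
    classOf[FileLocalityInputFormat],
    classOf[NullWritable],
    classOf[Text]
  ).map { case (_, path) => path.toString } // toString copies the value out of the Text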
Example 23
Source File: TFRecordInputFormat.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null


    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
} 
Example 24
Source File: RosbagInputFormat.scala    From ros_hadoop   with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}



class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
} 
Example 25
Source File: WholeTextFileRecordReader.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}


private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
} 
Example 26
Source File: PortableDataStream.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}


class PortableDataStream(
    isplit: CombineFileSplit,
    context: TaskAttemptContext,
    index: Integer)
  extends Serializable {

  // Excerpt: the `path` field and the open() method used below are defined in the
  // full source file; only these two helper methods are shown here.
  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}