org.apache.hadoop.io.BytesWritable Scala Examples

The following examples show how to use org.apache.hadoop.io.BytesWritable. Each example notes its source file, the open-source project it comes from, and that project's license.
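Before the project examples, the short sketch below illustrates the basic BytesWritable API the snippets build on: wrapping a byte array, reusing an instance with set, and taking an exact-length copy with copyBytes. It uses only the standard Hadoop API; the object name is just for illustration.

import org.apache.hadoop.io.BytesWritable

object BytesWritableBasics {
  def main(args: Array[String]): Unit = {
    val payload = "hello".getBytes("UTF-8")

    // Wrap a byte array for use as a Hadoop key or value.
    val bw = new BytesWritable(payload)
    println(s"valid bytes: ${bw.getLength}")

    // Reuse the same instance for another record (a common pattern in the writers below).
    val next = "world".getBytes("UTF-8")
    bw.set(next, 0, next.length)

    // copyBytes returns an exact-length copy of the current contents.
    println(new String(bw.copyBytes(), "UTF-8"))
  }
}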
Example 1
Source File: DefaultSource.scala    From spark-tensorflow-connector    with Apache License 2.0
package org.trustedanalytics.spark.datasources.tensorflow

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.sql._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.tensorflow.hadoop.io.TFRecordFileOutputFormat
import org.trustedanalytics.spark.datasources.tensorflow.serde.DefaultTfRecordRowEncoder


// NOTE: the class declaration was omitted from the original listing; the header below
// is reconstructed from the overridden methods and may differ slightly from upstream.
class DefaultSource extends DataSourceRegister
    with CreatableRelationProvider
    with RelationProvider
    with SchemaRelationProvider {

  override def shortName(): String = "tensorflow"

  // Writes DataFrame as TensorFlow Records
  override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame): BaseRelation = {

    val path = parameters("path")

    //Export DataFrame as TFRecords
    val features = data.rdd.map(row => {
      val example = DefaultTfRecordRowEncoder.encodeTfRecord(row)
      (new BytesWritable(example.toByteArray), NullWritable.get())
    })
    features.saveAsNewAPIHadoopFile[TFRecordFileOutputFormat](path)

    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }

  override def createRelation(sqlContext: SQLContext,
                      parameters: Map[String, String],
                      schema: StructType): BaseRelation = {
    TensorflowRelation(parameters, Some(schema))(sqlContext.sparkSession)
  }

  // Reads TensorFlow Records into DataFrame
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): TensorflowRelation = {
    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }
} 
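Given the shortName above, this data source is typically driven through the DataFrame reader/writer API. The sketch below is a hedged illustration of such a round trip: the format name comes from shortName(), while the SparkSession setup, sample data, and output path are assumptions for the example.

import org.apache.spark.sql.SparkSession

object TfRecordRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("TfRecordRoundTrip").getOrCreate()
    import spark.implicits._

    val df = Seq((1L, "a"), (2L, "b")).toDF("id", "label")

    // "tensorflow" resolves to the DefaultSource above via DataSourceRegister.
    df.write.format("tensorflow").save("/tmp/example-tfrecords")

    val loaded = spark.read.format("tensorflow").load("/tmp/example-tfrecords")
    loaded.show()

    spark.stop()
  }
}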
Example 2
Source File: ScalaTeraSort.scala    From Swallow    with Apache License 2.0
package com.intel.hibench.sparkbench.micro

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.hadoop.examples.terasort.{TeraInputFormat, TeraOutputFormat}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.BytesWritable
import org.apache.spark._
import org.apache.spark.rdd._

import scala.reflect.ClassTag

object ScalaTeraSort {
  implicit def rddToSampledOrderedRDDFunctions[K: Ordering : ClassTag, V: ClassTag]
  (rdd: RDD[(K, V)]) = new ConfigurableOrderedRDDFunctions[K, V, (K, V)](rdd)

  implicit def ArrayByteOrdering: Ordering[Array[Byte]] = Ordering.fromLessThan {
    case (a, b) => (new BytesWritable(a).compareTo(new BytesWritable(b))) < 0
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        s"Usage: $ScalaTeraSort <INPUT_HDFS> <OUTPUT_HDFS>"
      )
      System.exit(1)
    }
    val sparkConf = new SparkConf().setAppName("ScalaTeraSort")
                                   .set("spark.shuffle.compress", "false")
                                   .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
                                   .set("spark.smartCompress", "false")
                         
    val sc = new SparkContext(sparkConf)
    val io = new IOCommon(sc)

    //val file = io.load[String](args(0), Some("Text"))
    val data = sc.newAPIHadoopFile[Text, Text, TeraInputFormat](args(0)).map {
      case (k,v) => (k.copyBytes, v.copyBytes)
    }
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val reducer  = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    val partitioner = new BaseRangePartitioner(partitions = reducer, rdd = data)
    val ordered_data = new ConfigurableOrderedRDDFunctions[Array[Byte], Array[Byte], (Array[Byte], Array[Byte])](data)
    val sorted_data = ordered_data.sortByKeyWithPartitioner(partitioner = partitioner).map{case (k, v)=>(new Text(k), new Text(v))}

    sorted_data.saveAsNewAPIHadoopFile[TeraOutputFormat](args(1))
    //io.save(args(1), sorted_data)

    sc.stop()
  }
} 
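The ArrayByteOrdering implicit above gets a lexicographic, unsigned byte comparison for free by delegating to BytesWritable.compareTo. The standalone sketch below demonstrates the same idea outside the TeraSort job; the object name and sample keys are only for illustration.

import org.apache.hadoop.io.BytesWritable

object ByteArrayOrderingDemo {
  // Lexicographic, unsigned comparison of byte arrays via BytesWritable.compareTo.
  implicit val byteArrayOrdering: Ordering[Array[Byte]] = Ordering.fromLessThan {
    (a, b) => new BytesWritable(a).compareTo(new BytesWritable(b)) < 0
  }

  def main(args: Array[String]): Unit = {
    val keys = Seq(Array[Byte](0, -1), Array[Byte](0, 1), Array[Byte](0, 2))
    // -1 is 0xFF unsigned, so it sorts after 0x01 and 0x02.
    keys.sorted.foreach(k => println(k.map(b => f"${b & 0xff}%02x").mkString))
  }
}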
Example 3
Source File: TensorflowRelation.scala    From ecosystem    with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.tensorflow.example.{SequenceExample, Example}
import org.tensorflow.hadoop.io.TFRecordFileInputFormat
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder


case class TensorflowRelation(options: Map[String, String], customSchema: Option[StructType]=None)
                             (@transient val session: SparkSession) extends BaseRelation with TableScan {

  //Import TFRecords as DataFrame happens here
  lazy val (tfRdd, tfSchema) = {
    val rdd = session.sparkContext.newAPIHadoopFile(options("path"), classOf[TFRecordFileInputFormat], classOf[BytesWritable], classOf[NullWritable])

    val recordType = options.getOrElse("recordType", "Example")

    recordType match {
      case "Example" =>
        val exampleRdd = rdd.map{case (bytesWritable, nullWritable) =>
          Example.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd))
        val rowRdd = exampleRdd.map(example => DefaultTfRecordRowDecoder.decodeExample(example, finalSchema))
        (rowRdd, finalSchema)
      case "SequenceExample" =>
        val sequenceExampleRdd = rdd.map{case (bytesWritable, nullWritable) =>
          SequenceExample.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd))
        val rowRdd = sequenceExampleRdd.map(example => DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema))
        (rowRdd, finalSchema)
      case _ =>
        throw new IllegalArgumentException(s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample")
    }
  }

  override def sqlContext: SQLContext = session.sqlContext

  override def schema: StructType = tfSchema

  override def buildScan(): RDD[Row] = tfRdd
} 
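One caveat worth flagging in the decoder above: BytesWritable.getBytes returns the internal backing array, which can be longer than the record itself, so length-sensitive parsers such as the protobuf Example.parseFrom call may need an exact-length slice. The helper below is an illustrative sketch (the object and method names are made up); copyBytes achieves the same result.

import java.util.Arrays
import org.apache.hadoop.io.BytesWritable

object BytesWritableTrim {
  // Returns only the valid bytes of a BytesWritable (equivalent to bw.copyBytes()).
  def exactBytes(bw: BytesWritable): Array[Byte] =
    Arrays.copyOfRange(bw.getBytes, 0, bw.getLength)
}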
Example 4
Source File: LasOutputWriter.scala    From spark-iqmulus    with Apache License 2.0
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0.0, 0.0, 0.0),
  scale: Array[Double] = Array(0.01, 0.01, 0.01)
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdr to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
} 
Example 5
Source File: PlyOutputWriter.scala    From spark-iqmulus    with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
} 
Example 6
Source File: ReadingWritingData.scala    From Spark-RSVD    with Apache License 2.0
package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {

    logger.info(s"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "
  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path,
                    classOf[NullWritable],
                    classOf[BytesWritable],
                    minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap {
          case (_, v) =>
            instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {

    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf,
                                 RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }

} 
Example 7
Source File: RecordIOOutputFormatTests.scala    From sagemaker-spark    with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter


class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream : FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }


  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)
    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)
    verify(mockFileSystem).create(mockPath, true)

  }

} 
Example 8
Source File: SageMakerProtobufWriter.scala    From sagemaker-spark    with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType


// NOTE: the class declaration and field setup were omitted from the original listing;
// the skeleton below is a plausible reconstruction. The constructor parameters, the
// ByteArrayOutputStream buffer, and the way the RecordIO record writer is obtained are
// assumptions, not the verbatim upstream code (the upstream class also bridges
// write(InternalRow) via CatalystTypeConverters, which is omitted here).
class SageMakerProtobufWriter(path: String,
                              context: TaskAttemptContext,
                              dataSchema: StructType,
                              options: Map[String, String]) extends OutputWriter {

  private val byteArrayOutputStream = new ByteArrayOutputStream()

  // Assumed: records are appended through a RecordWriter from RecordIOOutputFormat.
  private lazy val recordWriter: RecordWriter[NullWritable, BytesWritable] =
    new RecordIOOutputFormat().getRecordWriter(context)

  def write(row: Row): Unit = {
    val labelColumnName = options.getOrElse("labelColumnName", "label")
    val featuresColumnName = options.getOrElse("featuresColumnName", "features")

    val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
    record.writeTo(byteArrayOutputStream)

    recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
    byteArrayOutputStream.reset()
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
} 
Example 9
Source File: ImageInputDStream.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.examples.streaming

import java.io.InputStream
import java.net.Socket
import org.apache.hadoop.io.BytesWritable
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.Logging


class ImageInputDStream(@transient ssc_ : StreamingContext, host: String, port:
Int, storageLevel: StorageLevel) extends
  ReceiverInputDStream[BytesWritable](ssc_) with Logging {
  override def getReceiver(): Receiver[BytesWritable] = {
    new ImageRecevier(host, port, storageLevel)
  }
}

class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel) extends
  Receiver[BytesWritable](storageLevel) with Logging {
  override def onStart(): Unit = {
    new Thread("Image Socket") {
      setDaemon(true)

      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  def receive(): Unit = {
    var socket: Socket = null
    var in: InputStream = null
    try {
      log.info("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      log.info("Connected to " + host + ":" + port)
      in = socket.getInputStream
      val buf = new ArrayBuffer[Byte]()
      var bytes = new Array[Byte](1024)
      var len = 0
      while (-1 < len) {
        len = in.read(bytes)
        if (len > 0) {
          buf ++= bytes.take(len) // append only the bytes actually read
        }
      }
      val bw = new BytesWritable(buf.toArray)
      log.error("byte:::::" + bw.getLength)
      store(bw)
      log.info("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (in != null) {
        in.close()
      }
      if (socket != null) {
        socket.close()
        log.info("Closed socket to " + host + ":" + port)
      }
    }
  }

  override def onStop(): Unit = {

  }
} 
Example 10
Source File: ErrorEventsWriter.scala    From etl-light    with MIT License
package yamrcraft.etlite.writers

import java.io.OutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile, Text}
import org.json4s.jackson.Serialization
import org.json4s.{DefaultFormats, Formats, ShortTypeHints}
import yamrcraft.etlite.utils.FileUtils


case class ErrorInfo(
  errorType: String,
  errorMsg: Option[String]
)


// NOTE: the ErrorEventsWriter trait referenced below is not shown in the original
// listing; this minimal definition is reconstructed from the members overridden here.
trait ErrorEventsWriter {
  def write(errorEvent: (Array[Byte], ErrorInfo)): Unit
  def commit(): Unit
}

class ErrorEventWriter(folder: String, jobId: Long, partitionId: Int)
  extends ErrorEventsWriter {

  // incremental record id
  var recordId = 1

  val fs = FileUtils.getFS(folder)

  val seqPath = new Path(folder, s"errors_job${jobId}_part$partitionId.seq")
  if (fs.exists(seqPath)) {
    fs.delete(seqPath, false)
  }

  val metaPath = new Path(folder, s"errors_job${jobId}_part$partitionId.meta.seq")
  if (fs.exists(metaPath)) {
    fs.delete(metaPath, false)
  }

  private var seqWriter: Option[SequenceFile.Writer] = None
  private var metaWriter: Option[SequenceFile.Writer] = None

  implicit val formats =
    new Formats {
      val dateFormat = DefaultFormats.lossless.dateFormat
      override val typeHints = ShortTypeHints(List(classOf[ErrorInfo]))
      override val typeHintFieldName = "type"
    }

  override def write(errorEvent: (Array[Byte], ErrorInfo)) = {

    if (seqWriter.isEmpty) {
      seqWriter = createSequenceFile(seqPath, classOf[IntWritable], classOf[BytesWritable])
      metaWriter = createSequenceFile(metaPath, classOf[IntWritable], classOf[Text])
    }

    val id = new IntWritable(recordId)
    seqWriter.get.append(id, new BytesWritable(errorEvent._1))
    metaWriter.get.append(id, new Text(Serialization.write(errorEvent._2)))

    recordId += 1
  }

  override def commit() = {
    seqWriter.foreach(p => p.close())
    metaWriter.foreach(p => p.close())
  }

  private def createSequenceFile(path: Path, keyClass: Class[_], valueClass: Class[_]) = {
    val optPath = SequenceFile.Writer.file(path)
    val optKey = SequenceFile.Writer.keyClass(keyClass)
    val optVal = SequenceFile.Writer.valueClass(valueClass)
    Some(SequenceFile.createWriter(fs.getConf, optPath, optKey, optVal))
  }
} 
Example 11
Source File: RosbagInputFormat.scala    From ros_hadoop    with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}



class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
} 
Example 12
Source File: SequenceSinkTest.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}
import org.scalatest.{Matchers, WordSpec}

class SequenceSinkTest extends WordSpec with Matchers {

  private val ds = DataStream.fromValues(
    StructType("a", "b", "c", "d"),
    Seq(
      List("1", "2", "3", "4"),
      List("5", "6", "7", "8")
    )
  )

  "SequenceSink" should {
    "write sequence files" in {

      implicit val conf = new Configuration
      implicit val fs = FileSystem.get(conf)

      val path = new Path("seqsink.seq")
      if (fs.exists(path))
        fs.delete(path, true)

      ds.to(SequenceSink(path))

      val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path))

      val k = new IntWritable
      val v = new BytesWritable

      val set = for (_ <- 1 to 3) yield {
        reader.next(k, v)
        new String(v.copyBytes)
      }

      set.toSet shouldBe Set(
        "a,b,c,d",
        "1,2,3,4",
        "5,6,7,8"
      )

      reader.close()

      fs.delete(path, true)
    }
  }
} 
Example 13
Source File: SequenceSource.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels._
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging {
  logger.debug(s"Creating sequence source from $path")

  override def schema: StructType = SequenceSupport.schema(path)
  override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path))
}

object SequenceReaderIterator {
  def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] {
    private val k = new IntWritable()
    private val v = new BytesWritable()
    // throw away the header
    reader.next(k, v)
    override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector)
    override def hasNext(): Boolean = reader.next(k, v)
  }
}

class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(SequenceSupport.createReader(path)) { reader =>
        val schema = SequenceSupport.schema(path)
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        SequenceReaderIterator(schema, reader)
          .takeWhile(_ => running.get)
          .grouped(DataStream.DefaultBatchSize)
          .foreach(subscriber.next)

        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
} 
Example 14
Source File: SequenceSupport.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.io.StringReader
import java.nio.charset.Charset

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels.component.csv.{CsvFormat, CsvSupport}
import io.eels.schema.{Field, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

object SequenceSupport extends Logging with Using {

  def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader =
    new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))

  def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8")))

  def toValues(str: String): Array[String] = {
    val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null)
    parser.beginParsing(new StringReader(str))
    val record = parser.parseNext()
    parser.stopParsing()
    record
  }

  def schema(path: Path)(implicit conf: Configuration): StructType = {
    logger.debug(s"Fetching sequence schema for $path")
    using(createReader(path)) { it =>
      val k = new IntWritable()
      val v = new BytesWritable()
      val fields: Array[Field] = {
        it.next(k, v)
        toValues(v).map { it => new Field(it) }
      }
      StructType(fields.toList)
    }
  }
} 
Example 15
Source File: SequenceSink.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.io.StringWriter

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}
import io.eels.{Row, Sink, SinkWriter}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink {

  override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path)

  class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter {

    val writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(path),
      SequenceFile.Writer.keyClass(classOf[IntWritable]),
      SequenceFile.Writer.valueClass(classOf[BytesWritable])
    )

    val key = new IntWritable(0)

    val headers = valuesToCsv(schema.fieldNames())
    writer.append(key, new BytesWritable(headers.getBytes))

    override def close(): Unit = writer.close()

    override def write(row: Row): Unit = {
      this.synchronized {
        val csv = valuesToCsv(row.values)
        writer.append(key, new BytesWritable(csv.getBytes()))
        key.set(key.get() + 1)
      }
    }

    private def valuesToCsv(values: Seq[Any]): String = {
      val swriter = new StringWriter()
      val csv = new CsvWriter(swriter, new CsvWriterSettings())
      csv.writeRow(values.map {
        case null => null
        case other => other.toString
      }: _*)
      csv.close()
      swriter.toString().trim()
    }
  }
} 
Example 16
Source File: PailDataSource.scala    From utils    with Apache License 2.0
package com.indix.utils.spark.pail

import com.backtype.hadoop.pail._
import com.backtype.support.{Utils => PailUtils}
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

trait PailDataSource {

  implicit class PailBasedReader(sc: SparkContext) {
    def pailFile[R: ClassTag](inputLocation: String): RDD[R] = {
      pailFileWithInfo[R](inputLocation).map(_._2)
    }

    def pailFileWithInfo[R: ClassTag](inputLocation: String) = {
      val pail = new Pail(inputLocation)
      val pailSpec = pail.getSpec
      val inputFormat = pail.getFormat.getInputFormatClass.asSubclass(classOf[InputFormat[PailRecordInfo, BytesWritable]])
      sc.hadoopFile(inputLocation, inputFormat, classOf[PailRecordInfo], classOf[BytesWritable])
        .map {
          case (recordInfo, recordInBytes) =>
            recordInfo -> pailSpec.getStructure.deserialize(recordInBytes.getBytes).asInstanceOf[R]
        }
    }
  }

  implicit class PailBasedWriter[R: ClassTag](rdd: RDD[R]) {
    def saveAsPail(outputLocation: String, pailSpec: PailSpec) = {
      val jobConf = new JobConf(rdd.context.hadoopConfiguration)

      PailUtils.setObject(jobConf, PailOutputFormat.SPEC_ARG, pailSpec)

      rdd.map { record =>
        val pailStruct = pailSpec.getStructure.asInstanceOf[PailStructure[R]]

        val attr = PailUtils.join(pailStruct.getTarget(record), "/")
        val recordInBytes = pailStruct.serialize(record)
        new Text(attr) -> new BytesWritable(recordInBytes)
      }.saveAsHadoopFile(outputLocation, classOf[Text], classOf[BytesWritable], classOf[PailOutputFormat], jobConf)
    }
  }

} 
Example 17
Source File: COCOSeqFileGenerator.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  
  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key
          .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile
          .CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
} 
Example 18
Source File: TFRecordInputFormat.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null


    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
} 
Example 19
Source File: TFRecordOutputFormat.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable]{
  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
  RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)

    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}