org.apache.hadoop.io.BytesWritable Scala Examples

The following examples show how to use org.apache.hadoop.io.BytesWritable. Each example notes its source file, the open-source project it comes from, and that project's license.
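Before the project examples, the short sketch below illustrates the basic BytesWritable API the snippets build on: wrapping a byte array, reusing an instance with set, and taking an exact-length copy with copyBytes. It uses only the standard Hadoop API; the object name is just for illustration.

import org.apache.hadoop.io.BytesWritable

object BytesWritableBasics {
  def main(args: Array[String]): Unit = {
    val payload = "hello".getBytes("UTF-8")

    // Wrap a byte array for use as a Hadoop key or value.
    val bw = new BytesWritable(payload)
    println(s"valid bytes: ${bw.getLength}")

    // Reuse the same instance for another record (a common pattern in the writers below).
    val next = "world".getBytes("UTF-8")
    bw.set(next, 0, next.length)

    // copyBytes returns an exact-length copy of the current contents.
    println(new String(bw.copyBytes(), "UTF-8"))
  }
}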
Example 1
Source File: DefaultSource.scala    From spark-tensorflow-connector    with Apache License 2.0
package org.trustedanalytics.spark.datasources.tensorflow

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.sql._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.tensorflow.hadoop.io.TFRecordFileOutputFormat
import org.trustedanalytics.spark.datasources.tensorflow.serde.DefaultTfRecordRowEncoder


// NOTE: the class declaration was omitted from the original listing; the header below
// is reconstructed from the overridden methods and may differ slightly from upstream.
class DefaultSource extends DataSourceRegister
    with CreatableRelationProvider
    with RelationProvider
    with SchemaRelationProvider {

  override def shortName(): String = "tensorflow"

  // Writes DataFrame as TensorFlow Records
  override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame): BaseRelation = {

    val path = parameters("path")

    //Export DataFrame as TFRecords
    val features = data.rdd.map(row => {
      val example = DefaultTfRecordRowEncoder.encodeTfRecord(row)
      (new BytesWritable(example.toByteArray), NullWritable.get())
    })
    features.saveAsNewAPIHadoopFile[TFRecordFileOutputFormat](path)

    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }

  override def createRelation(sqlContext: SQLContext,
                      parameters: Map[String, String],
                      schema: StructType): BaseRelation = {
    TensorflowRelation(parameters, Some(schema))(sqlContext.sparkSession)
  }

  // Reads TensorFlow Records into DataFrame
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): TensorflowRelation = {
    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }
} 
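Given the shortName above, this data source is typically driven through the DataFrame reader/writer API. The sketch below is a hedged illustration of such a round trip: the format name comes from shortName(), while the SparkSession setup, sample data, and output path are assumptions for the example.

import org.apache.spark.sql.SparkSession

object TfRecordRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("TfRecordRoundTrip").getOrCreate()
    import spark.implicits._

    val df = Seq((1L, "a"), (2L, "b")).toDF("id", "label")

    // "tensorflow" resolves to the DefaultSource above via DataSourceRegister.
    df.write.format("tensorflow").save("/tmp/example-tfrecords")

    val loaded = spark.read.format("tensorflow").load("/tmp/example-tfrecords")
    loaded.show()

    spark.stop()
  }
}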
Example 2
Source File: ScalaTeraSort.scala    From Swallow    with Apache License 2.0
package com.intel.hibench.sparkbench.micro

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.hadoop.examples.terasort.{TeraInputFormat, TeraOutputFormat}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.BytesWritable
import org.apache.spark._
import org.apache.spark.rdd._

import scala.reflect.ClassTag

object ScalaTeraSort {
  implicit def rddToSampledOrderedRDDFunctions[K: Ordering : ClassTag, V: ClassTag]
  (rdd: RDD[(K, V)]) = new ConfigurableOrderedRDDFunctions[K, V, (K, V)](rdd)

  implicit def ArrayByteOrdering: Ordering[Array[Byte]] = Ordering.fromLessThan {
    case (a, b) => (new BytesWritable(a).compareTo(new BytesWritable(b))) < 0
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        s"Usage: $ScalaTeraSort <INPUT_HDFS> <OUTPUT_HDFS>"
      )
      System.exit(1)
    }
    val sparkConf = new SparkConf().setAppName("ScalaTeraSort")
                                   .set("spark.shuffle.compress", "false")
                                   .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
                                   .set("spark.smartCompress", "false")
                         
    val sc = new SparkContext(sparkConf)
    val io = new IOCommon(sc)

    //val file = io.load[String](args(0), Some("Text"))
    val data = sc.newAPIHadoopFile[Text, Text, TeraInputFormat](args(0)).map {
      case (k,v) => (k.copyBytes, v.copyBytes)
    }
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val reducer  = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    val partitioner = new BaseRangePartitioner(partitions = reducer, rdd = data)
    val ordered_data = new ConfigurableOrderedRDDFunctions[Array[Byte], Array[Byte], (Array[Byte], Array[Byte])](data)
    val sorted_data = ordered_data.sortByKeyWithPartitioner(partitioner = partitioner).map{case (k, v)=>(new Text(k), new Text(v))}

    sorted_data.saveAsNewAPIHadoopFile[TeraOutputFormat](args(1))
    //io.save(args(1), sorted_data)

    sc.stop()
  }
} 
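The ArrayByteOrdering implicit above gets a lexicographic, unsigned byte comparison for free by delegating to BytesWritable.compareTo. The standalone sketch below demonstrates the same idea outside the TeraSort job; the object name and sample keys are only for illustration.

import org.apache.hadoop.io.BytesWritable

object ByteArrayOrderingDemo {
  // Lexicographic, unsigned comparison of byte arrays via BytesWritable.compareTo.
  implicit val byteArrayOrdering: Ordering[Array[Byte]] = Ordering.fromLessThan {
    (a, b) => new BytesWritable(a).compareTo(new BytesWritable(b)) < 0
  }

  def main(args: Array[String]): Unit = {
    val keys = Seq(Array[Byte](0, -1), Array[Byte](0, 1), Array[Byte](0, 2))
    // -1 is 0xFF unsigned, so it sorts after 0x01 and 0x02.
    keys.sorted.foreach(k => println(k.map(b => f"${b & 0xff}%02x").mkString))
  }
}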
Example 3
Source File: TensorflowRelation.scala    From ecosystem    with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.tensorflow.example.{SequenceExample, Example}
import org.tensorflow.hadoop.io.TFRecordFileInputFormat
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder


case class TensorflowRelation(options: Map[String, String], customSchema: Option[StructType]=None)
                             (@transient val session: SparkSession) extends BaseRelation with TableScan {

  //Import TFRecords as DataFrame happens here
  lazy val (tfRdd, tfSchema) = {
    val rdd = session.sparkContext.newAPIHadoopFile(options("path"), classOf[TFRecordFileInputFormat], classOf[BytesWritable], classOf[NullWritable])

    val recordType = options.getOrElse("recordType", "Example")

    recordType match {
      case "Example" =>
        val exampleRdd = rdd.map{case (bytesWritable, nullWritable) =>
          Example.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd))
        val rowRdd = exampleRdd.map(example => DefaultTfRecordRowDecoder.decodeExample(example, finalSchema))
        (rowRdd, finalSchema)
      case "SequenceExample" =>
        val sequenceExampleRdd = rdd.map{case (bytesWritable, nullWritable) =>
          SequenceExample.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd))
        val rowRdd = sequenceExampleRdd.map(example => DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema))
        (rowRdd, finalSchema)
      case _ =>
        throw new IllegalArgumentException(s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample")
    }
  }

  override def sqlContext: SQLContext = session.sqlContext

  override def schema: StructType = tfSchema

  override def buildScan(): RDD[Row] = tfRdd
} 
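One caveat worth flagging in the decoder above: BytesWritable.getBytes returns the internal backing array, which can be longer than the record itself, so length-sensitive parsers such as the protobuf Example.parseFrom call may need an exact-length slice. The helper below is an illustrative sketch (the object and method names are made up); copyBytes achieves the same result.

import java.util.Arrays
import org.apache.hadoop.io.BytesWritable

object BytesWritableTrim {
  // Returns only the valid bytes of a BytesWritable (equivalent to bw.copyBytes()).
  def exactBytes(bw: BytesWritable): Array[Byte] =
    Arrays.copyOfRange(bw.getBytes, 0, bw.getLength)
}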
Example 4
Source File: LasOutputWriter.scala    From spark-iqmulus    with Apache License 2.0
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0.0, 0.0, 0.0),
  scale: Array[Double] = Array(0.01, 0.01, 0.01)
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdr to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
} 
Example 5
Source File: PlyOutputWriter.scala    From spark-iqmulus    with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
)
    extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
} 
Example 6
Source File: ReadingWritingData.scala    From Spark-RSVD    with Apache License 2.0
package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {

    logger.info(s"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "
  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path,
                    classOf[NullWritable],
                    classOf[BytesWritable],
                    minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap {
          case (_, v) =>
            instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {

    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf,
                                 RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }

} 
Example 7
Source File: RecordIOOutputFormatTests.scala    From sagemaker-spark    with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter


class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream : FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }


  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)
    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)
    verify(mockFileSystem).create(mockPath, true)

  }

} 
Example 8
Source File: SageMakerProtobufWriter.scala    From sagemaker-spark    with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType


// NOTE: the class declaration and field setup were omitted from the original listing;
// the skeleton below is a plausible reconstruction. The constructor parameters, the
// ByteArrayOutputStream buffer, and the way the RecordIO record writer is obtained are
// assumptions, not the verbatim upstream code (the upstream class also bridges
// write(InternalRow) via CatalystTypeConverters, which is omitted here).
class SageMakerProtobufWriter(path: String,
                              context: TaskAttemptContext,
                              dataSchema: StructType,
                              options: Map[String, String]) extends OutputWriter {

  private val byteArrayOutputStream = new ByteArrayOutputStream()

  // Assumed: records are appended through a RecordWriter from RecordIOOutputFormat.
  private lazy val recordWriter: RecordWriter[NullWritable, BytesWritable] =
    new RecordIOOutputFormat().getRecordWriter(context)

  def write(row: Row): Unit = {
    val labelColumnName = options.getOrElse("labelColumnName", "label")
    val featuresColumnName = options.getOrElse("featuresColumnName", "features")

    val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
    record.writeTo(byteArrayOutputStream)

    recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
    byteArrayOutputStream.reset()
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
} 
Example 9
Source File: ImageInputDStream.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.examples.streaming

import java.io.InputStream
import java.net.Socket
import org.apache.hadoop.io.BytesWritable
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.Logging


class ImageInputDStream(@transient ssc_ : StreamingContext, host: String, port:
Int, storageLevel: StorageLevel) extends
  ReceiverInputDStream[BytesWritable](ssc_) with Logging {
  override def getReceiver(): Receiver[BytesWritable] = {
    new ImageRecevier(host, port, storageLevel)
  }
}

class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel) extends
  Receiver[BytesWritable](storageLevel) with Logging {
  override def onStart(): Unit = {
    new Thread("Image Socket") {
      setDaemon(true)

      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  def receive(): Unit = {
    var socket: Socket = null
    var in: InputStream = null
    try {
      log.info("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      log.info("Connected to " + host + ":" + port)
      in = socket.getInputStream
      val buf = new ArrayBuffer[Byte]()
      var bytes = new Array[Byte](1024)
      var len = 0
      while (-1 < len) {
        len = in.read(bytes)
        if (len > 0) {
          buf ++= bytes.take(len) // append only the bytes actually read
        }
      }
      val bw = new BytesWritable(buf.toArray)
      log.error("byte:::::" + bw.getLength)
      store(bw)
      log.info("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (in != null) {
        in.close()
      }
      if (socket != null) {
        socket.close()
        log.info("Closed socket to " + host + ":" + port)
      }
    }
  }

  override def onStop(): Unit = {

  }
} 
Example 10
Source File: ErrorEventsWriter.scala    From etl-light    with MIT License
package yamrcraft.etlite.writers

import java.io.OutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile, Text}
import org.json4s.jackson.Serialization
import org.json4s.{DefaultFormats, Formats, ShortTypeHints}
import yamrcraft.etlite.utils.FileUtils


case class ErrorInfo(
  errorType: String,
  errorMsg: Option[String]
)


// NOTE: the ErrorEventsWriter trait referenced below is not shown in the original
// listing; this minimal definition is reconstructed from the members overridden here.
trait ErrorEventsWriter {
  def write(errorEvent: (Array[Byte], ErrorInfo)): Unit
  def commit(): Unit
}

class ErrorEventWriter(folder: String, jobId: Long, partitionId: Int)
  extends ErrorEventsWriter {

  // incremental record id
  var recordId = 1

  val fs = FileUtils.getFS(folder)

  val seqPath = new Path(folder, s"errors_job${jobId}_part$partitionId.seq")
  if (fs.exists(seqPath)) {
    fs.delete(seqPath, false)
  }

  val metaPath = new Path(folder, s"errors_job${jobId}_part$partitionId.meta.seq")
  if (fs.exists(metaPath)) {
    fs.delete(metaPath, false)
  }

  private var seqWriter: Option[SequenceFile.Writer] = None
  private var metaWriter: Option[SequenceFile.Writer] = None

  implicit val formats =
    new Formats {
      val dateFormat = DefaultFormats.lossless.dateFormat
      override val typeHints = ShortTypeHints(List(classOf[ErrorInfo]))
      override val typeHintFieldName = "type"
    }

  override def write(errorEvent: (Array[Byte], ErrorInfo)) = {

    if (seqWriter.isEmpty) {
      seqWriter = createSequenceFile(seqPath, classOf[IntWritable], classOf[BytesWritable])
      metaWriter = createSequenceFile(metaPath, classOf[IntWritable], classOf[Text])
    }

    val id = new IntWritable(recordId)
    seqWriter.get.append(id, new BytesWritable(errorEvent._1))
    metaWriter.get.append(id, new Text(Serialization.write(errorEvent._2)))

    recordId += 1
  }

  override def commit() = {
    seqWriter.foreach(p => p.close())
    metaWriter.foreach(p => p.close())
  }

  private def createSequenceFile(path: Path, keyClass: Class[_], valueClass: Class[_]) = {
    val optPath = SequenceFile.Writer.file(path)
    val optKey = SequenceFile.Writer.keyClass(keyClass)
    val optVal = SequenceFile.Writer.valueClass(valueClass)
    Some(SequenceFile.createWriter(fs.getConf, optPath, optKey, optVal))
  }
} 
Example 11
Source File: RosbagInputFormat.scala    From ros_hadoop    with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}



class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
} 
Example 12
Source File: SequenceSinkTest.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}
import org.scalatest.{Matchers, WordSpec}

class SequenceSinkTest extends WordSpec with Matchers {

  private val ds = DataStream.fromValues(
    StructType("a", "b", "c", "d"),
    Seq(
      List("1", "2", "3", "4"),
      List("5", "6", "7", "8")
    )
  )

  "SequenceSink" should {
    "write sequence files" in {

      implicit val conf = new Configuration
      implicit val fs = FileSystem.get(conf)

      val path = new Path("seqsink.seq")
      if (fs.exists(path))
        fs.delete(path, true)

      ds.to(SequenceSink(path))

      val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path))

      val k = new IntWritable
      val v = new BytesWritable

      val set = for (_ <- 1 to 3) yield {
        reader.next(k, v)
        new String(v.copyBytes)
      }

      set.toSet shouldBe Set(
        "a,b,c,d",
        "1,2,3,4",
        "5,6,7,8"
      )

      reader.close()

      fs.delete(path, true)
    }
  }
} 
Example 13
Source File: SequenceSource.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels._
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging {
  logger.debug(s"Creating sequence source from $path")

  override def schema: StructType = SequenceSupport.schema(path)
  override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path))
}

object SequenceReaderIterator {
  def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] {
    private val k = new IntWritable()
    private val v = new BytesWritable()
    // throw away the header
    reader.next(k, v)
    override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector)
    override def hasNext(): Boolean = reader.next(k, v)
  }
}

class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(SequenceSupport.createReader(path)) { reader =>
        val schema = SequenceSupport.schema(path)
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        SequenceReaderIterator(schema, reader)
          .takeWhile(_ => running.get)
          .grouped(DataStream.DefaultBatchSize)
          .foreach(subscriber.next)

        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
} 
Example 14
Source File: SequenceSupport.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.io.StringReader
import java.nio.charset.Charset

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels.component.csv.{CsvFormat, CsvSupport}
import io.eels.schema.{Field, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

object SequenceSupport extends Logging with Using {

  def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader =
    new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))

  def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8")))

  def toValues(str: String): Array[String] = {
    val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null)
    parser.beginParsing(new StringReader(str))
    val record = parser.parseNext()
    parser.stopParsing()
    record
  }

  def schema(path: Path)(implicit conf: Configuration): StructType = {
    logger.debug(s"Fetching sequence schema for $path")
    using(createReader(path)) { it =>
      val k = new IntWritable()
      val v = new BytesWritable()
      val fields: Array[Field] = {
        it.next(k, v)
        toValues(v).map { it => new Field(it) }
      }
      StructType(fields.toList)
    }
  }
} 
Example 15
Source File: SequenceSink.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.sequence

import java.io.StringWriter

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}
import io.eels.{Row, Sink, SinkWriter}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink {

  override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path)

  class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter {

    val writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(path),
      SequenceFile.Writer.keyClass(classOf[IntWritable]),
      SequenceFile.Writer.valueClass(classOf[BytesWritable])
    )

    val key = new IntWritable(0)

    val headers = valuesToCsv(schema.fieldNames())
    writer.append(key, new BytesWritable(headers.getBytes))

    override def close(): Unit = writer.close()

    override def write(row: Row): Unit = {
      this.synchronized {
        val csv = valuesToCsv(row.values)
        writer.append(key, new BytesWritable(csv.getBytes()))
        key.set(key.get() + 1)
      }
    }

    private def valuesToCsv(values: Seq[Any]): String = {
      val swriter = new StringWriter()
      val csv = new CsvWriter(swriter, new CsvWriterSettings())
      csv.writeRow(values.map {
        case null => null
        case other => other.toString
      }: _*)
      csv.close()
      swriter.toString().trim()
    }
  }
} 
Example 16
Source File: PailDataSource.scala    From utils    with Apache License 2.0
package com.indix.utils.spark.pail

import com.backtype.hadoop.pail._
import com.backtype.support.{Utils => PailUtils}
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

trait PailDataSource {

  implicit class PailBasedReader(sc: SparkContext) {
    def pailFile[R: ClassTag](inputLocation: String): RDD[R] = {
      pailFileWithInfo[R](inputLocation).map(_._2)
    }

    def pailFileWithInfo[R: ClassTag](inputLocation: String) = {
      val pail = new Pail(inputLocation)
      val pailSpec = pail.getSpec
      val inputFormat = pail.getFormat.getInputFormatClass.asSubclass(classOf[InputFormat[PailRecordInfo, BytesWritable]])
      sc.hadoopFile(inputLocation, inputFormat, classOf[PailRecordInfo], classOf[BytesWritable])
        .map {
          case (recordInfo, recordInBytes) =>
            recordInfo -> pailSpec.getStructure.deserialize(recordInBytes.getBytes).asInstanceOf[R]
        }
    }
  }

  implicit class PailBasedWriter[R: ClassTag](rdd: RDD[R]) {
    def saveAsPail(outputLocation: String, pailSpec: PailSpec) = {
      val jobConf = new JobConf(rdd.context.hadoopConfiguration)

      PailUtils.setObject(jobConf, PailOutputFormat.SPEC_ARG, pailSpec)

      rdd.map { record =>
        val pailStruct = pailSpec.getStructure.asInstanceOf[PailStructure[R]]

        val attr = PailUtils.join(pailStruct.getTarget(record), "/")
        val recordInBytes = pailStruct.serialize(record)
        new Text(attr) -> new BytesWritable(recordInBytes)
      }.saveAsHadoopFile(outputLocation, classOf[Text], classOf[BytesWritable], classOf[PailOutputFormat], jobConf)
    }
  }

} 
Example 17
Source File: COCOSeqFileGenerator.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  
  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key
          .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile
          .CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
} 
Example 18
Source File: TFRecordInputFormat.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null


    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
} 
Example 19
Source File: TFRecordOutputFormat.scala    From BigDL    with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable]{
  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
  RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)

    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}