org.apache.hadoop.io.BytesWritable Scala Examples
The following examples show how to use org.apache.hadoop.io.BytesWritable.
Follow the link above each example to view the original project and source file.
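A detail that several of the examples below rely on: BytesWritable.getBytes returns the (possibly padded) backing buffer, so only the first getLength bytes are valid, while copyBytes returns an exact-length copy. A minimal sketch of the difference:

import java.util.Arrays
import org.apache.hadoop.io.BytesWritable

object BytesWritableBasics {
  def main(args: Array[String]): Unit = {
    val bw = new BytesWritable(Array[Byte](1, 2, 3))

    // getBytes may return a buffer longer than the logical content;
    // always pair it with getLength.
    val valid = Arrays.copyOfRange(bw.getBytes, 0, bw.getLength)

    // copyBytes already returns exactly getLength bytes.
    val exact = bw.copyBytes()

    assert(valid.sameElements(exact))
  }
}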
Example 1
Source File: DefaultSource.scala From spark-tensorflow-connector with Apache License 2.0
package org.trustedanalytics.spark.datasources.tensorflow

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.sql._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.tensorflow.hadoop.io.TFRecordFileOutputFormat
import org.trustedanalytics.spark.datasources.tensorflow.serde.DefaultTfRecordRowEncoder

// The class declaration is missing from this excerpt; it is reconstructed here
// from the file name and the overridden methods (DataSourceRegister,
// CreatableRelationProvider, RelationProvider, SchemaRelationProvider).
class DefaultSource extends DataSourceRegister
  with CreatableRelationProvider
  with RelationProvider
  with SchemaRelationProvider {

  override def shortName(): String = "tensorflow"

  // Writes DataFrame as TensorFlow Records
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    val path = parameters("path")

    // Export DataFrame as TFRecords
    val features = data.rdd.map(row => {
      val example = DefaultTfRecordRowEncoder.encodeTfRecord(row)
      (new BytesWritable(example.toByteArray), NullWritable.get())
    })
    features.saveAsNewAPIHadoopFile[TFRecordFileOutputFormat](path)

    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String],
                              schema: StructType): BaseRelation = {
    TensorflowRelation(parameters, Some(schema))(sqlContext.sparkSession)
  }

  // Reads TensorFlow Records into DataFrame
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): TensorflowRelation = {
    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }
}
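Assuming this DefaultSource is registered under the short name "tensorflow" shown above, a round trip through the data source might look like the following sketch; the path is an example value:

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical usage of the data source defined above.
object TensorflowSourceUsage {
  def roundTrip(spark: SparkSession, df: DataFrame): DataFrame = {
    val path = "/tmp/tf-records"
    df.write.format("tensorflow").save(path)   // CreatableRelationProvider path
    spark.read.format("tensorflow").load(path) // RelationProvider path
  }
}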
Example 2
Source File: ScalaTeraSort.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.micro

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.hadoop.examples.terasort.{TeraInputFormat, TeraOutputFormat}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.BytesWritable
import org.apache.spark._
import org.apache.spark.rdd._

import scala.reflect.ClassTag

object ScalaTeraSort {

  implicit def rddToSampledOrderedRDDFunctions[K: Ordering : ClassTag, V: ClassTag]
      (rdd: RDD[(K, V)]) = new ConfigurableOrderedRDDFunctions[K, V, (K, V)](rdd)

  implicit def ArrayByteOrdering: Ordering[Array[Byte]] = Ordering.fromLessThan {
    case (a, b) => (new BytesWritable(a).compareTo(new BytesWritable(b))) < 0
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        s"Usage: $ScalaTeraSort <INPUT_HDFS> <OUTPUT_HDFS>"
      )
      System.exit(1)
    }
    val sparkConf = new SparkConf().setAppName("ScalaTeraSort")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(sparkConf)
    val io = new IOCommon(sc)

    //val file = io.load[String](args(0), Some("Text"))
    val data = sc.newAPIHadoopFile[Text, Text, TeraInputFormat](args(0)).map {
      case (k, v) => (k.copyBytes, v.copyBytes)
    }

    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val reducer = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    val partitioner = new BaseRangePartitioner(partitions = reducer, rdd = data)
    val ordered_data = new ConfigurableOrderedRDDFunctions[Array[Byte], Array[Byte],
      (Array[Byte], Array[Byte])](data)
    val sorted_data = ordered_data.sortByKeyWithPartitioner(partitioner = partitioner)
      .map { case (k, v) => (new Text(k), new Text(v)) }

    sorted_data.saveAsNewAPIHadoopFile[TeraOutputFormat](args(1))
    //io.save(args(1), sorted_data)

    sc.stop()
  }
}
Example 3
Source File: TensorflowRelation.scala From ecosystem with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.tensorflow.example.{SequenceExample, Example}
import org.tensorflow.hadoop.io.TFRecordFileInputFormat
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder

case class TensorflowRelation(options: Map[String, String],
                              customSchema: Option[StructType] = None)
                             (@transient val session: SparkSession)
  extends BaseRelation with TableScan {

  // Import TFRecords as DataFrame happens here
  lazy val (tfRdd, tfSchema) = {
    val rdd = session.sparkContext.newAPIHadoopFile(options("path"),
      classOf[TFRecordFileInputFormat], classOf[BytesWritable], classOf[NullWritable])

    val recordType = options.getOrElse("recordType", "Example")

    recordType match {
      case "Example" =>
        val exampleRdd = rdd.map { case (bytesWritable, nullWritable) =>
          Example.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd))
        val rowRdd = exampleRdd.map(example =>
          DefaultTfRecordRowDecoder.decodeExample(example, finalSchema))
        (rowRdd, finalSchema)
      case "SequenceExample" =>
        val sequenceExampleRdd = rdd.map { case (bytesWritable, nullWritable) =>
          SequenceExample.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd))
        val rowRdd = sequenceExampleRdd.map(example =>
          DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema))
        (rowRdd, finalSchema)
      case _ =>
        throw new IllegalArgumentException(
          s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample")
    }
  }

  override def sqlContext: SQLContext = session.sqlContext

  override def schema: StructType = tfSchema

  override def buildScan(): RDD[Row] = tfRdd
}
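A relation like this is normally reached through the package's DefaultSource (not shown in this excerpt). Assuming that registration, reading TFRecords might look like this sketch; the path is an example value and "recordType" matches the option handled above:

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical read; "recordType" defaults to "Example" as in the match above.
object TfRecordReadUsage {
  def read(spark: SparkSession): DataFrame = {
    spark.read
      .format("org.tensorflow.spark.datasources.tfrecords")
      .option("recordType", "Example")
      .load("/tmp/tf-records")
  }
}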
Example 4
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
    name: String,
    context: TaskAttemptContext,
    dataSchema: StructType,
    formatOpt: Option[Byte] = None,
    version: Version = Version(),
    offset: Array[Double] = Array(0F, 0F, 0F),
    scale: Array[Double] = Array(0.01F, 0.01F, 0.01F)
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema)) // todo, extra bytes

  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdr to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
}
Example 5
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
    name: String,
    context: TaskAttemptContext,
    dataSchema: StructType,
    element: String,
    littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
Example 6
Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0
package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {
    logger.info(s"Input matrix path: $inputPath")
    // The remainder of this method is cut off in the original listing after a
    // call to getInputDataSizeMB; it is not reproduced here.
    ???
  }

  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap { case (_, v) =>
          instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {
    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }
}
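A possible call site for makeRddFromKryoFile, assuming the SparkConf was prepared with appendBasicRegistratorToSparkConf so that MatrixEntry arrays deserialize correctly; the path is an example value:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD

// Hypothetical usage of the Kryo-backed sequence-file reader above.
object KryoReadUsage {
  def loadEntries(sc: SparkContext): RDD[MatrixEntry] =
    ReadingWritingData.makeRddFromKryoFile[MatrixEntry](sc, "/data/matrix-entries")
}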
Example 7
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter

class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream: FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)
    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)

    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)
    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)
    verify(mockFileSystem).create(mockPath, true)
  }
}
Example 8
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType

// Note: the enclosing class declaration and its fields (options, context,
// byteArrayOutputStream, recordWriter) are omitted from this excerpt; the
// trailing brace below closes that omitted class.

  def write(row: Row): Unit = {
    val labelColumnName = options.getOrElse("labelColumnName", "label")
    val featuresColumnName = options.getOrElse("featuresColumnName", "features")

    val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
    record.writeTo(byteArrayOutputStream)

    recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
    byteArrayOutputStream.reset()
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
Example 9
Source File: ImageInputDStream.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.streaming

import java.io.InputStream
import java.net.Socket

import org.apache.hadoop.io.BytesWritable
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.collection.mutable.ArrayBuffer

class ImageInputDStream(@transient ssc_ : StreamingContext,
                        host: String,
                        port: Int,
                        storageLevel: StorageLevel)
  extends ReceiverInputDStream[BytesWritable](ssc_) with Logging {

  override def getReceiver(): Receiver[BytesWritable] = {
    new ImageRecevier(host, port, storageLevel)
  }
}

class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel)
  extends Receiver[BytesWritable](storageLevel) with Logging {

  override def onStart(): Unit = {
    new Thread("Image Socket") {
      setDaemon(true)

      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  def receive(): Unit = {
    var socket: Socket = null
    var in: InputStream = null
    try {
      log.info("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      log.info("Connected to " + host + ":" + port)
      in = socket.getInputStream
      val buf = new ArrayBuffer[Byte]()
      val bytes = new Array[Byte](1024)
      var len = 0
      while (-1 < len) {
        len = in.read(bytes)
        if (len > 0) {
          // append only the bytes actually read, not the whole 1024-byte buffer
          buf ++= bytes.take(len)
        }
      }
      val bw = new BytesWritable(buf.toArray)
      log.info("received image bytes: " + bw.getLength)
      store(bw)
      log.info("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (in != null) {
        in.close()
      }
      if (socket != null) {
        socket.close()
        log.info("Closed socket to " + host + ":" + port)
      }
    }
  }

  override def onStop(): Unit = {
  }
}
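Wiring the DStream into a streaming job could look like this sketch; the host, port and batch interval are example values:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical driver for the receiver above.
object ImageStreamApp {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("ImageStream"), Seconds(5))
    val images = new ImageInputDStream(ssc, "localhost", 9999, StorageLevel.MEMORY_AND_DISK)
    images.map(_.getLength).print() // just report the size of each received image
    ssc.start()
    ssc.awaitTermination()
  }
}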
Example 10
Source File: ErrorEventsWriter.scala From etl-light with MIT License
package yamrcraft.etlite.writers

import java.io.OutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile, Text}
import org.json4s.jackson.Serialization
import org.json4s.{DefaultFormats, Formats, ShortTypeHints}
import yamrcraft.etlite.utils.FileUtils

case class ErrorInfo(
  errorType: String,
  errorMsg: Option[String]
)

// Note: the ErrorEventsWriter trait that this class extends is not included in
// this excerpt.
class ErrorEventWriter(folder: String, jobId: Long, partitionId: Int)
  extends ErrorEventsWriter {

  // incremental record id
  var recordId = 1

  val fs = FileUtils.getFS(folder)

  val seqPath = new Path(folder, s"errors_job${jobId}_part$partitionId.seq")
  if (fs.exists(seqPath)) {
    fs.delete(seqPath, false)
  }

  val metaPath = new Path(folder, s"errors_job${jobId}_part$partitionId.meta.seq")
  if (fs.exists(metaPath)) {
    fs.delete(metaPath, false)
  }

  private var seqWriter: Option[SequenceFile.Writer] = None
  private var metaWriter: Option[SequenceFile.Writer] = None

  implicit val formats = new Formats {
    val dateFormat = DefaultFormats.lossless.dateFormat
    override val typeHints = ShortTypeHints(List(classOf[ErrorInfo]))
    override val typeHintFieldName = "type"
  }

  override def write(errorEvent: (Array[Byte], ErrorInfo)) = {
    if (seqWriter.isEmpty) {
      seqWriter = createSequenceFile(seqPath, classOf[IntWritable], classOf[BytesWritable])
      metaWriter = createSequenceFile(metaPath, classOf[IntWritable], classOf[Text])
    }

    val id = new IntWritable(recordId)
    seqWriter.get.append(id, new BytesWritable(errorEvent._1))
    metaWriter.get.append(id, new Text(Serialization.write(errorEvent._2)))

    recordId += 1
  }

  override def commit() = {
    seqWriter.foreach(p => p.close())
    metaWriter.foreach(p => p.close())
  }

  private def createSequenceFile(path: Path, keyClass: Class[_], valueClass: Class[_]) = {
    val optPath = SequenceFile.Writer.file(path)
    val optKey = SequenceFile.Writer.keyClass(keyClass)
    val optVal = SequenceFile.Writer.valueClass(valueClass)
    Some(SequenceFile.createWriter(fs.getConf, optPath, optKey, optVal))
  }
}
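Typical use of the writer, with example values for the folder, job id, partition id and error payload:

// Hypothetical usage of ErrorEventWriter.
object ErrorWriterUsage {
  def main(args: Array[String]): Unit = {
    val writer = new ErrorEventWriter("/tmp/etl-errors", jobId = 42L, partitionId = 0)
    writer.write(("bad-record".getBytes("UTF-8"), ErrorInfo("ParseError", Some("unexpected token"))))
    writer.commit()
  }
}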
Example 11
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}

class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
}
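Reading a bag file with the byte-oriented format above might look like this sketch; the paths are placeholders, and the chunk index is passed through the "RosbagInputFormat.chunkIdx" configuration key the code expects:

import org.apache.hadoop.io.{BytesWritable, LongWritable}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Hypothetical usage of RosbagBytesInputFormat.
object RosbagReadUsage {
  def readBytes(sc: SparkContext, bagPath: String, chunkIdx: String): RDD[(LongWritable, BytesWritable)] = {
    sc.hadoopConfiguration.set("RosbagInputFormat.chunkIdx", chunkIdx)
    sc.newAPIHadoopFile(bagPath,
      classOf[RosbagBytesInputFormat], classOf[LongWritable], classOf[BytesWritable])
  }
}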
Example 12
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0
package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}
import org.scalatest.{Matchers, WordSpec}

class SequenceSinkTest extends WordSpec with Matchers {

  private val ds = DataStream.fromValues(
    StructType("a", "b", "c", "d"),
    Seq(
      List("1", "2", "3", "4"),
      List("5", "6", "7", "8")
    )
  )

  "SequenceSink" should {
    "write sequence files" in {

      implicit val conf = new Configuration
      implicit val fs = FileSystem.get(conf)

      val path = new Path("seqsink.seq")
      if (fs.exists(path))
        fs.delete(path, true)

      ds.to(SequenceSink(path))

      val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path))

      val k = new IntWritable
      val v = new BytesWritable

      val set = for (_ <- 1 to 3) yield {
        reader.next(k, v)
        new String(v.copyBytes)
      }

      set.toSet shouldBe Set(
        "a,b,c,d",
        "1,2,3,4",
        "5,6,7,8"
      )

      reader.close()

      fs.delete(path, true)
    }
  }
}
Example 13
Source File: SequenceSource.scala From eel-sdk with Apache License 2.0
package io.eels.component.sequence

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels._
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging {
  logger.debug(s"Creating sequence source from $path")

  override def schema: StructType = SequenceSupport.schema(path)

  override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path))
}

object SequenceReaderIterator {
  def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] {
    private val k = new IntWritable()
    private val v = new BytesWritable()

    // throw away the header
    reader.next(k, v)

    override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector)
    override def hasNext(): Boolean = reader.next(k, v)
  }
}

class SequencePublisher(val path: Path)(implicit conf: Configuration)
  extends Publisher[Seq[Row]] with Logging with Using {

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(SequenceSupport.createReader(path)) { reader =>
        val schema = SequenceSupport.schema(path)
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))

        SequenceReaderIterator(schema, reader)
          .takeWhile(_ => running.get)
          .grouped(DataStream.DefaultBatchSize)
          .foreach(subscriber.next)

        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
}
Example 14
Source File: SequenceSupport.scala From eel-sdk with Apache License 2.0
package io.eels.component.sequence

import java.io.StringReader
import java.nio.charset.Charset

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels.component.csv.{CsvFormat, CsvSupport}
import io.eels.schema.{Field, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

object SequenceSupport extends Logging with Using {

  def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader =
    new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))

  def toValues(v: BytesWritable): Array[String] =
    toValues(new String(v.copyBytes(), Charset.forName("UTF8")))

  def toValues(str: String): Array[String] = {
    val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null)
    parser.beginParsing(new StringReader(str))
    val record = parser.parseNext()
    parser.stopParsing()
    record
  }

  def schema(path: Path)(implicit conf: Configuration): StructType = {
    logger.debug(s"Fetching sequence schema for $path")
    using(createReader(path)) { it =>
      val k = new IntWritable()
      val v = new BytesWritable()
      val fields: Array[Field] = {
        it.next(k, v)
        toValues(v).map { it => new Field(it) }
      }
      StructType(fields.toList)
    }
  }
}
Example 15
Source File: SequenceSink.scala From eel-sdk with Apache License 2.0
package io.eels.component.sequence

import java.io.StringWriter

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}
import io.eels.{Row, Sink, SinkWriter}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink {

  override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path)

  class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter {

    val writer = SequenceFile.createWriter(conf,
      SequenceFile.Writer.file(path),
      SequenceFile.Writer.keyClass(classOf[IntWritable]),
      SequenceFile.Writer.valueClass(classOf[BytesWritable])
    )

    val key = new IntWritable(0)

    val headers = valuesToCsv(schema.fieldNames())
    writer.append(key, new BytesWritable(headers.getBytes))

    override def close(): Unit = writer.close()

    override def write(row: Row): Unit = {
      this.synchronized {
        val csv = valuesToCsv(row.values)
        writer.append(key, new BytesWritable(csv.getBytes()))
        key.set(key.get() + 1)
      }
    }

    private def valuesToCsv(values: Seq[Any]): String = {
      val swriter = new StringWriter()
      val csv = new CsvWriter(swriter, new CsvWriterSettings())
      csv.writeRow(values.map {
        case null => null
        case other => other.toString
      }: _*)
      csv.close()
      swriter.toString().trim()
    }
  }
}
Example 16
Source File: PailDataSource.scala From utils with Apache License 2.0
package com.indix.utils.spark.pail

import com.backtype.hadoop.pail._
import com.backtype.support.{Utils => PailUtils}
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

trait PailDataSource {

  implicit class PailBasedReader(sc: SparkContext) {

    def pailFile[R: ClassTag](inputLocation: String): RDD[R] = {
      pailFileWithInfo[R](inputLocation).map(_._2)
    }

    def pailFileWithInfo[R: ClassTag](inputLocation: String) = {
      val pail = new Pail(inputLocation)
      val pailSpec = pail.getSpec
      val inputFormat = pail.getFormat.getInputFormatClass
        .asSubclass(classOf[InputFormat[PailRecordInfo, BytesWritable]])
      sc.hadoopFile(inputLocation, inputFormat, classOf[PailRecordInfo], classOf[BytesWritable])
        .map { case (recordInfo, recordInBytes) =>
          recordInfo -> pailSpec.getStructure.deserialize(recordInBytes.getBytes).asInstanceOf[R]
        }
    }
  }

  implicit class PailBasedWriter[R: ClassTag](rdd: RDD[R]) {

    def saveAsPail(outputLocation: String, pailSpec: PailSpec) = {
      val jobConf = new JobConf(rdd.context.hadoopConfiguration)
      PailUtils.setObject(jobConf, PailOutputFormat.SPEC_ARG, pailSpec)

      rdd.map { record =>
        val pailStruct = pailSpec.getStructure.asInstanceOf[PailStructure[R]]
        val attr = PailUtils.join(pailStruct.getTarget(record), "/")
        val recordInBytes = pailStruct.serialize(record)
        new Text(attr) -> new BytesWritable(recordInBytes)
      }.saveAsHadoopFile(outputLocation, classOf[Text], classOf[BytesWritable],
        classOf[PailOutputFormat], jobConf)
    }
  }
}
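Mixing the trait into a job brings the pailFile/saveAsPail syntax into scope. In this sketch, MyRecord and the paths are placeholders for whatever the pail structure (de)serializes:

import com.backtype.hadoop.pail.PailSpec
import org.apache.spark.SparkContext

// Hypothetical usage of the PailDataSource trait defined above.
object PailUsage extends PailDataSource {
  def copyPail[MyRecord: scala.reflect.ClassTag](sc: SparkContext, spec: PailSpec): Unit = {
    val records = sc.pailFile[MyRecord]("/data/input-pail")
    records.saveAsPail("/data/output-pail", spec)
  }
}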
Example 17
Source File: COCOSeqFileGenerator.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile),
          Writer.keyClass(key.getClass), Writer.valueClass(value.getClass),
          Writer.compression(SequenceFile.CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat / meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
}
Example 18
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
    RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
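Reading raw TFRecord payloads with this input format could look like the following sketch; the path is an example value:

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Hypothetical read; each key carries one serialized TFRecord.
object TFRecordReadRaw {
  def read(sc: SparkContext): RDD[Array[Byte]] =
    sc.newAPIHadoopFile("/data/train.tfrecord",
        classOf[TFRecordInputFormat], classOf[BytesWritable], classOf[NullWritable])
      .map { case (k, _) => k.copyBytes() }
}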
Example 19
Source File: TFRecordOutputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable] {

  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
    RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)

    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}
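And the matching write path, mirroring the read sketch above; the output directory is an example value:

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD

// Hypothetical write of raw TFRecord payloads through TFRecordOutputFormat.
object TFRecordWriteRaw {
  def write(records: RDD[Array[Byte]]): Unit =
    records.map(bytes => (new BytesWritable(bytes), NullWritable.get()))
      .saveAsNewAPIHadoopFile("/data/tfrecords-out",
        classOf[BytesWritable], classOf[NullWritable], classOf[TFRecordOutputFormat])
}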