org.apache.hadoop.fs.FSDataOutputStream Scala Examples
The following examples show how to use org.apache.hadoop.fs.FSDataOutputStream.
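All of the snippets below follow the same basic pattern: obtain a Hadoop FileSystem, create an FSDataOutputStream for a Path, write bytes, flush, and close. Here is a minimal, self-contained sketch of that pattern (the path is a placeholder):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}

object FsWriteSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()                 // picks up core-site.xml / hdfs-site.xml if present
    val path = new Path("/tmp/fsdata-example.txt") // placeholder path
    val fs: FileSystem = path.getFileSystem(conf)

    val out: FSDataOutputStream = fs.create(path, true) // overwrite if the file already exists
    try {
      out.write("hello, hdfs\n".getBytes("UTF-8"))
      out.hflush() // flush to the datanodes so concurrent readers can see the data
    } finally {
      out.close()
    }
  }
}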
Example 1
Source File: RecordWriter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{File, FileOutputStream}

import com.google.common.primitives.{Ints, Longs}
import com.intel.analytics.bigdl.utils.Crc32
import netty.Crc32c
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.tensorflow.util.Event

private[bigdl] class RecordWriter(file: Path, fs: FileSystem) {
  val outputStream = if (file.toString.startsWith("hdfs://")) {
    // FSDataOutputStream can't flush data to the local file system in time,
    // so reading summaries would throw an exception.
    fs.create(file, true, 1024)
  } else {
    // Use a FileOutputStream when writing to the local file system.
    new FileOutputStream(new File(file.toString))
  }
  val crc32 = new Crc32c()

  def write(event: Event): Unit = {
    val eventString = event.toByteArray
    val header = Longs.toByteArray(eventString.length.toLong).reverse
    outputStream.write(header)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, header).toInt).reverse)
    outputStream.write(eventString)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, eventString).toInt).reverse)
    if (outputStream.isInstanceOf[FSDataOutputStream]) {
      // Flush data to HDFS.
      outputStream.asInstanceOf[FSDataOutputStream].hflush()
    }
  }

  def close(): Unit = {
    outputStream.close()
  }
}
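A hypothetical caller for the RecordWriter above (not part of the BigDL sources): it assumes a reachable HDFS at a placeholder URL and builds an org.tensorflow.util.Event through the generated protobuf builder. Since RecordWriter is private[bigdl], such code would have to live under the com.intel.analytics.bigdl package.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

object RecordWriterSketch {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    // Placeholder HDFS URL; the "hdfs://" prefix makes RecordWriter take the FSDataOutputStream branch.
    val writer = new RecordWriter(new Path("hdfs://namenode:9000/logs/events.tfevents"), fs)
    // Assumes the TensorFlow protobuf builder API for Event.
    val event = Event.newBuilder().setWallTime(System.currentTimeMillis() / 1000.0).build()
    writer.write(event) // length header, masked header CRC, payload, masked payload CRC
    writer.close()
  }
}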
Example 2
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter}

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class Pathway extends ConfigurableStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse Pathway data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath")
      .displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/pathway")
      .required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Pathway.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val inDf: DataFrame = in.read()
    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]

    val configuration: Configuration = new Configuration()
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- 0 until 3) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/pathwayCache/pathwayCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()
    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence: Boolean = true

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var count = 0
      while (hasAnotherSequence) {
        count += 1
        doc = new JSONObject
        hasAnotherSequence = util.KeggPathway.process(br, doc)
        doc.write(hdfsWriter)
        hdfsWriter.write("\n")
      }
      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary)
    df.schema.printTreeString()
    println(df.count)
    out.write(df)
  }
}
Example 3
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class PDBData extends ConfigurableStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse PDB data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- 0 until 3) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/PDBCache/PDBCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()
    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var doc: JSONObject = null
    var pdb: PDB = null
    var count: Int = 0
    inDf.collect().foreach(row => {
      count += 1
      pathStr = row.get(0).asInstanceOf[String]
      pdb = new PDB(pathStr, fs)
      doc = pdb.getDoc

      doc.write(hdfsWriter)
      hdfsWriter.write("\n")

      doc = null
    })
    hdfsWriter.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    out.write(df)
  }

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath")
      .displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/PDB")
      .required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/PDBData.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}
}
Example 4
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.ParserGff3Data
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class Ensembl extends ConfigurableStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse ensembl data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath")
      .displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/ensembl")
      .required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Ensembl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- 0 until 3) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/ensemblCache/ensemblCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()
    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    val parser: ParserGff3Data = new ParserGff3Data
    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var count: Int = 0

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))

      // Read the GFF3 file line by line; parserGff3 turns each non-empty line into a JSON object.
      var eachStr: String = br.readLine()
      while (eachStr != null) {
        doc = parser.parserGff3(eachStr)
        if (doc.toString.length > 2) {
          count += 1
          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
        eachStr = br.readLine()
      }

      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    out.write(session.read.json(hdfsPathTemporary))
  }
}
Example 5
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License
package daf.filesystem

import java.io.{ Closeable, InputStream }
import java.util.Scanner

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path }
import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec }

import scala.collection.convert.decorateAsScala._
import scala.util.{ Random, Try }

class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val fileSystem = FileSystem.getLocal(new Configuration)

  private val numFiles = 10

  private val baseDir = "test-dir".asHadoop

  private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d"

  private def safely[A <: Closeable, U](f: A => U) = { stream: A =>
    val attempt = Try { f(stream) }
    stream.close()
    attempt
  }

  private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path)

  private def readFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get }
  }

  private def openFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) }
  }

  private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream =>
    Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row =>
      stream.writeUTF { row.mkString("", ",", "\n") }
    }
  } apply fileSystem.create { workingDir / fileName }

  private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match {
    case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings
    case (head, tail)                 => randomSplits(tail, head.mkString +: strings)
  }

  private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) }

  private def createFiles = Try {
    0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse`
  }

  private def prepareData = for {
    _ <- createWorkingDir
    _ <- createFiles
  } yield ()

  private def purgeData = Try { fileSystem.delete(workingDir, true) }

  override def beforeAll() = prepareData.get

  override def afterAll() = purgeData.get

  "MergeStrategies info" when {

    "given compressed format files" must {

      "throw an exception" in {
        an[IllegalArgumentException] must be thrownBy MergeStrategies.find {
          FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip)
        }
      }
    }

    "given data as csv" must {

      "drop one line and merge the rest" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size - numFiles + 1 }
        } apply MergeStrategies.csv.merge { openFiles.get }
      }
    }

    "given data as json" must {

      "just merge the files into one" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size }
        } apply MergeStrategies.json.merge { openFiles.get }
      }
    }
  }
}
Example 6
Source File: HdfsUrl.scala From amadou with Apache License 2.0
package com.mediative.amadou

import org.apache.hadoop.fs.{Path, FSDataOutputStream}
import org.apache.spark.sql.SparkSession

case class HdfsUrl(url: String, dateFormat: Option[String] = None) {
  def path = new Path(url)

  def /(subPath: String): HdfsUrl = copy(url = new Path(path, subPath).toString)

  def /(date: DateInterval): HdfsUrl = {
    val datePath = dateFormat.fold(date.toString)(date.format)
    this./(datePath)
  }

  def exists(spark: SparkSession) = fileSystem(spark).exists(path)

  def open[T](spark: SparkSession)(f: FSDataOutputStream => T): T = {
    val stream = fileSystem(spark).create(path)
    try {
      f(stream)
    } finally {
      stream.close()
    }
  }

  def fileSystem(spark: SparkSession) =
    path.getFileSystem(spark.sparkContext.hadoopConfiguration)

  override def toString = path.toString
}
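A short usage sketch (not from the amadou sources) showing how HdfsUrl composes paths and wraps FSDataOutputStream handling; the URL, date format, and app name are placeholders, and an active SparkSession is assumed.

import org.apache.spark.sql.SparkSession

object HdfsUrlSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hdfs-url-example").getOrCreate()

    val base = HdfsUrl("hdfs://namenode:9000/warehouse", dateFormat = Some("yyyy/MM/dd")) // placeholder URL
    val target = base / "exports" // compose paths with the "/" operator

    if (!target.exists(spark)) {
      target.open(spark) { stream => // `open` creates the file and always closes the stream
        stream.writeUTF("first line\n")
      }
    }
  }
}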
Example 7
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
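A hedged usage sketch for the excerpt above. It assumes, as in Apache Spark, that the write helper lives on the StreamMetadata companion object and that StreamMetadata is a small case class carrying the streaming query id; since the class sits in a Spark-internal package, a real caller would also live inside Spark. The checkpoint path is a placeholder.

import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object StreamMetadataSketch {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    val metadataFile = new Path("/checkpoints/my-query/metadata") // placeholder checkpoint location
    val metadata = StreamMetadata(UUID.randomUUID().toString)     // assumes the case class carries only the query id
    StreamMetadata.write(metadata, metadataFile, hadoopConf)      // serializes the metadata as JSON to the file
  }
}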
Example 8
Source File: StringWriter.scala From etl-light with MIT License
package yamrcraft.etlite.writers

import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import yamrcraft.etlite.utils.FileUtils

class StringWriter(tempFile: String, outputFile: String) extends Writer[String] {

  // lazy initialization
  var writer: Option[FSDataOutputStream] = None

  val tempPath = new Path(tempFile + ".txt")
  val outputPath = new Path(outputFile + ".txt")

  override def write(event: String): Unit = {
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString))
    }

    writer.get.writeUTF(event)
    writer.get.writeChar('\n')
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }

    if (tempFile.startsWith("hdfs")) {
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.create(path)
  }
}
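One possible way to drive the StringWriter above (not part of the etl-light sources): stream events into the temporary file, then commit to move it to its final location. Both paths are placeholders; ".txt" is appended internally.

object StringWriterSketch {
  def main(args: Array[String]): Unit = {
    val writer = new StringWriter(
      tempFile = "/tmp/etl/events-0001",                   // staging location
      outputFile = "hdfs://namenode:9000/etl/events-0001") // final destination

    writer.write("""{"id": 1, "type": "click"}""")
    writer.write("""{"id": 2, "type": "view"}""")
    writer.commit() // closes the stream, backs up any existing output, then moves the temp file into place
  }
}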
Example 9
Source File: StreamMetadata.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
Example 10
Source File: FileBasedWriteAheadLogWriter.scala From iolap with Apache License 2.0
package org.apache.spark.streaming.util

import java.io._
import java.nio.ByteBuffer

import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataOutputStream

  def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized {
    assertOpen()
    data.rewind() // Rewind to ensure all data in the buffer is retrieved
    val lengthToWrite = data.remaining()
    val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite)
    stream.writeInt(lengthToWrite)
    if (data.hasArray) {
      stream.write(data.array())
    } else {
      // If the buffer is not backed by an array, we transfer using temp array
      // Note that despite the extra array copy, this should be faster than byte-by-byte copy
      while (data.hasRemaining) {
        val array = new Array[Byte](data.remaining)
        data.get(array)
        stream.write(array)
      }
    }
    flush()
    nextOffset = stream.getPos()
    segment
  }

  override def close(): Unit = synchronized {
    closed = true
    stream.close()
  }

  private def flush() {
    hadoopFlushMethod.foreach { _.invoke(stream) }
    // Useful for local file system where hflush/sync does not work (HADOOP-7844)
    stream.getWrappedStream.flush()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}
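A tentative usage sketch for the write-ahead-log writer excerpted above (its class declaration is not shown in the excerpt). It assumes the Spark constructor signature FileBasedWriteAheadLogWriter(path: String, hadoopConf: Configuration); the class is internal to Spark Streaming, so a real caller would live under org.apache.spark.streaming. The log path is a placeholder.

import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration

object WalWriterSketch {
  def main(args: Array[String]): Unit = {
    // Assumed constructor signature; placeholder HDFS path.
    val writer = new FileBasedWriteAheadLogWriter("hdfs://namenode:9000/wal/log-0", new Configuration())
    val segment = writer.write(ByteBuffer.wrap("record".getBytes("UTF-8"))) // returns the record's path/offset/length
    writer.close()
    println(segment)
  }
}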
Example 11
Source File: FileBasedWriteAheadLogWriter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import java.io._
import java.nio.ByteBuffer

import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataOutputStream

  def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized {
    assertOpen()
    // rewind resets the buffer's internal position pointer back to the start of the stream (data stream / file)
    data.rewind() // Rewind to ensure all data in the buffer is retrieved
    // remaining returns the number of bytes still available, i.e. the actual length of data to be read
    val lengthToWrite = data.remaining()
    // Once the data has been written to the file, record the file path, offset and length
    // and wrap them in a FileBasedWriteAheadLogSegment to return
    val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite)
    stream.writeInt(lengthToWrite)
    // hasArray tells whether this buffer is backed by an accessible byte array
    if (data.hasArray) {
      stream.write(data.array())
    } else {
      // If the buffer is not backed by an array, we transfer using temp array
      // Note that despite the extra array copy, this should be faster than byte-by-byte copy
      while (data.hasRemaining) {
        val array = new Array[Byte](data.remaining)
        data.get(array)
        stream.write(array)
      }
    }
    flush()
    nextOffset = stream.getPos()
    // Return the segment (path, offset, length) describing where this record was written
    segment
  }

  override def close(): Unit = synchronized {
    closed = true
    stream.close()
  }

  private def flush() {
    hadoopFlushMethod.foreach { _.invoke(stream) }
    // Useful for local file system where hflush/sync does not work (HADOOP-7844)
    stream.getWrappedStream.flush()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}
Example 12
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = metadataFile.getFileSystem(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
Example 13
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter

class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream: FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)

    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)

    verify(mockFileSystem).create(mockPath, true)
  }
}
Example 14
Source File: FileBasedWriteAheadLogWriter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.streaming.util

import java.io._
import java.nio.ByteBuffer

import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataOutputStream

  def write(data: ByteBuffer): FileBasedWriteAheadLogSegment = synchronized {
    assertOpen()
    data.rewind() // Rewind to ensure all data in the buffer is retrieved
    val lengthToWrite = data.remaining()
    val segment = new FileBasedWriteAheadLogSegment(path, nextOffset, lengthToWrite)
    stream.writeInt(lengthToWrite)
    if (data.hasArray) {
      stream.write(data.array())
    } else {
      // If the buffer is not backed by an array, we transfer using temp array
      // Note that despite the extra array copy, this should be faster than byte-by-byte copy
      while (data.hasRemaining) {
        val array = new Array[Byte](data.remaining)
        data.get(array)
        stream.write(array)
      }
    }
    flush()
    nextOffset = stream.getPos()
    segment
  }

  override def close(): Unit = synchronized {
    closed = true
    stream.close()
  }

  private def flush() {
    hadoopFlushMethod.foreach { _.invoke(stream) }
    // Useful for local file system where hflush/sync does not work (HADOOP-7844)
    stream.getWrappedStream.flush()
  }

  private def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}