org.apache.hadoop.fs.FSDataInputStream Scala Examples
The following examples show how to use org.apache.hadoop.fs.FSDataInputStream.
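Before the examples, here is a minimal, self-contained sketch of the FSDataInputStream operations most of the examples below rely on: opening a file through its FileSystem, seeking, reading fully into a buffer, and positioned reads. The HDFS URI and file name are hypothetical placeholders, not taken from any example.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}

object FSDataInputStreamSketch {
  def main(args: Array[String]): Unit = {
    val path = new Path("hdfs://namenode:9000/tmp/example.bin") // hypothetical file and namenode address
    val conf = new Configuration()
    val fs = path.getFileSystem(conf)
    val fileLen = fs.getFileStatus(path).getLen

    val in: FSDataInputStream = fs.open(path)
    try {
      // Sequential access: seek to an absolute offset, then read exactly header.length bytes.
      val header = new Array[Byte](math.min(16L, fileLen).toInt)
      in.seek(0L)
      in.readFully(header)

      // Positioned read: read from an absolute offset without moving the stream's current position.
      val tail = new Array[Byte](math.min(16L, fileLen).toInt)
      in.readFully(math.max(0L, fileLen - tail.length), tail)

      println(s"position after the sequential read: ${in.getPos}")
    } finally {
      in.close()
    }
  }
}

Note that seek plus readFully advances the stream position, while the positioned readFully(offset, buffer) variant leaves it unchanged; both patterns appear in the examples that follow.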
Example 1
Source File: OrcDataFileMeta.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataInputStream
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile
import org.apache.orc.Reader
import org.apache.orc.mapred.OrcInputFormat

private[oap] class OrcDataFileMeta(val path: Path, val configuration: Configuration)
    extends DataFileMeta {
  val fs = path.getFileSystem(configuration)
  private val readerOptions = OrcFile.readerOptions(configuration).filesystem(fs)
  private val fileReader = OrcFile.createReader(path, readerOptions)
  val length = fs.getFileStatus(path).getLen
  // val options: Reader.Options = OrcInputFormat.buildOptions(configuration, fileReader, 0, length)
  // Record reader from ORC row batch.
  // val recordReader = fileReader.rows(options)

  def getOrcFileReader(): Reader = fileReader

  val listStripeInformation = fileReader.getStripes()

  def numberOfRows: Long = fileReader.getNumberOfRows()

  override def len: Long = fileReader.getContentLength()

  override def getGroupCount: Int = fileReader.getStripes().size()

  override def getFieldCount: Int = fileReader.getSchema().getFieldNames().size()

  // Not used by orc data file.
  override def fin: FSDataInputStream = null
}
Example 2
Source File: BedFileIterator.scala From glow with Apache License 2.0
package io.projectglow.plink

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.FSDataInputStream

class BedFileIterator(
    stream: LittleEndianDataInputStream,
    underlyingStream: FSDataInputStream,
    numBlocks: Int,
    blockSize: Int)
    extends Iterator[Array[Byte]] {

  var blockIdx = 0
  val byteArray: Array[Byte] = new Array[Byte](blockSize)

  def hasNext(): Boolean = {
    val ret = blockIdx < numBlocks
    if (!ret) {
      cleanup()
    }
    ret
  }

  def next(): Array[Byte] = {
    blockIdx += 1
    stream.readFully(byteArray)
    byteArray
  }

  private def cleanup(): Unit = {
    underlyingStream.close()
  }
}
Example 3
Source File: MatrixMetaUtils.scala From sona with Apache License 2.0
package com.tencent.angel.sona.graph.utils

import java.io.IOException

import com.tencent.angel.conf.AngelConf
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.MatrixContext
import com.tencent.angel.model.output.format.{MatrixFilesMeta, ModelFilesConstent}
import com.tencent.angel.ps.storage.partitioner.Partitioner
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

object MatrixMetaUtils {
  def readMatrixContext(matPath: Path, fs: FileSystem): MatrixContext = {
    var input: FSDataInputStream = null
    var matrixFilesMeta: MatrixFilesMeta = null
    try {
      val metaFilePath = new Path(matPath.getName, ModelFilesConstent.modelMetaFileName)
      if (!fs.exists(metaFilePath)) {
        throw new IOException(s"Can not find meta file for matrix $metaFilePath")
      }
      fs.setVerifyChecksum(false)
      input = fs.open(metaFilePath)
      matrixFilesMeta = new MatrixFilesMeta()
      matrixFilesMeta.read(input)
    } catch {
      case e: IOException =>
        e.printStackTrace()
        throw e
      case e: Exception =>
        e.printStackTrace()
        throw e
      case ae: AssertionError =>
        ae.printStackTrace()
        throw ae
    } finally {
      if (input != null) {
        input.close()
      }
    }

    val mc = new MatrixContext
    mc.setName(matrixFilesMeta.getMatrixName)
    mc.setRowNum(matrixFilesMeta.getRow)
    mc.setColNum(matrixFilesMeta.getCol)
    mc.setMaxRowNumInBlock(matrixFilesMeta.getBlockRow)
    mc.setMaxColNumInBlock(matrixFilesMeta.getBlockCol)
    mc.setRowType(RowType.valueOf(matrixFilesMeta.getRowType))
    mc.getAttributes.putAll(matrixFilesMeta.getOptions)
    if (mc.getAttributes.containsKey(AngelConf.Angel_PS_PARTITION_CLASS)) {
      val partitionClassName = mc.getAttributes.get(AngelConf.Angel_PS_PARTITION_CLASS)
      mc.setPartitionerClass(Class.forName(partitionClassName).asInstanceOf[Class[Partitioner]])
      mc.getAttributes.remove(AngelConf.Angel_PS_PARTITION_CLASS)
    }

    mc
  }
}
Example 4
Source File: DataBuffer.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.fs.FSDataInputStream

class DataBuffer(dataInput: FSDataInputStream) extends Serializable {

  private var bytes = new Array[Byte](1024)
  private var byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  def readBytes(length: Int) = {
    if (length > bytes.length) {
      bytes = new Array[Byte](length)
      byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    } else {
      byteBuffer.clear
    }
    dataInput.readFully(bytes, 0, length)
    byteBuffer
  }

  def seek(position: Long) = {
    dataInput.seek(position)
    this
  }

  def close() {
    dataInput.close()
  }
}

object DataBuffer {
  def apply(dataInput: FSDataInputStream) = {
    new DataBuffer(dataInput)
  }
}
Example 5
Source File: GDBIndex.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.io.{DataInput, File}
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.Logging

object GDBIndex {
  def apply(path: String, name: String, conf: Configuration = new Configuration()) = {
    val filename = StringBuilder.newBuilder
      .append(path).append(File.separator).append(name).append(".gdbtablx")
      .toString()
    val hdfsPath = new Path(filename)
    val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath)

    val bytes = new Array[Byte](16)
    dataInput.readFully(bytes)
    val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

    val signature = byteBuffer.getInt
    val n1024Blocks = byteBuffer.getInt
    val numRows = byteBuffer.getInt
    val indexSize = byteBuffer.getInt

    new GDBIndex(dataInput, numRows, indexSize)
  }
}

private[gdb] class GDBIndex(dataInput: FSDataInputStream,
                            val numRows: Int,
                            indexSize: Int
                           ) extends Logging with AutoCloseable with Serializable {

  def readSeekForRowNum(rowNum: Int) = {
    val bytes = new Array[Byte](indexSize)
    dataInput.seek(16 + rowNum * indexSize)
    dataInput.readFully(bytes)
    ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt
  }

  def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = {
    dataInput.seek(16 + startAtRow * indexSize)
    val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead
    // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows")
    new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable)
  }

  def close() {
    dataInput.close()
  }
}

private[gdb] class GDBIndexIterator(dataInput: DataInput,
                                    startID: Int,
                                    maxRows: Int,
                                    indexSize: Int
                                   ) extends Iterator[IndexInfo] with Logging with Serializable {

  private val indexInfo = IndexInfo(0, 0)
  private val bytes = new Array[Byte](indexSize)
  private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  private var objectID = startID
  private var nextRow = 0

  def hasNext() = nextRow < maxRows

  def next() = {
    // log.info(s"next::nextRow=$nextRow maxRows=$maxRows")
    nextRow += 1

    objectID += 1
    indexInfo.objectID = objectID

    byteBuffer.clear
    dataInput.readFully(bytes)
    indexInfo.seek = byteBuffer.getInt

    indexInfo
  }
}
Example 6
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = metadataFile.getFileSystem(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
Example 7
Source File: StreamMetadata.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
Example 8
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
}
Example 9
Source File: OapBitmapWrappedFiberCacheSuite.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.utils

import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.roaringbitmap.RoaringBitmap

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache}
import org.apache.spark.sql.oap.OapRuntime
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

// Below are used to test the functionality of OapBitmapWrappedFiberCache class.
class OapBitmapWrappedFiberCacheSuite extends QueryTest with SharedOapContext {

  private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache =
    OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size)

  test("test the functionality of OapBitmapWrappedFiberCache class") {
    val CHUNK_SIZE = 1 << 16
    val dataForRunChunk = (1 to 9).toSeq
    val dataForArrayChunk = Seq(1, 3, 5, 7, 9)
    val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1)
    val dataCombination = dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk
    val dataArray = Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination)

    dataArray.foreach(dataIdx => {
      val dir = Utils.createTempDir()
      val rb = new RoaringBitmap()
      dataIdx.foreach(rb.add)
      val rbFile = dir.getAbsolutePath + "rb.bin"
      rb.runOptimize()
      val rbFos = new FileOutputStream(rbFile)
      val rbBos = new ByteArrayOutputStream()
      val rbDos = new DataOutputStream(rbBos)
      rb.serialize(rbDos)
      rbBos.writeTo(rbFos)
      rbBos.close()
      rbDos.close()
      rbFos.close()

      val rbPath = new Path(rbFile.toString)
      val conf = new Configuration()
      val fin = rbPath.getFileSystem(conf).open(rbPath)
      val rbFileSize = rbPath.getFileSystem(conf).getFileStatus(rbPath).getLen
      val rbFiber = BitmapFiberId(
        () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0)
      val rbWfc = new OapBitmapWrappedFiberCache(
        OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber))
      rbWfc.init
      val chunkLength = rbWfc.getTotalChunkLength
      val length = dataIdx.size / CHUNK_SIZE
      assert(chunkLength == (length + 1))
      val chunkKeys = rbWfc.getChunkKeys
      assert(chunkKeys(0).toInt == 0)
      rbWfc.setOffset(0)
      val chunk = rbWfc.getIteratorForChunk(0)
      chunk match {
        case RunChunkIterator(rbWfc) => assert(chunk == RunChunkIterator(rbWfc))
        case ArrayChunkIterator(rbWfc, 0) => assert(chunk == ArrayChunkIterator(rbWfc, 0))
        case BitmapChunkIterator(rbWfc) => assert(chunk == BitmapChunkIterator(rbWfc))
        case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.")
      }
      rbWfc.release
      fin.close
      dir.delete
    })
  }
}
Example 10
Source File: ParquetDataFileMeta.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.hadoop.util.StringUtils
import org.apache.parquet.hadoop.OapParquetFileReader
import org.apache.parquet.hadoop.metadata.ParquetFooter

private[oap] class ParquetDataFileMeta(val footer: ParquetFooter) extends DataFileMeta {
  require(footer != null, "footer of ParquetDataFileMeta should not be null.")

  override def fin: FSDataInputStream = null

  override def len: Long = 0

  override def getGroupCount: Int = footer.getBlocks.size()

  override def getFieldCount: Int = footer.getFileMetaData.getSchema.getColumns.size()
}

private[oap] object ParquetDataFileMeta {
  def apply(conf: Configuration, pathString: String): ParquetDataFileMeta = {
    val path = new Path(StringUtils.unEscapeString(pathString))
    new ParquetDataFileMeta(OapParquetFileReader.readParquetFooter(conf, path))
  }
}
Example 11
Source File: OapDataReader.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.fs.FSDataInputStream

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OapException, PartitionedFile}
import org.apache.spark.sql.execution.datasources.oap.INDEX_STAT._
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion.DataFileVersion
import org.apache.spark.unsafe.types.UTF8String

abstract class OapDataReader {
  def read(file: PartitionedFile): Iterator[InternalRow]

  // The two following fields have to be defined by certain versions of OapDataReader for use in
  // [[OapMetricsManager]]
  def rowsReadByIndex: Option[Long]
  def indexStat: INDEX_STAT
}

object OapDataReader extends Logging {

  def readVersion(is: FSDataInputStream, fileLen: Long): DataFileVersion = {
    val MAGIC_VERSION_LENGTH = 4
    val metaEnd = fileLen - 4

    // seek to the position of data file meta length
    is.seek(metaEnd)
    val metaLength = is.readInt()

    // read all bytes of data file meta
    val magicBuffer = new Array[Byte](MAGIC_VERSION_LENGTH)
    is.readFully(metaEnd - metaLength, magicBuffer)

    val magic = UTF8String.fromBytes(magicBuffer).toString
    magic match {
      case m if ! m.contains("OAP") => throw new OapException("Not a valid Oap Data File")
      case m if m == "OAP1" => DataFileVersion.OAP_DATAFILE_V1
      case _ => throw new OapException("Not a supported Oap Data File version")
    }
  }

  def getDataFileClassFor(dataReaderClassFromDataSourceMeta: String, reader: OapDataReader): String = {
    dataReaderClassFromDataSourceMeta match {
      case c if c == OapFileFormat.PARQUET_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.ORC_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.OAP_DATA_FILE_CLASSNAME => reader match {
        case r: OapDataReaderV1 => OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME
        case _ => throw new OapException(s"Undefined connection for $reader")
      }
      case _ => throw new OapException(
        s"Undefined data reader class name $dataReaderClassFromDataSourceMeta")
    }
  }
}
Example 12
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
Example 13
Source File: ExcelFileSaver.scala From spark-excel with Apache License 2.0
package com.crealytics.spark.excel

import java.io.BufferedOutputStream

import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT}
import com.norbitltd.spoiwo.model._
import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.poi.ss.util.CellRangeAddress
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.collection.JavaConverters._

object ExcelFileSaver {
  final val DEFAULT_SHEET_NAME = "Sheet1"
  final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm"
  final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000"
}

class ExcelFileSaver(
  fs: FileSystem,
  location: Path,
  dataFrame: DataFrame,
  saveMode: SaveMode,
  dataLocator: DataLocator,
  header: Boolean = true
) {
  def save(): Unit = {
    def sheet(workbook: XSSFWorkbook) = {
      val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None
      val dataRows = dataFrame
        .toLocalIterator()
        .asScala
        .map(_.toSeq)
      dataLocator.toSheet(headerRow, dataRows, workbook)
    }
    val fileAlreadyExists = fs.exists(location)
    def writeToWorkbook(workbook: XSSFWorkbook): Unit = {
      Workbook(sheet(workbook)).writeToExisting(workbook)
      autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write)
    }
    (fileAlreadyExists, saveMode) match {
      case (false, _) | (_, SaveMode.Overwrite) =>
        if (fileAlreadyExists) {
          fs.delete(location, true)
        }
        writeToWorkbook(new XSSFWorkbook())
      case (true, SaveMode.ErrorIfExists) =>
        sys.error(s"path $location already exists.")
      case (true, SaveMode.Ignore) => ()
      case (true, SaveMode.Append) =>
        val inputStream: FSDataInputStream = fs.open(location)
        val workbook = new XSSFWorkbook(inputStream)
        inputStream.close()
        writeToWorkbook(workbook)
    }
  }

  def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = {
    try {
      fun(closeable)
    } finally {
      closeable.close()
    }
  }
}
Example 14
Source File: WholeFileReader.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done) {
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
      taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
}
Example 15
Source File: StreamMetadata.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets
import java.util.ConcurrentModificationException

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream
import org.apache.spark.sql.streaming.StreamingQuery

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: CancellableFSDataOutputStream = null
    try {
      val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf)
      output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case e: FileAlreadyExistsException =>
        if (output != null) {
          output.cancel()
        }
        throw new ConcurrentModificationException(
          s"Multiple streaming queries are concurrently using $metadataFile", e)
      case e: Throwable =>
        if (output != null) {
          output.cancel()
        }
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    }
  }
}
Example 16
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License
package daf.filesystem

import java.io.{ Closeable, InputStream }
import java.util.Scanner

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path }
import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec }

import scala.collection.convert.decorateAsScala._
import scala.util.{ Random, Try }

class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val fileSystem = FileSystem.getLocal(new Configuration)

  private val numFiles = 10

  private val baseDir = "test-dir".asHadoop

  private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d"

  private def safely[A <: Closeable, U](f: A => U) = { stream: A =>
    val attempt = Try { f(stream) }
    stream.close()
    attempt
  }

  private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path)

  private def readFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get }
  }

  private def openFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) }
  }

  private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream =>
    Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row =>
      stream.writeUTF { row.mkString("", ",", "\n") }
    }
  } apply fileSystem.create { workingDir / fileName }

  private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match {
    case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings
    case (head, tail) => randomSplits(tail, head.mkString +: strings)
  }

  private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) }

  private def createFiles = Try {
    0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse`
  }

  private def prepareData = for {
    _ <- createWorkingDir
    _ <- createFiles
  } yield ()

  private def purgeData = Try { fileSystem.delete(workingDir, true) }

  override def beforeAll() = prepareData.get

  override def afterAll() = purgeData.get

  "MergeStrategies info" when {

    "given compressed format files" must {

      "throw an exception" in {
        an[IllegalArgumentException] must be thrownBy MergeStrategies.find {
          FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip)
        }
      }
    }

    "given data as csv" must {

      "drop one line and merge the rest" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged <- attempt
            expected <- readFiles
          } merged.size should be { expected.size - numFiles + 1 }
        } apply MergeStrategies.csv.merge { openFiles.get }
      }
    }

    "given data as json" must {

      "just merge the files into one" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged <- attempt
            expected <- readFiles
          } merged.size should be { expected.size }
        } apply MergeStrategies.json.merge { openFiles.get }
      }
    }
  }
}
Example 17
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.ParserGff3Data
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class Ensembl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse ensembl data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath").displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/ensembl").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Ensembl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- (0 until 3)) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/ensemblCache/ensemblCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    val parser: ParserGff3Data = new ParserGff3Data
    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var count: Int = 0
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]

      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var eachStr: String = null

      while ((eachStr = br.readLine()) != null && eachStr != null) {
        doc = parser.parserGff3(eachStr)
        if (doc.toString.length > 2) {
          count += 1
          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
      }

      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    out.write(session.read.json(hdfsPathTemporary))
  }
}
Example 18
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class PDBData extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse PDB data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- (0 until 3)) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/PDBCache/PDBCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var doc: JSONObject = null
    var pdb: PDB = null
    var count: Int = 0
    inDf.collect().foreach(row => {
      count += 1
      pathStr = row.get(0).asInstanceOf[String]

      pdb = new PDB(pathStr, fs)
      doc = pdb.getDoc

      doc.write(hdfsWriter)
      hdfsWriter.write("\n")

      doc = null
    })
    hdfsWriter.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    out.write(df)
  }

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath").displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/PDB").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/PDBData.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
Example 19
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter}

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class Pathway extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse Pathway data"
  override val inportList: List[String] = List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath: String = _

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath = MapUtil.get(map, key = "cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor()
      .name("cachePath").displayName("cachePath")
      .description("Temporary Cache File Path")
      .defaultValue("/pathway").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Pathway.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val inDf: DataFrame = in.read()

    var pathStr: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val configuration: Configuration = new Configuration()
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl: String = ""
    for (x <- (0 until 3)) {
      hdfsUrl += (pathARR(x) + "/")
    }
    configuration.set("fs.defaultFS", hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl + cachePath + "/pathwayCache/pathwayCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence: Boolean = true
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var count = 0
      while (hasAnotherSequence) {
        count += 1
        doc = new JSONObject
        hasAnotherSequence = util.KeggPathway.process(br, doc)

        doc.write(hdfsWriter)
        hdfsWriter.write("\n")
      }
      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary)
    df.schema.printTreeString()
    println(df.count)

    out.write(df)
  }
}
Example 20
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.http

import java.io.{BufferedReader, InputStreamReader}
import java.net.URI

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.commons.httpclient.HttpClient
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.sql.SparkSession

class PostUrl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override val description: String = "Send a post request to the specified http"

  var url: String = _
  var jsonPath: String = _

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // read json from hdfs
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(jsonPath), conf)
    val stream: FSDataInputStream = fs.open(new Path(jsonPath))
    val bufferReader = new BufferedReader(new InputStreamReader(stream))
    var lineTxt = bufferReader.readLine()
    val buffer = new StringBuffer()
    while (lineTxt != null) {
      buffer.append(lineTxt.mkString)
      lineTxt = bufferReader.readLine()
    }

    // post
    val client = HttpClients.createDefault()
    val httpClient = new HttpClient()
    httpClient.getParams().setContentCharset("utf-8")

    val post = new HttpPost(url)
    post.addHeader("content-Type", "application/json")
    post.setEntity(new StringEntity(buffer.toString))
    val response = client.execute(post)
    val entity = response.getEntity
    val str = EntityUtils.toString(entity, "UTF-8")
    println("Code is " + str)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
    url = MapUtil.get(map, key = "url").asInstanceOf[String]
    jsonPath = MapUtil.get(map, key = "jsonPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()

    val url = new PropertyDescriptor()
      .name("url")
      .displayName("Url")
      .defaultValue("")
      .description("http request address")
      .required(true)
      .example("http://master:8002/flow/start")

    val jsonPath = new PropertyDescriptor()
      .name("jsonPath")
      .displayName("JsonPath")
      .defaultValue("")
      .description("json parameter path for post request")
      .required(true)
      .example("hdfs://master:9000/work/flow.json")

    descriptor = url :: descriptor
    descriptor = jsonPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/http/PostUrl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.HttpGroup.toString)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
Example 21
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}