org.apache.hadoop.fs.FSDataInputStream Scala Examples

The following examples show how to use org.apache.hadoop.fs.FSDataInputStream. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
Example 1
Source File: OrcDataFileMeta.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataInputStream
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile
import org.apache.orc.Reader
import org.apache.orc.mapred.OrcInputFormat

private[oap] class OrcDataFileMeta(val path: Path, val configuration: Configuration)
    extends DataFileMeta {

  val fs = path.getFileSystem(configuration)
  private val readerOptions = OrcFile.readerOptions(configuration).filesystem(fs)
  private val fileReader = OrcFile.createReader(path, readerOptions)
  val length = fs.getFileStatus(path).getLen
//  val options: Reader.Options = OrcInputFormat.buildOptions(configuration, fileReader, 0, length)
  // Record reader from ORC row batch.
//  val recordReader = fileReader.rows(options)

  def getOrcFileReader(): Reader = fileReader
  val listStripeInformation = fileReader.getStripes()

  def numberOfRows: Long = fileReader.getNumberOfRows()
  override def len: Long = fileReader.getContentLength()
  override def getGroupCount: Int = fileReader.getStripes().size()
  override def getFieldCount: Int = fileReader.getSchema().getFieldNames().size()
  // Not used by orc data file.
  override def fin: FSDataInputStream = null
} 
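
A minimal usage sketch (the path is hypothetical, and since the class is private[oap] the call site has to live inside the org.apache.spark.sql.execution.datasources.oap.io package):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val conf = new Configuration()
val meta = new OrcDataFileMeta(new Path("/tmp/oap/part-00000.orc"), conf) // hypothetical file
println(s"rows=${meta.numberOfRows}, stripes=${meta.getGroupCount}, fields=${meta.getFieldCount}")
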
Example 2
Source File: BedFileIterator.scala    From glow   with Apache License 2.0
package io.projectglow.plink

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.FSDataInputStream


class BedFileIterator(
    stream: LittleEndianDataInputStream,
    underlyingStream: FSDataInputStream,
    numBlocks: Int,
    blockSize: Int)
    extends Iterator[Array[Byte]] {

  var blockIdx = 0
  val byteArray: Array[Byte] = new Array[Byte](blockSize)

  def hasNext(): Boolean = {
    val ret = blockIdx < numBlocks
    if (!ret) {
      cleanup()
    }
    ret
  }

  def next(): Array[Byte] = {
    blockIdx += 1
    stream.readFully(byteArray)
    byteArray
  }

  private def cleanup(): Unit = {
    underlyingStream.close()
  }
} 
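
A sketch of how the iterator might be driven; the magic-byte skip and the block sizing are assumptions about the caller, not something shown in the excerpt:

import com.google.common.io.LittleEndianDataInputStream
import io.projectglow.plink.BedFileIterator
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val path = new Path("hdfs:///data/sample.bed")           // hypothetical file
val underlying = path.getFileSystem(new Configuration()).open(path)
val stream = new LittleEndianDataInputStream(underlying)
stream.skipBytes(3)                                      // assumed: skip the 3-byte PLINK .bed header
val numBlocks = 1000                                     // assumed: one block per variant
val blockSize = 25                                       // assumed: ceil(numSamples / 4) bytes per block
new BedFileIterator(stream, underlying, numBlocks, blockSize)
  .foreach(block => println(block.length))               // note: the same backing array is reused
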
Example 3
Source File: MatrixMetaUtils.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.graph.utils
import java.io.IOException

import com.tencent.angel.conf.AngelConf
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.MatrixContext
import com.tencent.angel.model.output.format.{MatrixFilesMeta, ModelFilesConstent}
import com.tencent.angel.ps.storage.partitioner.Partitioner
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

object MatrixMetaUtils {
  def readMatrixContext(matPath: Path, fs: FileSystem): MatrixContext = {
    var input: FSDataInputStream= null
    var matrixFilesMeta: MatrixFilesMeta = null
    try {
      val metaFilePath = new Path(matPath.getName, ModelFilesConstent.modelMetaFileName)
      if (!fs.exists(metaFilePath)) {
        throw new IOException(s"Can not find meta file for matrix $metaFilePath")
      }
      fs.setVerifyChecksum(false)
      input = fs.open(metaFilePath)
      matrixFilesMeta = new MatrixFilesMeta()
      matrixFilesMeta.read(input)
    } catch {
      case e: IOException =>
        e.printStackTrace()
        throw e
      case e: Exception =>
        e.printStackTrace()
        throw e
      case ae: AssertionError =>
        ae.printStackTrace()
        throw ae
    } finally {
      if (input != null) {
        input.close()
      }
    }

    val mc = new MatrixContext
    mc.setName(matrixFilesMeta.getMatrixName)
    mc.setRowNum(matrixFilesMeta.getRow)
    mc.setColNum(matrixFilesMeta.getCol)
    mc.setMaxRowNumInBlock(matrixFilesMeta.getBlockRow)
    mc.setMaxColNumInBlock(matrixFilesMeta.getBlockCol)
    mc.setRowType(RowType.valueOf(matrixFilesMeta.getRowType))
    mc.getAttributes.putAll(matrixFilesMeta.getOptions)
    if (mc.getAttributes.containsKey(AngelConf.Angel_PS_PARTITION_CLASS)) {
      val partitionClassName = mc.getAttributes.get(AngelConf.Angel_PS_PARTITION_CLASS)
      mc.setPartitionerClass(Class.forName(partitionClassName).asInstanceOf[Class[Partitioner]])
      mc.getAttributes.remove(AngelConf.Angel_PS_PARTITION_CLASS)
    }

    mc
  }
} 
Example 4
Source File: DataBuffer.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.fs.FSDataInputStream


class DataBuffer(dataInput: FSDataInputStream) extends Serializable {

  private var bytes = new Array[Byte](1024)
  private var byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  def readBytes(length: Int) = {
    if (length > bytes.length) {
      bytes = new Array[Byte](length)
      byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    }
    else {
      byteBuffer.clear
    }
    dataInput.readFully(bytes, 0, length)
    byteBuffer
  }

  def seek(position: Long) = {
    dataInput.seek(position)
    this
  }

  def close() {
    dataInput.close()
  }
}

object DataBuffer {
  def apply(dataInput: FSDataInputStream) = {
    new DataBuffer(dataInput)
  }
} 
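
A short sketch of the seek-then-read pattern the buffer is built for (file name and offsets are made up):

import com.esri.gdb.DataBuffer
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val path = new Path("/data/gdb/a00000001.gdbtable")      // hypothetical
val buffer = DataBuffer(path.getFileSystem(new Configuration()).open(path))
val header = buffer.seek(0L).readBytes(8)                // little-endian ByteBuffer over the bytes read
val signature = header.getInt
val n1024Blocks = header.getInt
buffer.close()
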
Example 5
Source File: GDBIndex.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.io.{DataInput, File}
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.Logging

object GDBIndex {
  def apply(path: String, name: String, conf: Configuration = new Configuration()) = {
    val filename = StringBuilder.newBuilder.append(path).append(File.separator).append(name).append(".gdbtablx").toString()
    val hdfsPath = new Path(filename)
    val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath)

    val bytes = new Array[Byte](16)
    dataInput.readFully(bytes)
    val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

    val signature = byteBuffer.getInt
    val n1024Blocks = byteBuffer.getInt
    val numRows = byteBuffer.getInt
    val indexSize = byteBuffer.getInt

    new GDBIndex(dataInput, numRows, indexSize)
  }
}

private[gdb] class GDBIndex(dataInput: FSDataInputStream,
                            val numRows: Int,
                            indexSize: Int
                           ) extends Logging with AutoCloseable with Serializable {

  def readSeekForRowNum(rowNum: Int) = {
    val bytes = new Array[Byte](indexSize)
    dataInput.seek(16 + rowNum * indexSize)
    dataInput.readFully(bytes)
    ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt
  }

  def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = {
    dataInput.seek(16 + startAtRow * indexSize)
    val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead
    // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows")
    new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable)
  }

  def close() {
    dataInput.close()
  }
}

private[gdb] class GDBIndexIterator(dataInput: DataInput,
                                    startID: Int,
                                    maxRows: Int,
                                    indexSize: Int
                                   ) extends Iterator[IndexInfo] with Logging with Serializable {

  private val indexInfo = IndexInfo(0, 0)
  private val bytes = new Array[Byte](indexSize)
  private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  private var objectID = startID
  private var nextRow = 0

  def hasNext() = nextRow < maxRows

  def next() = {
    // log.info(s"next::nextRow=$nextRow maxRows=$maxRows")
    nextRow += 1

    objectID += 1
    indexInfo.objectID = objectID

    byteBuffer.clear
    dataInput.readFully(bytes)
    indexInfo.seek = byteBuffer.getInt

    indexInfo
  }
} 
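
Putting the pieces together, a usage sketch (the directory and table name are hypothetical, and since GDBIndex is private[gdb] this must run from code in the com.esri.gdb package):

import org.apache.hadoop.conf.Configuration

val index = GDBIndex("/data/sample.gdb", "a00000001", new Configuration())
index.iterator(startAtRow = 0, numRowsToRead = 10).foreach { info =>
  println(s"objectID=${info.objectID} seek=${info.seek}")
}
index.close()
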
Example 6
Source File: StreamMetadata.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// The enclosing object is elided in this excerpt; it is restored here so the closing brace
// below matches and so Serialization.write has an implicit Formats in scope. (The
// StreamMetadata case class itself is defined earlier in the same file.)
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = metadataFile.getFileSystem(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
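
The excerpt only shows the write side. For completeness, here is a sketch of a matching read path that uses FSDataInputStream, modeled on the same pattern; the single-field case class is an assumption for illustration, not Spark's actual definition:

import java.io.InputStreamReader
import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

object StreamMetadataReadSketch {
  implicit val format = Serialization.formats(NoTypeHints)
  case class Metadata(id: String)                        // illustrative shape only

  def read(metadataFile: Path, hadoopConf: Configuration): Option[Metadata] = {
    val fs = metadataFile.getFileSystem(hadoopConf)
    if (!fs.exists(metadataFile)) return None
    var input: FSDataInputStream = null
    try {
      input = fs.open(metadataFile)
      val reader = new InputStreamReader(input, StandardCharsets.UTF_8)
      Some(Serialization.read[Metadata](reader))
    } finally {
      IOUtils.closeQuietly(input)
    }
  }
}
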
Example 7
Source File: StreamMetadata.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
Example 8
Source File: StreamMetadata.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
Example 9
Source File: OapBitmapWrappedFiberCacheSuite.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.utils

import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.roaringbitmap.RoaringBitmap

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache}
import org.apache.spark.sql.oap.OapRuntime
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

// Below are used to test the functionality of OapBitmapWrappedFiberCache class.
class OapBitmapWrappedFiberCacheSuite
  extends QueryTest with SharedOapContext {

  private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache =
    OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size)

  test("test the functionality of OapBitmapWrappedFiberCache class") {
    val CHUNK_SIZE = 1 << 16
    val dataForRunChunk = (1 to 9).toSeq
    val dataForArrayChunk = Seq(1, 3, 5, 7, 9)
    val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1)
    val dataCombination =
      dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk
    val dataArray =
      Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination)
    dataArray.foreach(dataIdx => {
      val dir = Utils.createTempDir()
      val rb = new RoaringBitmap()
      dataIdx.foreach(rb.add)
      val rbFile = dir.getAbsolutePath + "rb.bin"
      rb.runOptimize()
      val rbFos = new FileOutputStream(rbFile)
      val rbBos = new ByteArrayOutputStream()
      val rbDos = new DataOutputStream(rbBos)
      rb.serialize(rbDos)
      rbBos.writeTo(rbFos)
      rbBos.close()
      rbDos.close()
      rbFos.close()
      val rbPath = new Path(rbFile.toString)
      val conf = new Configuration()
      val fin = rbPath.getFileSystem(conf).open(rbPath)
      val rbFileSize = rbPath.getFileSystem(conf).getFileStatus(rbPath).getLen
      val rbFiber = BitmapFiberId(
        () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0)
      val rbWfc = new OapBitmapWrappedFiberCache(
        OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber))
      rbWfc.init
      val chunkLength = rbWfc.getTotalChunkLength
      val length = dataIdx.size / CHUNK_SIZE
      assert(chunkLength == (length + 1))
      val chunkKeys = rbWfc.getChunkKeys
      assert(chunkKeys(0).toInt == 0)
      rbWfc.setOffset(0)
      val chunk = rbWfc.getIteratorForChunk(0)
      chunk match {
        case RunChunkIterator(rbWfc) => assert(chunk == RunChunkIterator(rbWfc))
        case ArrayChunkIterator(rbWfc, 0) => assert(chunk == ArrayChunkIterator(rbWfc, 0))
        case BitmapChunkIterator(rbWfc) => assert(chunk == BitmapChunkIterator(rbWfc))
        case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.")
      }
      rbWfc.release
      fin.close
      dir.delete
    })
  }
} 
Example 10
Source File: ParquetDataFileMeta.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.hadoop.util.StringUtils
import org.apache.parquet.hadoop.OapParquetFileReader
import org.apache.parquet.hadoop.metadata.ParquetFooter

private[oap] class ParquetDataFileMeta(val footer: ParquetFooter) extends DataFileMeta {

  require(footer != null, "footer of ParquetDataFileMeta should not be null.")

  override def fin: FSDataInputStream = null

  override def len: Long = 0

  override def getGroupCount: Int = footer.getBlocks.size()

  override def getFieldCount: Int =
    footer.getFileMetaData.getSchema.getColumns.size()
}

private[oap] object ParquetDataFileMeta {
  def apply(conf: Configuration, pathString: String): ParquetDataFileMeta = {
    val path = new Path(StringUtils.unEscapeString(pathString))
    new ParquetDataFileMeta(OapParquetFileReader.readParquetFooter(conf, path))
  }
} 
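
A brief sketch (the path is hypothetical; like the ORC variant above, the class is private[oap], so the caller must sit in the same package):

import org.apache.hadoop.conf.Configuration

val meta = ParquetDataFileMeta(new Configuration(), "/tmp/oap/part-00000.parquet")
println(s"row groups=${meta.getGroupCount}, columns=${meta.getFieldCount}")
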
Example 11
Source File: OapDataReader.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.fs.FSDataInputStream

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OapException, PartitionedFile}
import org.apache.spark.sql.execution.datasources.oap.INDEX_STAT._
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion.DataFileVersion
import org.apache.spark.unsafe.types.UTF8String

abstract class OapDataReader {
  def read(file: PartitionedFile): Iterator[InternalRow]

  // The two following fields have to be defined by certain versions of OapDataReader for use in
  // [[OapMetricsManager]]
  def rowsReadByIndex: Option[Long]
  def indexStat: INDEX_STAT
}

object OapDataReader extends Logging {

  def readVersion(is: FSDataInputStream, fileLen: Long): DataFileVersion = {
    val MAGIC_VERSION_LENGTH = 4
    val metaEnd = fileLen - 4

    // seek to the position of data file meta length
    is.seek(metaEnd)
    val metaLength = is.readInt()
    // read all bytes of data file meta
    val magicBuffer = new Array[Byte](MAGIC_VERSION_LENGTH)
    is.readFully(metaEnd - metaLength, magicBuffer)

    val magic = UTF8String.fromBytes(magicBuffer).toString
    magic match {
      case m if ! m.contains("OAP") => throw new OapException("Not a valid Oap Data File")
      case m if m == "OAP1" => DataFileVersion.OAP_DATAFILE_V1
      case _ => throw new OapException("Not a supported Oap Data File version")
    }
  }

  def getDataFileClassFor(dataReaderClassFromDataSourceMeta: String, reader: OapDataReader): String
    = {
    dataReaderClassFromDataSourceMeta match {
      case c if c == OapFileFormat.PARQUET_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.ORC_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.OAP_DATA_FILE_CLASSNAME =>
        reader match {
          case r: OapDataReaderV1 => OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME
          case _ => throw new OapException(s"Undefined connection for $reader")
        }
      case _ => throw new OapException(
        s"Undefined data reader class name $dataReaderClassFromDataSourceMeta")
    }
  }
} 
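
A sketch of calling readVersion against a file; the path is hypothetical, and the stream is closed by the caller since the method leaves stream ownership outside:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.oap.io.OapDataReader

val path = new Path("/tmp/oap/part-00000.data")          // hypothetical OAP data file
val fs = path.getFileSystem(new Configuration())
val is = fs.open(path)
try {
  val version = OapDataReader.readVersion(is, fs.getFileStatus(path).getLen)
  println(s"data file version: $version")
} finally {
  is.close()
}
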
Example 12
Source File: TFRecordInputFormat.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null


    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
} 
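
One plausible way to wire this input format into Spark (a sketch; BigDL's own loading code may differ):

import com.intel.analytics.bigdl.utils.tf.TFRecordInputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.SparkContext

def loadTFRecords(sc: SparkContext, path: String) =
  sc.newAPIHadoopFile(
    path,
    classOf[TFRecordInputFormat],
    classOf[BytesWritable],
    classOf[NullWritable]
  ).map { case (record, _) => record.copyBytes() }       // copy, since Hadoop reuses the writable
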
Example 13
Source File: ExcelFileSaver.scala    From spark-excel   with Apache License 2.0
package com.crealytics.spark.excel

import java.io.BufferedOutputStream

import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT}
import com.norbitltd.spoiwo.model._
import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.poi.ss.util.CellRangeAddress
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.collection.JavaConverters._

object ExcelFileSaver {
  final val DEFAULT_SHEET_NAME = "Sheet1"
  final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm"
  final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000"
}

class ExcelFileSaver(
  fs: FileSystem,
  location: Path,
  dataFrame: DataFrame,
  saveMode: SaveMode,
  dataLocator: DataLocator,
  header: Boolean = true
) {
  def save(): Unit = {
    def sheet(workbook: XSSFWorkbook) = {
      val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None
      val dataRows = dataFrame
        .toLocalIterator()
        .asScala
        .map(_.toSeq)
      dataLocator.toSheet(headerRow, dataRows, workbook)
    }
    val fileAlreadyExists = fs.exists(location)
    def writeToWorkbook(workbook: XSSFWorkbook): Unit = {
      Workbook(sheet(workbook)).writeToExisting(workbook)
      autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write)
    }
    (fileAlreadyExists, saveMode) match {
      case (false, _) | (_, SaveMode.Overwrite) =>
        if (fileAlreadyExists) {
          fs.delete(location, true)
        }
        writeToWorkbook(new XSSFWorkbook())
      case (true, SaveMode.ErrorIfExists) =>
        sys.error(s"path $location already exists.")
      case (true, SaveMode.Ignore) => ()
      case (true, SaveMode.Append) =>
        val inputStream: FSDataInputStream = fs.open(location)
        val workbook = new XSSFWorkbook(inputStream)
        inputStream.close()
        writeToWorkbook(workbook)
    }
  }

  def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = {
    try {
      fun(closeable)
    } finally {
      closeable.close()
    }
  }
} 
Example 14
Source File: WholeFileReader.scala    From magellan   with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done){
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
} 
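
The record reader alone cannot feed a job; a minimal, non-splittable FileInputFormat along these lines would pair with it (the class name here is hypothetical, and magellan presumably ships its own equivalent wiring):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class WholeFileInputFormat extends FileInputFormat[NullWritable, Text] {
  override protected def isSplitable(context: JobContext, file: Path): Boolean = false
  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
    : RecordReader[NullWritable, Text] = new WholeFileReader()
}
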
Example 15
Source File: StreamMetadata.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets
import java.util.ConcurrentModificationException

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: CancellableFSDataOutputStream = null
    try {
      val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf)
      output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case e: FileAlreadyExistsException =>
        if (output != null) {
          output.cancel()
        }
        throw new ConcurrentModificationException(
          s"Multiple streaming queries are concurrently using $metadataFile", e)
      case e: Throwable =>
        if (output != null) {
          output.cancel()
        }
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    }
  }
} 
Example 16
Source File: MergeStrategySpec.scala    From daf   with BSD 3-Clause "New" or "Revised" License
package daf.filesystem

import java.io.{ Closeable, InputStream }
import java.util.Scanner

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path }
import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec }

import scala.collection.convert.decorateAsScala._
import scala.util.{ Random, Try }

class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val fileSystem = FileSystem.getLocal(new Configuration)

  private val numFiles = 10

  private val baseDir = "test-dir".asHadoop

  private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d"

  private def safely[A <: Closeable, U](f: A => U) = { stream: A =>
    val attempt = Try { f(stream) }
    stream.close()
    attempt
  }

  private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path)

  private def readFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get }
  }

  private def openFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) }
  }

  private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream =>
    Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row =>
      stream.writeUTF { row.mkString("", ",", "\n") }
    }
  } apply fileSystem.create { workingDir / fileName }

  private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match {
    case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings
    case (head, tail)                 => randomSplits(tail, head.mkString +: strings)
  }

  private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) }

  private def createFiles = Try {
    0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse`
  }

  private def prepareData = for {
    _ <- createWorkingDir
    _ <- createFiles
  } yield ()

  private def purgeData = Try { fileSystem.delete(workingDir, true) }

  override def beforeAll() = prepareData.get

  override def afterAll() = purgeData.get

  "MergeStrategies info" when {

    "given compressed format files" must {

      "throw an exception" in {
        an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) }
      }
    }

    "given data as csv" must {

      "drop one line and merge the rest" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size - numFiles + 1 }
        } apply MergeStrategies.csv.merge { openFiles.get }
      }
    }

    "given data as json" must {

      "just merge the files into one" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size }
        } apply MergeStrategies.json.merge { openFiles.get }
      }

    }
  }
} 
Example 17
Source File: Ensembl.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.ParserGff3Data
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class Ensembl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse ensembl data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath:String = _
  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/ensembl").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Ensembl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json"

    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    val parser: ParserGff3Data = new ParserGff3Data
    var fdis: FSDataInputStream =null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var count:Int = 0
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]

      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var eachStr:String=null

      // Note: the assignment expression returns Unit, so the first comparison is always true;
      // the loop actually terminates on the `eachStr != null` check once readLine returns null.
      while ((eachStr = br.readLine()) != null && eachStr != null) {
        doc = parser.parserGff3(eachStr)

        if(doc.toString.length > 2){
          count += 1
          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
      }

      br.close()
      fdis.close()
    })

    hdfsWriter.close()

    out.write(session.read.json(hdfsPathTemporary))
  }
} 
Example 18
Source File: PDBData.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class PDBData extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse PDB data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath:String = _
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json"

    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var doc: JSONObject = null
    var pdb: PDB = null
    var count:Int=0
    inDf.collect().foreach(row => {
      count += 1
      pathStr = row.get(0).asInstanceOf[String]

      pdb = new PDB(pathStr,fs)
      doc = pdb.getDoc

      doc.write(hdfsWriter)
      hdfsWriter.write("\n")

      doc = null
    })
    hdfsWriter.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    out.write(df)
}

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/PDB").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/PDBData.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

} 
Example 19
Source File: Pathway.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter}

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject


class Pathway extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse Pathway data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)


  var cachePath:String = _
  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/pathway").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Pathway.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val inDf: DataFrame = in.read()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]

    val configuration: Configuration = new Configuration()
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)


    val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()
    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence:Boolean = true

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]

      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var count = 0
      hasAnotherSequence = true // reset per file; otherwise only the first file would ever be parsed
      while (hasAnotherSequence) {
          count += 1
          doc = new JSONObject
          hasAnotherSequence = util.KeggPathway.process(br, doc)

          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary)
    df.schema.printTreeString()
    println(df.count)

    out.write(df)

  }
} 
Example 20
Source File: PostUrl.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.http

import java.io.{BufferedReader, InputStreamReader}
import java.net.URI

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.commons.httpclient.HttpClient
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.sql.SparkSession


class PostUrl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override val description: String = "Send a post request to the specified http"

  var url : String= _
  var jsonPath : String = _


  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    //read  json from hdfs
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(jsonPath),conf)
    val stream: FSDataInputStream = fs.open(new Path(jsonPath))
    val bufferReader = new BufferedReader(new InputStreamReader(stream))
    var lineTxt = bufferReader.readLine()
    val buffer = new StringBuffer()
    while (lineTxt != null ){
      buffer.append(lineTxt.mkString)
      lineTxt=bufferReader.readLine()
    }

    // post
    val client = HttpClients.createDefault()
    val httpClient = new HttpClient()
    httpClient.getParams().setContentCharset("utf-8")

    val post = new HttpPost(url)
    post.addHeader("content-Type","application/json")
    post.setEntity(new StringEntity(buffer.toString))
    val response = client.execute(post)
    val entity = response.getEntity
    val str = EntityUtils.toString(entity,"UTF-8")
    println("Code is " + str)

  }


  override def setProperties(map: Map[String, Any]): Unit = {
    url = MapUtil.get(map,key="url").asInstanceOf[String]
    jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val url = new PropertyDescriptor()
      .name("url")
      .displayName("Url")
      .defaultValue("")
      .description("http request address")
      .required(true)
      .example("http://master:8002/flow/start")

    val jsonPath = new PropertyDescriptor()
      .name("jsonPath")
      .displayName("JsonPath")
      .defaultValue("")
      .description("json parameter path for post request")
      .required(true)
        .example("hdfs://master:9000/work/flow.json")

    descriptor = url :: descriptor
    descriptor = jsonPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/http/PostUrl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.HttpGroup.toString)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

} 
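
The manual readLine loop above can be written more compactly; a sketch of an equivalent helper built on commons-io (the helper name is illustrative):

import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

def readHdfsTextFile(pathStr: String): String = {
  val path = new Path(pathStr)
  val fs = FileSystem.get(path.toUri, new Configuration())
  var in: FSDataInputStream = null
  try {
    in = fs.open(path)
    IOUtils.toString(in, StandardCharsets.UTF_8)
  } finally {
    IOUtils.closeQuietly(in)
  }
}
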
Example 21
Source File: HdfsStreamAccessor.scala    From ArchiveSpark   with MIT License
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
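
A usage sketch; the HdfsLocationInfo constructor order (path, offset, length) and its package are assumptions based on how the fields are used above:

import org.apache.commons.io.IOUtils
import org.archive.archivespark.dataspecs.access.{HdfsLocationInfo, HdfsStreamAccessor}

val accessor = new HdfsStreamAccessor(HdfsLocationInfo("/data/example.warc.gz", 1024L, 4096L))
accessor.get.foreach { stream =>
  try {
    val bytes = IOUtils.toByteArray(stream)  // bounded to `length` bytes by the BoundedInputStream
    println(s"read ${bytes.length} bytes")
  } finally {
    stream.close()
  }
}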