org.apache.hadoop.fs.FSDataInputStream Scala Examples

The following examples show how to use org.apache.hadoop.fs.FSDataInputStream. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
Example 1
Source File: OrcDataFileMeta.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FSDataInputStream
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile
import org.apache.orc.Reader
import org.apache.orc.mapred.OrcInputFormat

private[oap] class OrcDataFileMeta(val path: Path, val configuration: Configuration)
    extends DataFileMeta {

  val fs = path.getFileSystem(configuration)
  private val readerOptions = OrcFile.readerOptions(configuration).filesystem(fs)
  private val fileReader = OrcFile.createReader(path, readerOptions)
  val length = fs.getFileStatus(path).getLen
//  val options: Reader.Options = OrcInputFormat.buildOptions(configuration, fileReader, 0, length)
  // Record reader from ORC row batch.
//  val recordReader = fileReader.rows(options)

  def getOrcFileReader(): Reader = fileReader
  val listStripeInformation = fileReader.getStripes()

  def numberOfRows: Long = fileReader.getNumberOfRows()
  override def len: Long = fileReader.getContentLength()
  override def getGroupCount: Int = fileReader.getStripes().size()
  override def getFieldCount: Int = fileReader.getSchema().getFieldNames().size()
  // Not used by orc data file.
  override def fin: FSDataInputStream = null
} 
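
A minimal usage sketch (the path is hypothetical, and since the class is private[oap] the call site has to live inside the org.apache.spark.sql.execution.datasources.oap.io package):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val conf = new Configuration()
val meta = new OrcDataFileMeta(new Path("/tmp/oap/part-00000.orc"), conf) // hypothetical file
println(s"rows=${meta.numberOfRows}, stripes=${meta.getGroupCount}, fields=${meta.getFieldCount}")
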
Example 2
Source File: BedFileIterator.scala    From glow   with Apache License 2.0
package io.projectglow.plink

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.FSDataInputStream


class BedFileIterator(
    stream: LittleEndianDataInputStream,
    underlyingStream: FSDataInputStream,
    numBlocks: Int,
    blockSize: Int)
    extends Iterator[Array[Byte]] {

  var blockIdx = 0
  val byteArray: Array[Byte] = new Array[Byte](blockSize)

  def hasNext(): Boolean = {
    val ret = blockIdx < numBlocks
    if (!ret) {
      cleanup()
    }
    ret
  }

  def next(): Array[Byte] = {
    blockIdx += 1
    stream.readFully(byteArray)
    byteArray
  }

  private def cleanup(): Unit = {
    underlyingStream.close()
  }
} 
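
A sketch of how the iterator might be driven; the magic-byte skip and the block sizing are assumptions about the caller, not something shown in the excerpt:

import com.google.common.io.LittleEndianDataInputStream
import io.projectglow.plink.BedFileIterator
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val path = new Path("hdfs:///data/sample.bed")           // hypothetical file
val underlying = path.getFileSystem(new Configuration()).open(path)
val stream = new LittleEndianDataInputStream(underlying)
stream.skipBytes(3)                                      // assumed: skip the 3-byte PLINK .bed header
val numBlocks = 1000                                     // assumed: one block per variant
val blockSize = 25                                       // assumed: ceil(numSamples / 4) bytes per block
new BedFileIterator(stream, underlying, numBlocks, blockSize)
  .foreach(block => println(block.length))               // note: the same backing array is reused
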
Example 3
Source File: MatrixMetaUtils.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.graph.utils
import java.io.IOException

import com.tencent.angel.conf.AngelConf
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.MatrixContext
import com.tencent.angel.model.output.format.{MatrixFilesMeta, ModelFilesConstent}
import com.tencent.angel.ps.storage.partitioner.Partitioner
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

object MatrixMetaUtils {
  def readMatrixContext(matPath: Path, fs: FileSystem): MatrixContext = {
    var input: FSDataInputStream= null
    var matrixFilesMeta: MatrixFilesMeta = null
    try {
      val metaFilePath = new Path(matPath.getName, ModelFilesConstent.modelMetaFileName)
      if (!fs.exists(metaFilePath)) {
        throw new IOException(s"Can not find meta file for matrix $metaFilePath")
      }
      fs.setVerifyChecksum(false)
      input = fs.open(metaFilePath)
      matrixFilesMeta = new MatrixFilesMeta()
      matrixFilesMeta.read(input)
    } catch {
      case e: IOException =>
        e.printStackTrace()
        throw e
      case e: Exception =>
        e.printStackTrace()
        throw e
      case ae: AssertionError =>
        ae.printStackTrace()
        throw ae
    } finally {
      if (input != null) {
        input.close()
      }
    }

    val mc = new MatrixContext
    mc.setName(matrixFilesMeta.getMatrixName)
    mc.setRowNum(matrixFilesMeta.getRow)
    mc.setColNum(matrixFilesMeta.getCol)
    mc.setMaxRowNumInBlock(matrixFilesMeta.getBlockRow)
    mc.setMaxColNumInBlock(matrixFilesMeta.getBlockCol)
    mc.setRowType(RowType.valueOf(matrixFilesMeta.getRowType))
    mc.getAttributes.putAll(matrixFilesMeta.getOptions)
    if (mc.getAttributes.containsKey(AngelConf.Angel_PS_PARTITION_CLASS)) {
      val partitionClassName = mc.getAttributes.get(AngelConf.Angel_PS_PARTITION_CLASS)
      mc.setPartitionerClass(Class.forName(partitionClassName).asInstanceOf[Class[Partitioner]])
      mc.getAttributes.remove(AngelConf.Angel_PS_PARTITION_CLASS)
    }

    mc
  }
} 
Example 4
Source File: DataBuffer.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.fs.FSDataInputStream


class DataBuffer(dataInput: FSDataInputStream) extends Serializable {

  private var bytes = new Array[Byte](1024)
  private var byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  def readBytes(length: Int) = {
    if (length > bytes.length) {
      bytes = new Array[Byte](length)
      byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    }
    else {
      byteBuffer.clear
    }
    dataInput.readFully(bytes, 0, length)
    byteBuffer
  }

  def seek(position: Long) = {
    dataInput.seek(position)
    this
  }

  def close() {
    dataInput.close()
  }
}

object DataBuffer {
  def apply(dataInput: FSDataInputStream) = {
    new DataBuffer(dataInput)
  }
} 
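
A short sketch of the seek-then-read pattern the buffer is built for (file name and offsets are made up):

import com.esri.gdb.DataBuffer
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val path = new Path("/data/gdb/a00000001.gdbtable")      // hypothetical
val buffer = DataBuffer(path.getFileSystem(new Configuration()).open(path))
val header = buffer.seek(0L).readBytes(8)                // little-endian ByteBuffer over the bytes read
val signature = header.getInt
val n1024Blocks = header.getInt
buffer.close()
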
Example 5
Source File: GDBIndex.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.io.{DataInput, File}
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.Logging

object GDBIndex {
  def apply(path: String, name: String, conf: Configuration = new Configuration()) = {
    val filename = StringBuilder.newBuilder.append(path).append(File.separator).append(name).append(".gdbtablx").toString()
    val hdfsPath = new Path(filename)
    val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath)

    val bytes = new Array[Byte](16)
    dataInput.readFully(bytes)
    val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

    val signature = byteBuffer.getInt
    val n1024Blocks = byteBuffer.getInt
    val numRows = byteBuffer.getInt
    val indexSize = byteBuffer.getInt

    new GDBIndex(dataInput, numRows, indexSize)
  }
}

private[gdb] class GDBIndex(dataInput: FSDataInputStream,
                            val numRows: Int,
                            indexSize: Int
                           ) extends Logging with AutoCloseable with Serializable {

  def readSeekForRowNum(rowNum: Int) = {
    val bytes = new Array[Byte](indexSize)
    dataInput.seek(16 + rowNum * indexSize)
    dataInput.readFully(bytes)
    ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt
  }

  def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = {
    dataInput.seek(16 + startAtRow * indexSize)
    val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead
    // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows")
    new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable)
  }

  def close() {
    dataInput.close()
  }
}

private[gdb] class GDBIndexIterator(dataInput: DataInput,
                                    startID: Int,
                                    maxRows: Int,
                                    indexSize: Int
                                   ) extends Iterator[IndexInfo] with Logging with Serializable {

  private val indexInfo = IndexInfo(0, 0)
  private val bytes = new Array[Byte](indexSize)
  private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  private var objectID = startID
  private var nextRow = 0

  def hasNext() = nextRow < maxRows

  def next() = {
    // log.info(s"next::nextRow=$nextRow maxRows=$maxRows")
    nextRow += 1

    objectID += 1
    indexInfo.objectID = objectID

    byteBuffer.clear
    dataInput.readFully(bytes)
    indexInfo.seek = byteBuffer.getInt

    indexInfo
  }
} 
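
Putting the pieces together, a usage sketch (the directory and table name are hypothetical, and since GDBIndex is private[gdb] this must run from code in the com.esri.gdb package):

import org.apache.hadoop.conf.Configuration

val index = GDBIndex("/data/sample.gdb", "a00000001", new Configuration())
index.iterator(startAtRow = 0, numRowsToRead = 10).foreach { info =>
  println(s"objectID=${info.objectID} seek=${info.seek}")
}
index.close()
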
Example 6
Source File: StreamMetadata.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// The enclosing object is elided in this excerpt; it is restored here so the closing brace
// below matches and so Serialization.write has an implicit Formats in scope. (The
// StreamMetadata case class itself is defined earlier in the same file.)
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = metadataFile.getFileSystem(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
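
The excerpt only shows the write side. For completeness, here is a sketch of a matching read path that uses FSDataInputStream, modeled on the same pattern; the single-field case class is an assumption for illustration, not Spark's actual definition:

import java.io.InputStreamReader
import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

object StreamMetadataReadSketch {
  implicit val format = Serialization.formats(NoTypeHints)
  case class Metadata(id: String)                        // illustrative shape only

  def read(metadataFile: Path, hadoopConf: Configuration): Option[Metadata] = {
    val fs = metadataFile.getFileSystem(hadoopConf)
    if (!fs.exists(metadataFile)) return None
    var input: FSDataInputStream = null
    try {
      input = fs.open(metadataFile)
      val reader = new InputStreamReader(input, StandardCharsets.UTF_8)
      Some(Serialization.read[Metadata](reader))
    } finally {
      IOUtils.closeQuietly(input)
    }
  }
}
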
Example 7
Source File: StreamMetadata.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
Example 8
Source File: StreamMetadata.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: FSDataOutputStream = null
    try {
      val fs = FileSystem.get(hadoopConf)
      output = fs.create(metadataFile)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case NonFatal(e) =>
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    } finally {
      IOUtils.closeQuietly(output)
    }
  }
} 
Example 9
Source File: OapBitmapWrappedFiberCacheSuite.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.utils

import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.roaringbitmap.RoaringBitmap

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache}
import org.apache.spark.sql.oap.OapRuntime
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

// Below are used to test the functionality of OapBitmapWrappedFiberCache class.
class OapBitmapWrappedFiberCacheSuite
  extends QueryTest with SharedOapContext {

  private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache =
    OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size)

  test("test the functionality of OapBitmapWrappedFiberCache class") {
    val CHUNK_SIZE = 1 << 16
    val dataForRunChunk = (1 to 9).toSeq
    val dataForArrayChunk = Seq(1, 3, 5, 7, 9)
    val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1)
    val dataCombination =
      dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk
    val dataArray =
      Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination)
    dataArray.foreach(dataIdx => {
      val dir = Utils.createTempDir()
      val rb = new RoaringBitmap()
      dataIdx.foreach(rb.add)
      val rbFile = dir.getAbsolutePath + "rb.bin"
      rb.runOptimize()
      val rbFos = new FileOutputStream(rbFile)
      val rbBos = new ByteArrayOutputStream()
      val rbDos = new DataOutputStream(rbBos)
      rb.serialize(rbDos)
      rbBos.writeTo(rbFos)
      rbBos.close()
      rbDos.close()
      rbFos.close()
      val rbPath = new Path(rbFile.toString)
      val conf = new Configuration()
      val fin = rbPath.getFileSystem(conf).open(rbPath)
      val rbFileSize = rbPath.getFileSystem(conf).getFileStatus(rbPath).getLen
      val rbFiber = BitmapFiberId(
        () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0)
      val rbWfc = new OapBitmapWrappedFiberCache(
        OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber))
      rbWfc.init
      val chunkLength = rbWfc.getTotalChunkLength
      val length = dataIdx.size / CHUNK_SIZE
      assert(chunkLength == (length + 1))
      val chunkKeys = rbWfc.getChunkKeys
      assert(chunkKeys(0).toInt == 0)
      rbWfc.setOffset(0)
      val chunk = rbWfc.getIteratorForChunk(0)
      chunk match {
        case RunChunkIterator(rbWfc) => assert(chunk == RunChunkIterator(rbWfc))
        case ArrayChunkIterator(rbWfc, 0) => assert(chunk == ArrayChunkIterator(rbWfc, 0))
        case BitmapChunkIterator(rbWfc) => assert(chunk == BitmapChunkIterator(rbWfc))
        case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.")
      }
      rbWfc.release
      fin.close
      dir.delete
    })
  }
} 
Example 10
Source File: ParquetDataFileMeta.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.hadoop.util.StringUtils
import org.apache.parquet.hadoop.OapParquetFileReader
import org.apache.parquet.hadoop.metadata.ParquetFooter

private[oap] class ParquetDataFileMeta(val footer: ParquetFooter) extends DataFileMeta {

  require(footer != null, "footer of ParquetDataFileMeta should not be null.")

  override def fin: FSDataInputStream = null

  override def len: Long = 0

  override def getGroupCount: Int = footer.getBlocks.size()

  override def getFieldCount: Int =
    footer.getFileMetaData.getSchema.getColumns.size()
}

private[oap] object ParquetDataFileMeta {
  def apply(conf: Configuration, pathString: String): ParquetDataFileMeta = {
    val path = new Path(StringUtils.unEscapeString(pathString))
    new ParquetDataFileMeta(OapParquetFileReader.readParquetFooter(conf, path))
  }
} 
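
A brief sketch (the path is hypothetical; like the ORC variant above, the class is private[oap], so the caller must sit in the same package):

import org.apache.hadoop.conf.Configuration

val meta = ParquetDataFileMeta(new Configuration(), "/tmp/oap/part-00000.parquet")
println(s"row groups=${meta.getGroupCount}, columns=${meta.getFieldCount}")
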
Example 11
Source File: OapDataReader.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.fs.FSDataInputStream

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OapException, PartitionedFile}
import org.apache.spark.sql.execution.datasources.oap.INDEX_STAT._
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion
import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion.DataFileVersion
import org.apache.spark.unsafe.types.UTF8String

abstract class OapDataReader {
  def read(file: PartitionedFile): Iterator[InternalRow]

  // The two following fields have to be defined by certain versions of OapDataReader for use in
  // [[OapMetricsManager]]
  def rowsReadByIndex: Option[Long]
  def indexStat: INDEX_STAT
}

object OapDataReader extends Logging {

  def readVersion(is: FSDataInputStream, fileLen: Long): DataFileVersion = {
    val MAGIC_VERSION_LENGTH = 4
    val metaEnd = fileLen - 4

    // seek to the position of data file meta length
    is.seek(metaEnd)
    val metaLength = is.readInt()
    // read all bytes of data file meta
    val magicBuffer = new Array[Byte](MAGIC_VERSION_LENGTH)
    is.readFully(metaEnd - metaLength, magicBuffer)

    val magic = UTF8String.fromBytes(magicBuffer).toString
    magic match {
      case m if ! m.contains("OAP") => throw new OapException("Not a valid Oap Data File")
      case m if m == "OAP1" => DataFileVersion.OAP_DATAFILE_V1
      case _ => throw new OapException("Not a supported Oap Data File version")
    }
  }

  def getDataFileClassFor(dataReaderClassFromDataSourceMeta: String, reader: OapDataReader): String
    = {
    dataReaderClassFromDataSourceMeta match {
      case c if c == OapFileFormat.PARQUET_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.ORC_DATA_FILE_CLASSNAME => c
      case c if c == OapFileFormat.OAP_DATA_FILE_CLASSNAME =>
        reader match {
          case r: OapDataReaderV1 => OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME
          case _ => throw new OapException(s"Undefined connection for $reader")
        }
      case _ => throw new OapException(
        s"Undefined data reader class name $dataReaderClassFromDataSourceMeta")
    }
  }
} 
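
A sketch of calling readVersion against a file; the path is hypothetical, and the stream is closed by the caller since the method leaves stream ownership outside:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.oap.io.OapDataReader

val path = new Path("/tmp/oap/part-00000.data")          // hypothetical OAP data file
val fs = path.getFileSystem(new Configuration())
val is = fs.open(path)
try {
  val version = OapDataReader.readVersion(is, fs.getFileStatus(path).getLen)
  println(s"data file version: $version")
} finally {
  is.close()
}
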
Example 12
Source File: TFRecordInputFormat.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {
  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
  RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null


    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
} 
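
One plausible way to wire this input format into Spark (a sketch; BigDL's own loading code may differ):

import com.intel.analytics.bigdl.utils.tf.TFRecordInputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.SparkContext

def loadTFRecords(sc: SparkContext, path: String) =
  sc.newAPIHadoopFile(
    path,
    classOf[TFRecordInputFormat],
    classOf[BytesWritable],
    classOf[NullWritable]
  ).map { case (record, _) => record.copyBytes() }       // copy, since Hadoop reuses the writable
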
Example 13
Source File: ExcelFileSaver.scala    From spark-excel   with Apache License 2.0
package com.crealytics.spark.excel

import java.io.BufferedOutputStream

import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT}
import com.norbitltd.spoiwo.model._
import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.poi.ss.util.CellRangeAddress
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.collection.JavaConverters._

object ExcelFileSaver {
  final val DEFAULT_SHEET_NAME = "Sheet1"
  final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm"
  final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000"
}

class ExcelFileSaver(
  fs: FileSystem,
  location: Path,
  dataFrame: DataFrame,
  saveMode: SaveMode,
  dataLocator: DataLocator,
  header: Boolean = true
) {
  def save(): Unit = {
    def sheet(workbook: XSSFWorkbook) = {
      val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None
      val dataRows = dataFrame
        .toLocalIterator()
        .asScala
        .map(_.toSeq)
      dataLocator.toSheet(headerRow, dataRows, workbook)
    }
    val fileAlreadyExists = fs.exists(location)
    def writeToWorkbook(workbook: XSSFWorkbook): Unit = {
      Workbook(sheet(workbook)).writeToExisting(workbook)
      autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write)
    }
    (fileAlreadyExists, saveMode) match {
      case (false, _) | (_, SaveMode.Overwrite) =>
        if (fileAlreadyExists) {
          fs.delete(location, true)
        }
        writeToWorkbook(new XSSFWorkbook())
      case (true, SaveMode.ErrorIfExists) =>
        sys.error(s"path $location already exists.")
      case (true, SaveMode.Ignore) => ()
      case (true, SaveMode.Append) =>
        val inputStream: FSDataInputStream = fs.open(location)
        val workbook = new XSSFWorkbook(inputStream)
        inputStream.close()
        writeToWorkbook(workbook)
    }
  }

  def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = {
    try {
      fun(closeable)
    } finally {
      closeable.close()
    }
  }
} 
Example 14
Source File: WholeFileReader.scala    From magellan   with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done){
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
} 
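
The record reader alone cannot feed a job; a minimal, non-splittable FileInputFormat along these lines would pair with it (the class name here is hypothetical, and magellan presumably ships its own equivalent wiring):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class WholeFileInputFormat extends FileInputFormat[NullWritable, Text] {
  override protected def isSplitable(context: JobContext, file: Path): Boolean = false
  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
    : RecordReader[NullWritable, Text] = new WholeFileReader()
}
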
Example 15
Source File: StreamMetadata.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.{InputStreamReader, OutputStreamWriter}
import java.nio.charset.StandardCharsets
import java.util.ConcurrentModificationException

import scala.util.control.NonFatal

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream
import org.apache.spark.sql.streaming.StreamingQuery

// Enclosing object elided in this excerpt; restored so the closing brace below matches.
object StreamMetadata extends Logging {
  implicit val format = Serialization.formats(NoTypeHints)

  def write(
      metadata: StreamMetadata,
      metadataFile: Path,
      hadoopConf: Configuration): Unit = {
    var output: CancellableFSDataOutputStream = null
    try {
      val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf)
      output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false)
      val writer = new OutputStreamWriter(output)
      Serialization.write(metadata, writer)
      writer.close()
    } catch {
      case e: FileAlreadyExistsException =>
        if (output != null) {
          output.cancel()
        }
        throw new ConcurrentModificationException(
          s"Multiple streaming queries are concurrently using $metadataFile", e)
      case e: Throwable =>
        if (output != null) {
          output.cancel()
        }
        logError(s"Error writing stream metadata $metadata to $metadataFile", e)
        throw e
    }
  }
} 
Example 16
Source File: MergeStrategySpec.scala    From daf   with BSD 3-Clause "New" or "Revised" License
package daf.filesystem

import java.io.{ Closeable, InputStream }
import java.util.Scanner

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path }
import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec }

import scala.collection.convert.decorateAsScala._
import scala.util.{ Random, Try }

class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val fileSystem = FileSystem.getLocal(new Configuration)

  private val numFiles = 10

  private val baseDir = "test-dir".asHadoop

  private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d"

  private def safely[A <: Closeable, U](f: A => U) = { stream: A =>
    val attempt = Try { f(stream) }
    stream.close()
    attempt
  }

  private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path)

  private def readFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get }
  }

  private def openFiles = Try {
    fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) }
  }

  private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream =>
    Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row =>
      stream.writeUTF { row.mkString("", ",", "\n") }
    }
  } apply fileSystem.create { workingDir / fileName }

  private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match {
    case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings
    case (head, tail)                 => randomSplits(tail, head.mkString +: strings)
  }

  private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) }

  private def createFiles = Try {
    0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse`
  }

  private def prepareData = for {
    _ <- createWorkingDir
    _ <- createFiles
  } yield ()

  private def purgeData = Try { fileSystem.delete(workingDir, true) }

  override def beforeAll() = prepareData.get

  override def afterAll() = purgeData.get

  "MergeStrategies info" when {

    "given compressed format files" must {

      "throw an exception" in {
        an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) }
      }
    }

    "given data as csv" must {

      "drop one line and merge the rest" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size - numFiles + 1 }
        } apply MergeStrategies.csv.merge { openFiles.get }
      }
    }

    "given data as json" must {

      "just merge the files into one" in {
        safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt =>
          for {
            merged   <- attempt
            expected <- readFiles
          } merged.size should be { expected.size }
        } apply MergeStrategies.json.merge { openFiles.get }
      }

    }
  }
} 
Example 17
Source File: Ensembl.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.ParserGff3Data
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class Ensembl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse ensembl data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath:String = _
  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/ensembl").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Ensembl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json"

    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    val parser: ParserGff3Data = new ParserGff3Data
    var fdis: FSDataInputStream =null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var count:Int = 0
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]

      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var eachStr:String=null

      // Note: the assignment expression returns Unit, so the first comparison is always true;
      // the loop actually terminates on the `eachStr != null` check once readLine returns null.
      while ((eachStr = br.readLine()) != null && eachStr != null) {
        doc = parser.parserGff3(eachStr)

        if(doc.toString.length > 2){
          count += 1
          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
      }

      br.close()
      fdis.close()
    })

    hdfsWriter.close()

    out.write(session.read.json(hdfsPathTemporary))
  }
} 
Example 18
Source File: PDBData.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
import org.json.JSONObject

class PDBData extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse PDB data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)

  var cachePath:String = _
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    val configuration: Configuration = new Configuration()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json"

    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()

    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var doc: JSONObject = null
    var pdb: PDB = null
    var count:Int=0
    inDf.collect().foreach(row => {
      count += 1
      pathStr = row.get(0).asInstanceOf[String]

      pdb = new PDB(pathStr,fs)
      doc = pdb.getDoc

      doc.write(hdfsWriter)
      hdfsWriter.write("\n")

      doc = null
    })
    hdfsWriter.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    out.write(df)
}

  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/PDB").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/PDBData.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

} 
Example 19
Source File: Pathway.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.microorganism

import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter}

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject


class Pathway extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Parse Pathway data"
  override val inportList: List[String] =List(Port.DefaultPort.toString)
  override val outportList: List[String] = List(Port.DefaultPort.toString)


  var cachePath:String = _
  def setProperties(map: Map[String, Any]): Unit = {
    cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
      .defaultValue("/pathway").required(true)
    descriptor = cachePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/microorganism/Pathway.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val inDf: DataFrame = in.read()
    var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]

    val configuration: Configuration = new Configuration()
    val pathARR: Array[String] = pathStr.split("\\/")
    var hdfsUrl:String=""
    for (x <- (0 until 3)){
      hdfsUrl+=(pathARR(x) +"/")
    }
    configuration.set("fs.defaultFS",hdfsUrl)
    var fs: FileSystem = FileSystem.get(configuration)


    val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json"
    val path: Path = new Path(hdfsPathTemporary)
    if(fs.exists(path)){
      fs.delete(path)
    }
    fs.create(path).close()
    val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence:Boolean = true

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]

      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      var count = 0
      hasAnotherSequence = true // reset per file; otherwise only the first file would ever be parsed
      while (hasAnotherSequence) {
          count += 1
          doc = new JSONObject
          hasAnotherSequence = util.KeggPathway.process(br, doc)

          doc.write(hdfsWriter)
          hdfsWriter.write("\n")
        }
      br.close()
      fdis.close()
    })
    hdfsWriter.close()

    val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary)
    df.schema.printTreeString()
    println(df.count)

    out.write(df)

  }
} 
Example 20
Source File: PostUrl.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.http

import java.io.{BufferedReader, InputStreamReader}
import java.net.URI

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.commons.httpclient.HttpClient
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.sql.SparkSession


class PostUrl extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override val description: String = "Send a post request to the specified http"

  var url : String= _
  var jsonPath : String = _


  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    //read  json from hdfs
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(jsonPath),conf)
    val stream: FSDataInputStream = fs.open(new Path(jsonPath))
    val bufferReader = new BufferedReader(new InputStreamReader(stream))
    var lineTxt = bufferReader.readLine()
    val buffer = new StringBuffer()
    while (lineTxt != null ){
      buffer.append(lineTxt.mkString)
      lineTxt=bufferReader.readLine()
    }

    // post
    val client = HttpClients.createDefault()
    val httpClient = new HttpClient()
    httpClient.getParams().setContentCharset("utf-8")

    val post = new HttpPost(url)
    post.addHeader("content-Type","application/json")
    post.setEntity(new StringEntity(buffer.toString))
    val response = client.execute(post)
    val entity = response.getEntity
    val str = EntityUtils.toString(entity,"UTF-8")
    println("Code is " + str)

  }


  override def setProperties(map: Map[String, Any]): Unit = {
    url = MapUtil.get(map,key="url").asInstanceOf[String]
    jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val url = new PropertyDescriptor()
      .name("url")
      .displayName("Url")
      .defaultValue("")
      .description("http request address")
      .required(true)
      .example("http://master:8002/flow/start")

    val jsonPath = new PropertyDescriptor()
      .name("jsonPath")
      .displayName("JsonPath")
      .defaultValue("")
      .description("json parameter path for post request")
      .required(true)
        .example("hdfs://master:9000/work/flow.json")

    descriptor = url :: descriptor
    descriptor = jsonPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/http/PostUrl.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.HttpGroup.toString)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

} 
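
The manual readLine loop above can be written more compactly; a sketch of an equivalent helper built on commons-io (the helper name is illustrative):

import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

def readHdfsTextFile(pathStr: String): String = {
  val path = new Path(pathStr)
  val fs = FileSystem.get(path.toUri, new Configuration())
  var in: FSDataInputStream = null
  try {
    in = fs.open(path)
    IOUtils.toString(in, StandardCharsets.UTF_8)
  } finally {
    IOUtils.closeQuietly(in)
  }
}
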
Example 21
Source File: HdfsStreamAccessor.scala    From ArchiveSpark   with MIT License
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
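
A usage sketch; the HdfsLocationInfo constructor order (path, offset, length) and its package are assumptions based on how the fields are used above:

import org.apache.commons.io.IOUtils
import org.archive.archivespark.dataspecs.access.{HdfsLocationInfo, HdfsStreamAccessor}

val accessor = new HdfsStreamAccessor(HdfsLocationInfo("/data/example.warc.gz", 1024L, 4096L))
accessor.get.foreach { stream =>
  try {
    val bytes = IOUtils.toByteArray(stream)  // bounded to `length` bytes by the BoundedInputStream
    println(s"read ${bytes.length} bytes")
  } finally {
    stream.close()
  }
}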