org.apache.hadoop.fs.FileSystem Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileSystem.
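Before the project-specific examples, here is a minimal, self-contained sketch of the basic FileSystem lifecycle that most of them build on: resolve a FileSystem from a Hadoop Configuration, then create, read, inspect, and delete a path. The object name and the /tmp path are illustrative only, and which file system is actually resolved depends on fs.defaultFS in the configuration on the classpath.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FileSystemBasics {
  def main(args: Array[String]): Unit = {
    // Resolve a FileSystem from the Hadoop configuration; fs.defaultFS decides
    // whether this is HDFS, the local file system, or something else.
    val conf = new Configuration()
    val fs = FileSystem.get(conf)

    // Illustrative path only.
    val path = new Path("/tmp/filesystem-basics/example.txt")

    // Write a small file, overwriting any existing one.
    val out = fs.create(path, true)
    try out.writeBytes("hello hadoop\n") finally out.close()

    // Read it back.
    val in = fs.open(path)
    try println(scala.io.Source.fromInputStream(in).mkString) finally in.close()

    // Inspect metadata and clean up (non-recursive delete).
    println(s"exists=${fs.exists(path)}, size=${fs.getFileStatus(path).getLen}")
    fs.delete(path, false)
  }
}

The same get/create/open/exists/delete calls recur throughout the examples below.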
Example 1
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security

import java.io.{ByteArrayInputStream, DataInputStream}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.security.Credentials

import org.apache.spark.{SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
  // Token renewal interval, this value will be set in the first call,
  // if None means no token renewer specified, so cannot get token renewal interval.
  private var tokenRenewalInterval: Option[Long] = null

  override val serviceName: String = "hdfs"

  override def obtainCredentials(
      hadoopConf: Configuration,
      sparkConf: SparkConf,
      creds: Credentials): Option[Long] = {
    // NameNode to access, used to get tokens from different FileSystems
    nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
      val dstFs = dst.getFileSystem(hadoopConf)
      logInfo("getting token for namenode: " + dst)
      dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
    }

    // Get the token renewal interval if it is not set. It will only be called once.
    if (tokenRenewalInterval == null) {
      tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
    }

    // Get the time of next renewal.
    tokenRenewalInterval.map { interval =>
      creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .map { t =>
          val identifier = new DelegationTokenIdentifier()
          identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
          identifier.getIssueDate + interval
        }.foldLeft(0L)(math.max)
    }
  }

  private def getTokenRenewalInterval(
      hadoopConf: Configuration,
      sparkConf: SparkConf): Option[Long] = {
    // We cannot use the tokens generated with renewer yarn. Trying to renew
    // those will fail with an access control issue. So create new tokens with the logged in
    // user as renewer.
    sparkConf.get(PRINCIPAL).map { renewer =>
      val creds = new Credentials()
      nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
        val dstFs = dst.getFileSystem(hadoopConf)
        dstFs.addDelegationTokens(renewer, creds)
      }
      val t = creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .head
      val newExpiration = t.renew(hadoopConf)
      val identifier = new DelegationTokenIdentifier()
      identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
      val interval = newExpiration - identifier.getIssueDate
      logInfo(s"Renewal Interval is $interval")
      interval
    }
  }

  private def getTokenRenewer(conf: Configuration): String = {
    val delegTokenRenewer = Master.getMasterPrincipal(conf)
    logDebug("delegation token renewer is: " + delegTokenRenewer)
    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
      logError(errorMessage)
      throw new SparkException(errorMessage)
    }
    delegTokenRenewer
  }

  private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
    sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
      sparkConf.get(STAGING_DIR).map(new Path(_))
        .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
  }
}
Example 2
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0 | 6 votes |
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
}
Example 3
Source File: DStreamCheckpointData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
class DStreamCheckpointData[T: ClassTag](dstream: DStream[T])
  extends Serializable with Logging {
  protected val data = new HashMap[Time, AnyRef]()

  // Mapping of the batch time to the checkpointed RDD file of that time
  @transient private var timeToCheckpointFile = new HashMap[Time, String]
  // Mapping of the batch time to the time of the oldest checkpointed RDD
  // in that batch's checkpoint data
  @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time]

  @transient private var fileSystem: FileSystem = null
  protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]]

  def restore() {
    // Create RDDs from the checkpoint data
    currentCheckpointFiles.foreach {
      case (time, file) =>
        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
    }
  }

  override def toString: String = {
    "[\n" + currentCheckpointFiles.size + " checkpoint files \n" +
      currentCheckpointFiles.mkString("\n") + "\n]"
  }

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".writeObject used")
    if (dstream.context.graph != null) {
      dstream.context.graph.synchronized {
        if (dstream.context.graph.checkpointInProgress) {
          oos.defaultWriteObject()
        } else {
          val msg = "Object of " + this.getClass.getName + " is being serialized " +
            " possibly as a part of closure of an RDD operation. This is because " +
            " the DStream object is being referred to from within the closure. " +
            " Please rewrite the RDD operation inside this DStream to avoid this. " +
            " This has been enforced to avoid bloating of Spark tasks " +
            " with unnecessary objects."
          throw new java.io.NotSerializableException(msg)
        }
      }
    } else {
      throw new java.io.NotSerializableException(
        "Graph is unexpectedly null when DStream is being serialized.")
    }
  }

  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".readObject used")
    ois.defaultReadObject()
    timeToOldestCheckpointFileTime = new HashMap[Time, Time]
    timeToCheckpointFile = new HashMap[Time, String]
  }
}
Example 4
Source File: ExecutorSource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConverters._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark]
class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
Example 5
Source File: SentenceTokenizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer

import scala.collection.Iterator
import opennlp.tools.tokenize.{SimpleTokenizer, Tokenizer, TokenizerME, TokenizerModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

class SentenceTokenizer(tokenFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: TokenizerModel = _
  var tokenizer: Tokenizer = _

  def this(tokenFile: URL) {
    this(Some(tokenFile.getPath))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (tokenizer == null) {
        if (!tokenFile.isDefined) {
          tokenizer = SimpleTokenizer.INSTANCE
        } else {
          val src: Path = new Path(tokenFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new TokenizerModel(in)
          tokenizer = new TokenizerME(model)
        }
      }
      val words = tokenizer.tokenize(x)
      words
    })
}

object SentenceTokenizer {
  def apply(tokenFile: Option[String] = None): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)

  def apply(tokenFile: URL): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)
}
Example 6
Source File: SentenceSplitter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer
import opennlp.tools.sentdetect.{SentenceDetector, SentenceDetectorME, SentenceModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.Iterator

class SentenceSplitter(sentFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: SentenceModel = _
  var sentenceDetector: SentenceDetector = _

  def this(sentFileURL: URL) {
    this(Some(sentFileURL.getPath))
  }

  def this(sentFile: String) {
    this(Some(sentFile))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (!sentFile.isDefined) {
        x.split('.')
      } else {
        if (sentenceDetector == null) {
          val src: Path = new Path(sentFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new SentenceModel(in)
          sentenceDetector = new SentenceDetectorME(model)
        }
        sentenceDetector.sentDetect(x)
      }
    })
}

object SentenceSplitter {
  def apply(sentFile: Option[String] = None): SentenceSplitter =
    new SentenceSplitter(sentFile)

  def apply(sentFileURL: URL): SentenceSplitter =
    new SentenceSplitter(sentFileURL)

  def apply(sentFile: String): SentenceSplitter =
    new SentenceSplitter(sentFile)
}
Example 7
Source File: RecordWriter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{File, FileOutputStream}

import com.google.common.primitives.{Ints, Longs}
import com.intel.analytics.bigdl.utils.Crc32
import netty.Crc32c
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.tensorflow.util.Event

private[bigdl] class RecordWriter(file: Path, fs: FileSystem) {
  val outputStream = if (file.toString.startsWith("hdfs://")) {
    // FSDataOutputStream couldn't flush data to localFileSystem in time. So reading summaries
    // will throw exception.
    fs.create(file, true, 1024)
  } else {
    // Using FileOutputStream when write to local.
    new FileOutputStream(new File(file.toString))
  }
  val crc32 = new Crc32c()

  def write(event: Event): Unit = {
    val eventString = event.toByteArray
    val header = Longs.toByteArray(eventString.length.toLong).reverse
    outputStream.write(header)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, header).toInt).reverse)
    outputStream.write(eventString)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, eventString).toInt).reverse)
    if (outputStream.isInstanceOf[FSDataOutputStream]) {
      // Flush data to HDFS.
      outputStream.asInstanceOf[FSDataOutputStream].hflush()
    }
  }

  def close(): Unit = {
    outputStream.close()
  }
}
Example 8
Source File: FileReader.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.BufferedInputStream
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

private[bigdl] object FileReader {
  val fileNameRegex = """bigdl.tfevents.*""".r

  def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = {
    require(fs.isFile(file), s"FileReader: ${file} should be a file")
    val bis = new BufferedInputStream(fs.open(file))
    val longBuffer = new Array[Byte](8)
    val crcBuffer = new Array[Byte](4)
    val bf = new ArrayBuffer[(Long, Float, Double)]
    while (bis.read(longBuffer) > 0) {
      val l = ByteBuffer.wrap(longBuffer.reverse).getLong()
      bis.read(crcBuffer)
      // TODO: checksum
      // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
      val eventBuffer = new Array[Byte](l.toInt)
      bis.read(eventBuffer)
      val e = Event.parseFrom(eventBuffer)
      if (e.getSummary.getValueCount == 1 &&
        tag.equals(e.getSummary.getValue(0).getTag())) {
        bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime))
      }
      bis.read(crcBuffer)
      // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
    }
    bis.close()
    bf.toArray.sortWith(_._1 < _._1)
  }
}
Example 9
Source File: HdfsFileAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil
import org.archive.archivespark.sparkling.io.IOUtil

class HdfsFileAccessor(path: String, decompress: Boolean = true) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    var stream: InputStream = null
    try {
      val raw = fs.open(new Path(path))
      stream = if (decompress) IOUtil.decompress(raw, Some(path)) else raw
      Some(stream)
    } catch {
      case e: Exception =>
        e.printStackTrace()
        if (stream != null) stream.close()
        None
    }
  }
}
Example 10
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] {
  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
Example 11
Source File: HdfsBlockStream.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.io

import java.io.{ByteArrayInputStream, InputStream}

import org.apache.hadoop.fs.{FileSystem, Path}
import org.archive.archivespark.sparkling.logging.LogContext
import org.archive.archivespark.sparkling.util.Common

import scala.util.Try

class HdfsBlockStream(fs: FileSystem, file: String, offset: Long = 0, length: Long = -1, retries: Int = 60, sleepMillis: Int = 1000 * 60) extends InputStream {
  implicit val logContext: LogContext = LogContext(this)

  val path = new Path(file)
  val (blockSize: Int, fileSize: Long) = {
    val status = fs.getFileStatus(path)
    (status.getBlockSize.min(Int.MaxValue).toInt, status.getLen)
  }

  private var pos: Long = offset.max(0)
  private val max: Long = if (length > 0) fileSize.min(pos + length) else fileSize

  private val buffer = new Array[Byte](blockSize)
  private val emptyBlock = new ByteArrayInputStream(Array.emptyByteArray)
  private var block: ByteArrayInputStream = emptyBlock

  def ensureNextBlock(): InputStream = {
    if (block.available() == 0 && pos < max) {
      val end = pos + blockSize
      val blockLength = ((end - (end % blockSize)).min(max) - pos).toInt
      Common.retry(retries, sleepMillis, (retry, e) => {
        "File access failed (" + retry + "/" + retries + "): " + path + " (Offset: " + pos + ") - " + e.getMessage
      }) { retry =>
        val in = fs.open(path, blockLength)
        if (retry > 0) Try(in.seekToNewSource(pos))
        else if (pos > 0) in.seek(pos)
        var read = 0
        while (read < blockLength) read += in.read(buffer, read, blockLength - read)
        Try(in.close())
      }
      pos += blockLength
      block = new ByteArrayInputStream(buffer, 0, blockLength)
    }
    block
  }

  override def read(): Int = ensureNextBlock().read()

  override def read(b: Array[Byte]): Int = ensureNextBlock().read(b)

  override def read(b: Array[Byte], off: Int, len: Int): Int = ensureNextBlock().read(b, off, len)

  override def skip(n: Long): Long = {
    val available = block.available()
    if (n <= available) block.skip(n)
    else {
      block = emptyBlock
      val currentPos = pos - available
      val skip = n.min(max - currentPos)
      pos += skip - available
      skip
    }
  }

  override def available(): Int = block.available()

  override def close(): Unit = {}

  override def markSupported(): Boolean = false
}
Example 12
Source File: FilePathMap.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.util

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

import scala.util.Try

case class FilePathMap(path: String, patterns: Seq[String] = Seq.empty) {
  val pathMap: Map[String, String] = {
    var map = collection.mutable.Map[String, String]()

    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    val files = fs.listFiles(new Path(path), true)
    while (files.hasNext) {
      val path = files.next.getPath
      val filename = path.getName
      if (patterns.isEmpty || patterns.exists(filename.matches)) {
        if (map.contains(filename)) throw new RuntimeException("duplicate filename: " + filename)
        map += filename -> path.getParent.toString.intern
      }
    }

    map.toMap
  }

  def pathToFile(file: String): Option[Path] = Try { new Path(file).getName }.toOption match {
    case Some(f) => pathMap.get(f).map(dir => new Path(dir, f))
    case None => None
  }
}
Example 13
Source File: 2-CommonFunctions.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License | 5 votes |
// Databricks notebook source
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.conf.Configuration

// COMMAND ----------

val prqShrinkageFactor = 0.19 //We found a saving in space of 81% with Parquet

// COMMAND ----------

def analyzeTables(databaseAndTable: String) {
  println("Table: " + databaseAndTable)
  println("....refresh table")
  sql("REFRESH TABLE " + databaseAndTable)
  println("....analyze table")
  sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS")
  println("....done")
}

// COMMAND ----------

def calcOutputFileCountTxtToPrq(srcDataFile: String, targetedFileSizeMB: Int): Int = {
  val fs = FileSystem.get(new Configuration())
  val estFileCount: Int = Math.floor((fs.getContentSummary(new Path(srcDataFile)).getLength * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)).toInt
  if (estFileCount == 0) 1 else estFileCount
}

// COMMAND ----------

// Get recursive file collection you can iterate on
def getRecursiveFileCollection(directoryPath: String): Seq[String] =
  dbutils.fs.ls(directoryPath).map(directoryItem => {
    // Work around double encoding bug
    val directoryItemPath = directoryItem.path.replace("%25", "%").replace("%25", "%")
    if (directoryItem.isDir) getRecursiveFileCollection(directoryItemPath) else Seq[String](directoryItemPath)
  }).reduce(_ ++ _)

// COMMAND ----------

// Delete residual files from job operation (_SUCCESS, _start*, _committed*)
def recursivelyDeleteSparkJobFlagFiles(directoryPath: String) {
  getRecursiveFileCollection(directoryPath).foreach(directoryItemPath => {
    if (directoryItemPath.indexOf("parquet") == -1) {
      println("Deleting...." + directoryItemPath)
      dbutils.fs.rm(directoryItemPath)
    }
  })
}

// COMMAND ----------

dbutils.notebook.exit("Pass")
Example 14
Source File: TopWORDSApp.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession object TopWORDSApp extends Serializable { @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString) def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().getOrCreate() try { TopWORDSParser.parse(args).foreach { args => // remove output location files if exist val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true) // read input corpus val corpus = if (args.numPartitions > 0) spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions) else spark.sparkContext.textFile(args.inputLoc) LOGGER.info("Number of lines of input corpus: " + corpus.count()) // run TopWORDS with the parsed arguments new TopWORDS( tauL = args.tauL, tauF = args.tauF, textLenThld = args.textLenThld, useProbThld = args.useProbThld, numIterations = args.numIterations, convergeTol = args.convergeTol, wordBoundaryThld = args.wordBoundaryThld) .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts") } //exit normally LOGGER.info("Running TopWORDS successfully!") if (spark.sparkContext.master.contains("local")) sys.exit(0) } catch { case ex: Throwable => LOGGER.error("Running TopWORDS fail!", ex) //signal to external process if (spark.sparkContext.master.contains("local")) sys.exit(1) } finally spark.stop() } }
Example 15
Source File: TestTopWORDS.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

object TestTopWORDS {
  def main(args: Array[String]) {
    // setup spark session
    val spark = SparkSession.builder().master("local[1]").appName(this.getClass.toString).getOrCreate()
    val inputFile = "test_data/story_of_stone.txt"
    val outputFile = "test_data/test_output"
    // remove output location files if exist
    val files = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    if (files.exists(new Path(outputFile))) files.delete(new Path(outputFile), true)
    val corpus = spark.sparkContext.textFile(inputFile)
    new TopWORDS(
      tauL = 10,
      tauF = 5,
      textLenThld = 2000,
      useProbThld = 1E-8,
      numIterations = 10,
      convergeTol = 1E-3,
      wordBoundaryThld = 0.0)
      .run(corpus, outputFile + "/dictionary", outputFile + "/segmented_texts")
  }
}
Example 16
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.http import java.io.{BufferedReader, InputStreamReader} import java.net.URI import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.commons.httpclient.HttpClient import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.http.client.methods.HttpPost import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.sql.SparkSession class PostUrl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override val description: String = "Send a post request to the specified http" var url : String= _ var jsonPath : String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession]() //read json from hdfs val conf = new Configuration() val fs = FileSystem.get(URI.create(jsonPath),conf) val stream: FSDataInputStream = fs.open(new Path(jsonPath)) val bufferReader = new BufferedReader(new InputStreamReader(stream)) var lineTxt = bufferReader.readLine() val buffer = new StringBuffer() while (lineTxt != null ){ buffer.append(lineTxt.mkString) lineTxt=bufferReader.readLine() } // post val client = HttpClients.createDefault() val httpClient = new HttpClient() httpClient.getParams().setContentCharset("utf-8") val post = new HttpPost(url) post.addHeader("content-Type","application/json") post.setEntity(new StringEntity(buffer.toString)) val response = client.execute(post) val entity = response.getEntity val str = EntityUtils.toString(entity,"UTF-8") println("Code is " + str) } override def setProperties(map: Map[String, Any]): Unit = { url = MapUtil.get(map,key="url").asInstanceOf[String] jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val url = new PropertyDescriptor() .name("url") .displayName("Url") .defaultValue("") .description("http request address") .required(true) .example("http://master:8002/flow/start") val jsonPath = new PropertyDescriptor() .name("jsonPath") .displayName("JsonPath") .defaultValue("") .description("json parameter path for post request") .required(true) .example("hdfs://master:9000/work/flow.json") descriptor = url :: descriptor descriptor = jsonPath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/http/PostUrl.png") } override def getGroup(): List[String] = { List(StopGroup.HttpGroup.toString) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 17
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.json.JSONObject class Pathway extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse Pathway data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/pathway").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Pathway.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val inDf: DataFrame = in.read() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val configuration: Configuration = new Configuration() val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var fdis: FSDataInputStream = null var br: BufferedReader = null var doc: JSONObject = null var hasAnotherSequence:Boolean = true inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var count = 0 while (hasAnotherSequence) { count += 1 doc = new JSONObject hasAnotherSequence = util.KeggPathway.process(br, doc) doc.write(hdfsWriter) hdfsWriter.write("\n") } br.close() fdis.close() }) hdfsWriter.close() val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary) df.schema.printTreeString() println(df.count) out.write(df) } }
Example 18
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.PDB import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class PDBData extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse PDB data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var doc: JSONObject = null var pdb: PDB = null var count:Int=0 inDf.collect().foreach(row => { count += 1 pathStr = row.get(0).asInstanceOf[String] pdb = new PDB(pathStr,fs) doc = pdb.getDoc doc.write(hdfsWriter) hdfsWriter.write("\n") doc = null }) hdfsWriter.close() val df: DataFrame = session.read.json(hdfsPathTemporary) out.write(df) } def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/PDB").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/PDBData.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 19
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.ParserGff3Data import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class Ensembl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse ensembl data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/ensembl").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Ensembl.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) val parser: ParserGff3Data = new ParserGff3Data var fdis: FSDataInputStream =null var br: BufferedReader = null var doc: JSONObject = null var count:Int = 0 inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var eachStr:String=null while((eachStr = br.readLine()) != null && eachStr != null ){ doc = parser.parserGff3(eachStr) if(doc.toString.length > 2){ count += 1 doc.write(hdfsWriter) hdfsWriter.write("\n") } } br.close() fdis.close() }) hdfsWriter.close() out.write(session.read.json(hdfsPathTemporary)) } }
Example 20
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.filesystem import java.io.{ Closeable, InputStream } import java.util.Scanner import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path } import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec } import scala.collection.convert.decorateAsScala._ import scala.util.{ Random, Try } class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val fileSystem = FileSystem.getLocal(new Configuration) private val numFiles = 10 private val baseDir = "test-dir".asHadoop private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d" private def safely[A <: Closeable, U](f: A => U) = { stream: A => val attempt = Try { f(stream) } stream.close() attempt } private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path) private def readFiles = Try { fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get } } private def openFiles = Try { fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) } } private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream => Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row => stream.writeUTF { row.mkString("", ",", "\n") } } } apply fileSystem.create { workingDir / fileName } private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match { case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings case (head, tail) => randomSplits(tail, head.mkString +: strings) } private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) } private def createFiles = Try { 0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse` } private def prepareData = for { _ <- createWorkingDir _ <- createFiles } yield () private def purgeData = Try { fileSystem.delete(workingDir, true) } override def beforeAll() = prepareData.get override def afterAll() = purgeData.get "MergeStrategies info" when { "given compressed format files" must { "throw an exception" in { an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) } } } "given data as csv" must { "drop one line and merge the rest" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size - numFiles + 1 } } apply MergeStrategies.csv.merge { openFiles.get } } } "given data as json" must { "just merge the files into one" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size } } apply MergeStrategies.json.merge { openFiles.get } } } } }
Example 21
Source File: HDFSBase.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.util import better.files.{ File, _ } import daf.util.DataFrameClasses.{ Address, Person } import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hdfs.{ HdfsConfiguration, MiniDFSCluster } import org.apache.hadoop.test.PathUtils import org.apache.spark.sql.{ SaveMode, SparkSession } import org.scalatest.{ BeforeAndAfterAll, FlatSpec, Matchers } import org.slf4j.LoggerFactory import scala.util.{ Failure, Random, Try } abstract class HDFSBase extends FlatSpec with Matchers with BeforeAndAfterAll { var miniCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception) var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception) val sparkSession: SparkSession = SparkSession.builder().master("local").getOrCreate() val alogger = LoggerFactory.getLogger(this.getClass) val (testDataPath, confPath) = { val testDataPath = s"${PathUtils.getTestDir(this.getClass).getCanonicalPath}/MiniCluster" val confPath = s"$testDataPath/conf" ( testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false), confPath.toFile.createIfNotExists(asDirectory = true, createParents = false) ) } def pathAvro = "opendata/test.avro" def pathParquet = "opendata/test.parquet" def pathCsv = "opendata/test.csv" def getSparkSession = sparkSession override def beforeAll(): Unit = { val conf = new HdfsConfiguration() conf.setBoolean("dfs.permissions", true) System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString) //FileUtil.fullyDelete(testDataPath.toJava) conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*") conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*") val builder = new MiniDFSCluster.Builder(conf) miniCluster = Try(builder.build()) fileSystem = miniCluster.map(_.getFileSystem) fileSystem.foreach(fs => { val confFile: File = confPath / "hdfs-site.xml" for { os <- confFile.newOutputStream.autoClosed } fs.getConf.writeXml(os) }) writeDf() } override def afterAll(): Unit = { miniCluster.foreach(_.shutdown(true)) val _ = testDataPath.parent.parent.delete(true) sparkSession.stop() } private def writeDf(): Unit = { import sparkSession.implicits._ alogger.info(s"TestDataPath ${testDataPath.toJava.getAbsolutePath}") alogger.info(s"ConfPath ${confPath.toJava.getAbsolutePath}") val persons = (1 to 10).map(i => Person(s"Andy$i", Random.nextInt(85), Address("Via Ciccio Cappuccio"))) val caseClassDS = persons.toDS() caseClassDS.write.format("parquet").mode(SaveMode.Overwrite).save(pathParquet) caseClassDS.write.format("com.databricks.spark.avro").mode(SaveMode.Overwrite).save(pathAvro) //writing directly the Person dataframe generates an exception caseClassDS.toDF.select("name", "age").write.format("csv").mode(SaveMode.Overwrite).option("header", "true").save(pathCsv) } } object DataFrameClasses { final case class Address(street: String) final case class Person(name: String, age: Int, address: Address) }
Example 22
Source File: HDFSMiniCluster.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package it.teamdigitale.miniclusters import better.files.File import org.apache.logging.log4j.LogManager import org.apache.hadoop.hdfs.HdfsConfiguration import org.apache.hadoop.test.PathUtils import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hdfs.{HdfsConfiguration, MiniDFSCluster} import better.files._ import scala.util.{Failure, Try} class HDFSMiniCluster extends AutoCloseable { val alogger = LogManager.getLogger(this.getClass) var hdfsCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception) var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception) val (testDataPath, confPath) = { val testDataPath = s"${PathUtils.getTestDir(classOf[HDFSMiniCluster]).getCanonicalPath}/MiniCluster" val confPath = s"$testDataPath/conf" ( testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false), confPath.toFile.createIfNotExists(asDirectory = true, createParents = false) ) } def start(): Unit = { alogger.info("Starting HDFS mini cluster") val conf = new HdfsConfiguration() conf.setBoolean("dfs.permissions", true) System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString) //FileUtil.fullyDelete(testDataPath.toJava) conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*") conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*") val builder = new MiniDFSCluster.Builder(conf) hdfsCluster = Try(builder.build()) fileSystem = hdfsCluster.map(_.getFileSystem) fileSystem.foreach(fs => { val confFile: File = confPath / "hdfs-site.xml" for {os <- confFile.newOutputStream.autoClosed} fs.getConf.writeXml(os) }) } override def close() = { alogger.info("Stopping HDFS mini cluster") hdfsCluster.foreach(_.shutdown(true)) val _ = testDataPath.parent.parent.delete(true) } }
Example 23
Source File: HDFSUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.hadoop.common.utils

import java.io.File
import java.nio.file.Paths
import java.security.PrivilegedExceptionAction

import com.webank.wedatasphere.linkis.common.conf.Configuration.hadoopConfDir
import com.webank.wedatasphere.linkis.hadoop.common.conf.HadoopConf._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.UserGroupInformation

object HDFSUtils {

  def getConfiguration(user: String): Configuration = getConfiguration(user, hadoopConfDir)

  def getConfiguration(user: String, hadoopConfDir: String): Configuration = {
    val confPath = new File(hadoopConfDir)
    if (!confPath.exists() || confPath.isFile) {
      throw new RuntimeException(s"Create hadoop configuration failed, path $hadoopConfDir not exists.")
    }
    val conf = new Configuration()
    conf.addResource(new Path(Paths.get(hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf.addResource(new Path(Paths.get(hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf.addResource(new Path(Paths.get(hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath))
    conf
  }

  def getHDFSRootUserFileSystem: FileSystem =
    getHDFSRootUserFileSystem(getConfiguration(HADOOP_ROOT_USER.getValue))

  def getHDFSRootUserFileSystem(conf: org.apache.hadoop.conf.Configuration): FileSystem =
    getHDFSUserFileSystem(HADOOP_ROOT_USER.getValue, conf)

  def getHDFSUserFileSystem(userName: String): FileSystem =
    getHDFSUserFileSystem(userName, getConfiguration(userName))

  def getHDFSUserFileSystem(userName: String, conf: org.apache.hadoop.conf.Configuration): FileSystem =
    getUserGroupInformation(userName)
      .doAs(new PrivilegedExceptionAction[FileSystem] {
        def run = FileSystem.get(conf)
      })

  def getUserGroupInformation(userName: String): UserGroupInformation = {
    if (KERBEROS_ENABLE.getValue) {
      val path = new File(KEYTAB_FILE.getValue, userName + ".keytab").getPath
      val user = getKerberosUser(userName)
      UserGroupInformation.setConfiguration(getConfiguration(userName))
      UserGroupInformation.loginUserFromKeytabAndReturnUGI(user, path)
    } else {
      UserGroupInformation.createRemoteUser(userName)
    }
  }

  def getKerberosUser(userName: String): String = {
    var user = userName
    if (KEYTAB_HOST_ENABLED.getValue) {
      user = user + "/" + KEYTAB_HOST.getValue
    }
    user
  }
}
Example 24
Source File: DeltaOutputFormat.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.{ArrayWritable, NullWritable}
import org.apache.hadoop.mapred.{JobConf, OutputFormat, RecordWriter}
import org.apache.hadoop.util.Progressable

class DeltaOutputFormat extends OutputFormat[NullWritable, ArrayWritable] {

  private def writingNotSupported[T](): T = {
    throw new UnsupportedOperationException(
      "Writing to a Delta table in Hive is not supported. Please use Spark to write.")
  }

  override def getRecordWriter(
    ignored: FileSystem,
    job: JobConf,
    name: String,
    progress: Progressable): RecordWriter[NullWritable, ArrayWritable] = writingNotSupported()

  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = writingNotSupported()
}
Example 25
Source File: ModelSource.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common

import java.io.{InputStreamReader, BufferedReader}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}

case class ModelSource(
  root: String,
  fs: FileSystem
) {

  def readFile(path: String): String = {
    val fsPath = filePath(path)
    val reader = new BufferedReader(new InputStreamReader(fs.open(fsPath)))

    val builder = new StringBuilder()
    var line: String = null
    while ({ line = reader.readLine(); line != null }) {
      builder.append(line + "\n")
    }
    builder.mkString
  }

  def findFile(dir: String, recursive: Boolean, f: String => Boolean): Option[Path] = {
    val dirPath = filePath(dir)
    if (fs.exists(dirPath) & fs.isDirectory(dirPath)) {
      val iter = fs.listFiles(dirPath, recursive)
      while (iter.hasNext) {
        val st = iter.next()
        if (st.isFile && f(st.getPath.getName)) return Some(st.getPath)
      }
      None
    } else {
      None
    }
  }

  def filePath(path: String): Path = {
    new Path(s"$root/$path")
  }
}

object ModelSource {

  def local(path: String): ModelSource = {
    ModelSource(path, FileSystem.getLocal(new Configuration()))
  }

  def hadoop(path: String, conf: Configuration): ModelSource = {
    val fs = FileSystem.get(conf)
    ModelSource(path, fs)
  }
}
Example 26
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util

import java.io._
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.{DeflaterOutputStream, InflaterInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.HBaseConfiguration

object Util {
  val iteration = new AtomicInteger(0)

  def getTempFilePath(conf: Configuration, prefix: String): String = {
    val fileSystem = FileSystem.get(conf)
    val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}")
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }
    path.getName
  }

  def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val deflaterOutputStream = new DeflaterOutputStream(bos)
    val dos = new DataOutputStream(deflaterOutputStream)
    configuration.write(dos)
    dos.close()
    bos.toByteArray
  }

  def deserializeHBaseConfiguration(arr: Array[Byte]) = {
    val conf = HBaseConfiguration.create
    conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr))))
    conf
  }
}
Example 27
Source File: FilePattern.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.HdfsIterator
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.language.implicitConversions

object FilePattern {
  def apply(path: Path)(implicit fs: FileSystem): FilePattern = apply(path.toString())
  def apply(path: java.nio.file.Path)(implicit fs: FileSystem): FilePattern =
    apply(path.toAbsolutePath().toString(), { _ => true })
  implicit def stringToFilePattern(str: String)(implicit fs: FileSystem): FilePattern = FilePattern(str)
}

case class FilePattern(pattern: String,
                       filter: org.apache.hadoop.fs.Path => Boolean = { _ => true }) extends Logging {

  def isRegex(): Boolean = pattern.contains("*")
  def isDirectory(): Boolean = pattern.endsWith("/")

  def toPaths()(implicit fs: FileSystem): List[Path] = {
    val paths = if (isRegex) {
      val regex = new Path(pattern).getName.replace("*", ".*?")
      val dir = new Path(pattern).getParent
      logger.debug(s"File expansion will check path $dir for files matching $regex")
      HdfsIterator.remote(fs.listFiles(dir, false)).toList
        .map(_.getPath)
        .filter { path => path.getName.matches(regex) }
        .filter(filter)
    } else if (fs.isDirectory(new Path(pattern))) {
      val path = new Path(pattern.stripSuffix("/"))
      logger.debug(s"File expansion will search directory $path")
      HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toList.filter(fs.isFile).filter(filter)
    } else {
      List(new Path(pattern))
    }
    logger.debug(s"toPaths has returned ${paths.size} paths, first 5: ${paths.take(5).mkString(",")}")
    paths
  }

  def withFilter(p: Path => Boolean): FilePattern = copy(filter = p)
}
Example 28
Source File: HdfsMkdir.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.util

import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsMkdir {
  def apply(path: Path, inheritPermissionsDefault: Boolean)(implicit fs: FileSystem): Unit = {
    if (!fs.exists(path)) {
      // iterate through the parents until we hit a parent that exists, then take that, which will give
      // us the first folder that exists
      val parent = Iterator.iterate(path)(_.getParent).dropWhile(false == fs.exists(_)).take(1).toList.head
      // using the folder that exists, get its permissions
      val permission = fs.getFileStatus(parent).getPermission
      fs.create(path, false)
      if (inheritPermissionsDefault)
        fs.setPermission(path, permission)
    }
  }
}
Example 29
Source File: HdfsOps.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.{HdfsIterator, PathIterator}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

object HdfsOps extends Logging {

  def makePathVisible(path: Path)(implicit fs: FileSystem): Unit = {
    if (path.getName.startsWith(".")) {
      logger.info(s"Making $path visible by stripping leading .")
      val dest = new Path(path.getParent, path.getName.drop(1))
      fs.rename(path, dest)
    }
  }

  def findFiles(path: Path, recursive: Boolean, fs: FileSystem): Iterator[LocatedFileStatus] = {
    HdfsIterator.remote(fs.listFiles(path, recursive))
  }

  def mkdirsp(path: Path, fs: FileSystem): Boolean = PathIterator(path).forall(fs.mkdirs)
}
Example 30
Source File: AvroSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSource(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Source with Using { override lazy val schema: StructType = { using(AvroReaderFns.createAvroReader(path)) { reader => val record = reader.next() AvroSchemaFns.fromAvroSchema(record.getSchema) } } override def parts(): Seq[Publisher[Seq[Row]]] = Seq(AvroSourcePublisher(path)) } case class AvroSourcePublisher(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val deserializer = new AvroDeserializer() try { using(AvroReaderFns.createAvroReader(path)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) AvroRecordIterator(reader) .takeWhile(_ => running.get) .map(deserializer.toRow) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } object AvroSource { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSource = AvroSource(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSource = apply(path.toFile) }
Example 31
Source File: AvroSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSink(path: Path, overwrite: Boolean = false, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None) (implicit conf: Configuration, fs: FileSystem) extends Sink { def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite) def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission)) def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions)) override def open(schema: StructType): SinkWriter = new SinkWriter { private val writer = new AvroWriter(schema, fs.create(path, overwrite)) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } } object AvroSink { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile) }
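A write-side sketch that mirrors the tests later on this page, building a tiny in-memory DataStream; the schema, values and output path are illustrative only.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.avro.AvroSink
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object AvroSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("name", "location"), Seq(Vector("sam", "aylesbury")))
  ds.to(AvroSink(new Path("people.avro")).withOverwrite(true))
}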
Example 32
Source File: JsonSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} case class JsonSink(path: Path)(implicit fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { private val lock = new AnyRef() private val out = fs.create(path) val mapper = new ObjectMapper with ScalaObjectMapper mapper.registerModule(DefaultScalaModule) override def write(row: Row) { val map = schema.fieldNames.zip(row.values).toMap val json = mapper.writeValueAsString(map) lock.synchronized { out.writeBytes(json) out.writeBytes("\n") } } override def close() { out.close() } } }
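Usage follows the same Sink pattern. The sketch below is illustrative, not project code; each row ends up as one JSON document per line.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.json.JsonSink
import io.eels.datastream.DataStream
import io.eels.schema.{Field, StructType}
object JsonSinkUsage extends App {
  implicit val fs = FileSystem.get(new Configuration())
  val schema = StructType(Field("name"), Field("location"))
  val ds = DataStream.fromValues(schema, Seq(Vector("sam", "aylesbury"), Vector("ham", "buckingham")))
  ds.to(JsonSink(new Path("people.json")))   // writes {"name":"sam","location":"aylesbury"} etc., one per line
}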
Example 33
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge} import io.eels.component.parquet._ import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{FilePattern, Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object AvroParquetSource { def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) } case class AvroParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { private lazy val paths = pattern.toPaths() def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some) // the schema returned by the parquet source should be a merged version of the // schemas contained in all the files. override def schema: StructType = { val schemas = paths.map { path => using(AvroParquetReaderFn.apply(path, predicate, None)) { reader => val record = Option(reader.read()).getOrElse { sys.error(s"Cannot read $path for schema; file contains no records") } record.getSchema } } val avroSchema = AvroSchemaMerge("record", "namspace", schemas) AvroSchemaFns.fromAvroSchema(avroSchema) } // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths") paths.map { it => new AvroParquetPublisher(it, predicate) } } def footers(): List[Footer] = { logger.debug(s"AvroParquetSource source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
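A reading sketch, assuming a glob of parquet files (the path is invented): the source merges the Avro schemas of all matching files and can report footer-level statistics without reading any rows.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.parquet.avro.AvroParquetSource
object AvroParquetSourceUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = AvroParquetSource(new Path("hdfs:///warehouse/events/*.parquet"))
  println(source.schema)         // merged schema across all matching files
  println(source.statistics())   // row count and byte sizes taken from the parquet footers
}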
Example 34
Source File: AvroParquetRowWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.{Config, ConfigFactory} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} class AvroParquetRowWriter(path: Path, avroSchema: Schema)(implicit fs: FileSystem) extends Logging { private val config: Config = ConfigFactory.load() private val skipCrc = config.getBoolean("eel.parquet.skipCrc") logger.info(s"Parquet writer will skipCrc = $skipCrc") private val writer = AvroParquetWriterFn(path, avroSchema) def write(record: GenericRecord): Unit = { writer.write(record) } def close(): Unit = { writer.close() if (skipCrc) { val crc = new Path("." + path.toString() + ".crc") logger.debug(s"Deleting crc $crc") if (fs.exists(crc)) fs.delete(crc, false) } } }
Example 35
Source File: AvroParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.component.avro.{AvroSchemaFns, RowSerializer} import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} object AvroParquetSink { def apply(path: String)(implicit fs: FileSystem): AvroParquetSink = AvroParquetSink(new Path(path)) } case class AvroParquetSink(path: Path, overwrite: Boolean = false)(implicit fs: FileSystem) extends Sink with Logging { def withOverwrite(overwrite: Boolean): AvroParquetSink = copy(overwrite = overwrite) override def open(schema: StructType): SinkWriter = new SinkWriter { private val config = ConfigFactory.load() private val caseSensitive = config.getBoolean("eel.parquet.caseSensitive") if (overwrite && fs.exists(path)) fs.delete(path, false) private val avroSchema = AvroSchemaFns.toAvroSchema(schema, caseSensitive = caseSensitive) private val writer = new AvroParquetRowWriter(path, avroSchema) private val serializer = new RowSerializer(avroSchema) override def write(row: Row): Unit = { this.synchronized { val record = serializer.serialize(row) writer.write(record) } } override def close(): Unit = { writer.close() } } }
Example 36
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.datastream.Publisher import io.eels.{Predicate, _} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object ParquetSource { def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(string)) def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) } case class ParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None, projection: Seq[String] = Nil, dictionaryFiltering: Boolean = true, caseSensitive: Boolean = true) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { logger.debug(s"Created parquet source with pattern=$pattern") lazy val paths: List[Path] = pattern.toPaths() def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary) def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive) def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some) def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): ParquetSource = { require(fields.nonEmpty) copy(projection = fields.toList) } // returns the metadata in the parquet file, or an empty map if none def metadata(): Map[String, String] = { paths.foldLeft(Map.empty[String, String]) { (metadata, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala } } // todo should take the merged schema from all files lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source")) // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}") paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) } } def footers(): List[Footer] = { logger.debug(s"Parquet source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
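A hedged sketch of the builder-style API (the pattern and column names are invented): projections restrict which columns are read, and statistics() is computed from the parquet footers alone.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.parquet.ParquetSource
object ParquetSourceUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = ParquetSource("hdfs:///warehouse/events/*.pq")
    .withProjection("name", "location")   // read only these columns
    .withCaseSensitivity(false)
  println(source.statistics())
  source.toDataStream().collect.foreach(println)
}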
Example 37
Source File: ParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetWriteOptions(overwrite: Boolean = false, permission: Option[FsPermission] = None, dictionary: Boolean = true, inheritPermissions: Option[Boolean] = None, roundingMode: RoundingMode = RoundingMode.UNNECESSARY, metadata: Map[String, String] = Map.empty) { def withOverwrite(overwrite: Boolean): ParquetWriteOptions = copy(overwrite = overwrite) def withDictionary(dictionary: Boolean): ParquetWriteOptions = copy(dictionary = dictionary) def withMetaData(map: Map[String, String]): ParquetWriteOptions = copy(metadata = map) def withPermission(permission: FsPermission): ParquetWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): ParquetWriteOptions = copy(inheritPermissions = inheritPermissions.some) def withRoundingMode(mode: RoundingMode): ParquetWriteOptions = copy(roundingMode = mode) } case class ParquetSink(path: Path, options: ParquetWriteOptions = ParquetWriteOptions()) (implicit fs: FileSystem) extends Sink with Logging { // -- convenience methods -- def withOverwrite(overwrite: Boolean): ParquetSink = copy(options = options.withOverwrite(overwrite)) def withDictionary(dictionary: Boolean): ParquetSink = copy(options = options.copy(dictionary = dictionary)) def withMetaData(map: Map[String, String]): ParquetSink = copy(options = options.copy(metadata = map)) def withPermission(permission: FsPermission): ParquetSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): ParquetSink = copy(options = options.copy(inheritPermissions = inheritPermissions.some)) def withRoundingMode(mode: RoundingMode): ParquetSink = copy(options = options.copy(roundingMode = mode)) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = RowParquetWriterFn(path, schema, options.metadata, options.dictionary, options.roundingMode, fs.getConf) override def write(row: Row): Unit = { writer.write(row) } override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) } object ParquetSink { def apply(path: String)(implicit fs: FileSystem): ParquetSink = ParquetSink(new Path(path)) }
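A short write-side sketch (path, metadata and sample data are invented); options can be set through ParquetWriteOptions or the convenience with* methods shown above.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.parquet.ParquetSink
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object ParquetSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("name", "location"), Seq(Vector("sam", "aylesbury")))
  ds.to(ParquetSink("people.pq").withOverwrite(true).withMetaData(Map("writtenBy" -> "eel")))
}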
Example 38
Source File: HdfsWatcher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.client.HdfsAdmin import org.apache.hadoop.hdfs.inotify.Event import scala.concurrent.duration._ import scala.util.control.NonFatal class HdfsWatcher(path: Path, callback: FileCallback) (implicit fs: FileSystem, conf: Configuration) extends Logging { private val files = HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toBuffer files.foreach(callback.onStart) private val executor = Executors.newSingleThreadExecutor() private val running = new AtomicBoolean(true) private val interval = 5.seconds private val admin = new HdfsAdmin(path.toUri, conf) private val eventStream = admin.getInotifyEventStream executor.submit(new Runnable { override def run(): Unit = { while (running.get) { try { Thread.sleep(interval.toMillis) val events = eventStream.take for (event <- events.getEvents) { event match { case create: Event.CreateEvent => callback.onCreate(create) case append: Event.AppendEvent => callback.onAppend(append) case rename: Event.RenameEvent => callback.onRename(rename) case close: Event.CloseEvent => callback.onClose(close) case _ => } } } catch { case NonFatal(e) => logger.error("Error while polling fs", e) } } } }) def stop(): Unit = { running.set(false) executor.shutdownNow() } } trait FileCallback { def onStart(path: Path): Unit def onClose(close: Event.CloseEvent): Unit def onRename(rename: Event.RenameEvent): Unit def onAppend(append: Event.AppendEvent): Unit def onCreate(path: Event.CreateEvent): Unit }
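A hedged sketch of wiring up a callback (the watched directory is invented): the watcher first replays existing files through onStart and then polls the HDFS inotify stream for new events.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.inotify.Event
import io.eels.component.hdfs.{FileCallback, HdfsWatcher}
object HdfsWatcherUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val callback = new FileCallback {
    override def onStart(path: Path): Unit = println(s"existing file: $path")
    override def onCreate(create: Event.CreateEvent): Unit = println(s"created: ${create.getPath}")
    override def onAppend(append: Event.AppendEvent): Unit = ()
    override def onRename(rename: Event.RenameEvent): Unit = ()
    override def onClose(close: Event.CloseEvent): Unit = ()
  }
  val watcher = new HdfsWatcher(new Path("/data/incoming"), callback)
  // ... later, when no longer needed
  watcher.stop()
}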
Example 39
Source File: HdfsSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import io.eels.FilePattern import org.apache.hadoop.fs.permission.{AclEntryScope, AclEntryType, FsAction, FsPermission, AclEntry => HdfsAclEntry} import org.apache.hadoop.fs.{BlockLocation, FileSystem, Path} import scala.collection.JavaConverters._ case class HdfsSource(pattern: FilePattern)(implicit fs: FileSystem) { def permissions(): Vector[(Path, FsPermission)] = pattern.toPaths().map(fs.getFileStatus) .map(status => (status.getPath, status.getPermission)).toVector def setPermissions(permission: FsPermission): Unit = { pattern.toPaths().foreach(fs.setPermission(_, permission)) } def blocks(): Map[Path, Seq[BlockLocation]] = pattern.toPaths().map { path => path -> fs.getFileBlockLocations(path, 0, fs.getFileLinkStatus(path).getLen).toSeq }.toMap def setAcl(spec: AclSpec): Unit = { pattern.toPaths().foreach { path => val hadoopAclEntries = spec.entries.map { entry => val `type` = entry.`type`.toLowerCase match { case "user" => AclEntryType.USER case "group" => AclEntryType.GROUP case "other" => AclEntryType.OTHER } new HdfsAclEntry.Builder().setName(entry.name).setPermission(FsAction.getFsAction(entry.action)).setType(`type`).setScope(AclEntryScope.ACCESS).build() } fs.setAcl(path, hadoopAclEntries.asJava) } } } object HdfsSource { def apply(path: String)(implicit fs: FileSystem): HdfsSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem): HdfsSource = HdfsSource(FilePattern(path)) }
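A small permissions sketch; the pattern and the octal mode are invented for illustration.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.permission.FsPermission
import io.eels.component.hdfs.HdfsSource
object HdfsSourceUsage extends App {
  implicit val fs = FileSystem.get(new Configuration())
  val source = HdfsSource("hdfs:///data/out/*")
  source.permissions().foreach { case (path, perm) => println(s"$path -> $perm") }
  source.setPermissions(new FsPermission("750"))   // apply rwxr-x--- to every matching file
}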
Example 40
Source File: CsvSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import com.univocity.parsers.csv.CsvWriter import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class CsvSink(path: Path, overwrite: Boolean = false, headers: Header = Header.FirstRow, format: CsvFormat = CsvFormat(), ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) (implicit conf: Configuration, fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new CsvSinkWriter(schema, path, headers, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) def withOverwrite(overwrite: Boolean): CsvSink = copy(overwrite = overwrite) def withHeaders(headers: Header): CsvSink = copy(headers = headers) def withIgnoreLeadingWhitespaces(ignoreLeadingWhitespaces: Boolean): CsvSink = copy(ignoreLeadingWhitespaces = ignoreLeadingWhitespaces) def withIgnoreTrailingWhitespaces(ignoreTrailingWhitespaces: Boolean): CsvSink = copy(ignoreTrailingWhitespaces = ignoreTrailingWhitespaces) def withFormat(format: CsvFormat): CsvSink = copy(format = format) class CsvSinkWriter(schema: StructType, path: Path, headers: Header, format: CsvFormat, ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) extends SinkWriter { private val lock = new AnyRef {} if (overwrite && fs.exists(path)) fs.delete(path, false) import scala.collection.JavaConverters._ private lazy val writer: CsvWriter = { val output = fs.create(path) val writer = CsvSupport.createWriter(output, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) headers match { case Header.FirstComment => writer.commentRow(schema.fieldNames().mkString(format.delimiter.toString())) case Header.FirstRow => writer.writeHeaders(schema.fieldNames().asJava) case _ => } writer } override def close(): Unit = writer.close() override def write(row: Row): Unit = { lock.synchronized { // nulls should be written as empty strings val array = row.values.map { case null => "" case other => other.toString } writer.writeRow(array: _*) } } } } object CsvSink { def apply(path: java.nio.file.Path) (implicit conf: Configuration, fs: FileSystem): CsvSink = CsvSink(new Path(path.toString)) }
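Typical usage, sketched with invented data and file name; the header mode and CSV format are tuned through the copy-style with* methods.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.csv.{CsvSink, Header}
import io.eels.datastream.DataStream
import io.eels.schema.StructType
object CsvSinkUsage extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val ds = DataStream.fromValues(StructType("a", "b", "c"), Seq(Vector("1", "2", "3")))
  ds.to(CsvSink(new Path("out.csv")).withHeaders(Header.FirstRow))
}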
Example 41
Source File: ReadParquetEEL.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.sql.Timestamp import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, DecimalType, Field, IntType, Precision, Scale, StringType, StructType, TimestampMillisType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object ReadParquetEEL extends App { def readParquet(path: Path): Unit = { implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val rows = ParquetSource(path).toDataStream().collect rows.foreach(row => println(row)) } val parquetFilePath = new Path("file:///home/sam/development/person2.parquet") implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val friendStruct = Field.createStructField("FRIEND", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed) ) ) val personDetailsStruct = Field.createStructField("PERSON_DETAILS", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed), Field("SALARY", DecimalType(Precision(38), Scale(5))), Field("CREATION_TIME", TimestampMillisType) ) ) val friendType = StructType(friendStruct) val schema = StructType(personDetailsStruct, Field("FRIENDS", ArrayType(friendType), nullable = false)) val friends = Vector( Vector(Vector("John", 25)), Vector(Vector("Adam", 26)), Vector(Vector("Steven", 27)) ) val rows = Vector( Vector(Vector("Fred", 50, BigDecimal("50000.99000"), new Timestamp(System.currentTimeMillis())), friends) ) try { DataStream.fromValues(schema, rows).to(ParquetSink(parquetFilePath).withOverwrite(true)) } catch { case e: Exception => e.printStackTrace() } try { readParquet(parquetFilePath) } catch { case e: Exception => e.printStackTrace() } }
Example 42
Source File: FilePatternTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class FilePatternTest extends WordSpec with Matchers { implicit val fs = FileSystem.get(new Configuration()) "FilePattern" should { "detect single hdfs path without name server" ignore { FilePattern("hdfs:///mypath").toPaths() shouldBe List(new Path("hdfs:///mypath")) } "detect single hdfs path with name server" ignore { FilePattern("hdfs://nameserver/mypath").toPaths() shouldBe List(new Path("hdfs://nameserver/mypath")) } "detect absolute local file" in { FilePattern("file:///absolute/file").toPaths() shouldBe List(new Path("file:///absolute/file")) } "detect relative local file" in { FilePattern("file:///local/file").toPaths() shouldBe List(new Path("file:///local/file")) } "detect relative local file expansion" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } //not working on windows "detect relative local file expansion with schema" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } "use filter if supplied" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } files.foreach { it => Files.createFile(it) } val a = FilePattern(dir.toAbsolutePath().toString() + "/*") .withFilter(_.toString().endsWith("a")) .toPaths.toSet a shouldBe Set(new Path("file:///" + dir.resolve("a"))) files.foreach { it => Files.deleteIfExists(it) } Files.deleteIfExists(dir) } } }
Example 43
Source File: ListenerTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.util.concurrent.{CountDownLatch, TimeUnit} import io.eels.component.csv.{CsvSink, CsvSource} import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} import scala.util.Random class ListenerTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.get(conf) val schema = StructType("a", "b", "c", "d", "e") val rows = List.fill(1000)(Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(10))) val ds = DataStream.fromRows(schema, rows) val path = new Path("listener_test.csv") "DataStream" should { "support user's listeners" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.listener(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(CsvSink(path)) latch.await(20, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "propagate errors in listeners" in { class TestSink extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { override def close(): Unit = () override def write(row: Row): Unit = () } } try { ds.listener(new Listener { override def onNext(value: Row): Unit = sys.error("boom") override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(new TestSink) assert(false) } catch { case _: Throwable => } } } "Source.toDataStream" should { "call on next for each row" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "call on complete once finished" in { val latch = new CountDownLatch(1001) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = latch.countDown() }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } } }
Example 44
Source File: AvroSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.nio.file.Paths import com.typesafe.config.ConfigFactory import io.eels.schema.{Field, StructType} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class AvroSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroSource" should { "read schema" in { val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath) people.schema shouldBe StructType(Field("name", nullable = false), Field("job", nullable = false), Field("location", nullable = false)) } "read strings as java.lang.String when eel.avro.java.string is true" in { System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() } "read strings as utf8 when eel.avro.java.string is false" in { System.setProperty("eel.avro.java.string", "false") ConfigFactory.invalidateCaches() val people = AvroSource(Paths.get(getClass.getResource("/test.avro").toURI).toAbsolutePath).toDataStream().toSet people.map(_.values) shouldBe Set( List(new Utf8("clint eastwood"), new Utf8("actor"), new Utf8("carmel")), List(new Utf8("elton john"), new Utf8("musician"), new Utf8("pinner")), List(new Utf8("issac newton"), new Utf8("scientist"), new Utf8("heaven")) ) System.setProperty("eel.avro.java.string", "true") ConfigFactory.invalidateCaches() } } }
Example 45
Source File: AvroSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, Field, MapType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroSinkTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val ds = DataStream.fromValues( StructType("name", "job", "location"), Seq( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) ) "AvroSink" should { "write to avro" in { val path = new Path("avro.test") fs.delete(path, false) ds.to(AvroSink(path)) fs.delete(path, false) } "support overwrite option" in { val path = new Path("overwrite_test", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) ds.to(AvroSink(path).withOverwrite(true)) fs.delete(path, false) } "write lists and maps" in { val ds = DataStream.fromValues( StructType( Field("name"), Field("movies", ArrayType(StringType)), Field("characters", MapType(StringType, StringType)) ), Seq( List( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) val path = new Path("array_map_avro", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) AvroSource(path).toDataStream().collect shouldBe Seq( Row( ds.schema, Seq( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) fs.delete(path, true) } } }
Example 46
Source File: JsonSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import io.eels.datastream.DataStream import io.eels.schema.{Field, StructType} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class JsonSinkTest extends WordSpec with Matchers { val path = new Path("test.json") implicit val fs: FileSystem = FileSystem.get(new Configuration()) "JsonSink" should { "write multiple json docs to a file" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("location")) val ds = DataStream.fromValues( schema, Seq( Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) ds.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input should include("""{"name":"sam","location":"aylesbury"}""") input should include("""{"name":"jam","location":"aylesbury"}""") input should include("""{"name":"ham","location":"buckingham"}""") fs.delete(path, false) } "support arrays" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("skills")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Array("karate", "kung fu"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","skills":["karate","kung fu"]}""" fs.delete(path, false) } "support maps" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Map("home" -> "boro", "work" -> "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } "support structs" in { case class Foo(home: String, work: String) if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Foo("boro", "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } } }
Example 47
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} import org.scalatest.{Matchers, WordSpec} class SequenceSinkTest extends WordSpec with Matchers { private val ds = DataStream.fromValues( StructType("a", "b", "c", "d"), Seq( List("1", "2", "3", "4"), List("5", "6", "7", "8") ) ) "SequenceSink" should { "write sequence files" in { implicit val conf = new Configuration implicit val fs = FileSystem.get(conf) val path = new Path("seqsink.seq") if (fs.exists(path)) fs.delete(path, true) ds.to(SequenceSink(path)) val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path)) val k = new IntWritable val v = new BytesWritable val set = for (_ <- 1 to 3) yield { reader.next(k, v) new String(v.copyBytes) } set.toSet shouldBe Set( "a,b,c,d", "1,2,3,4", "5,6,7,8" ) reader.close() fs.delete(path, true) } } }
Example 48
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.{File, FilenameFilter} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} class ParquetProjectionTest extends FlatSpec with Matchers { cleanUpResidualParquetTestFiles private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val file = new File(s"test_${System.currentTimeMillis()}.pq") file.deleteOnExit() private val path = new Path(file.toURI) if (fs.exists(path)) fs.delete(path, false) ds.to(ParquetSink(path).withOverwrite(true)) "ParquetSource" should "support projections" in { val rows = ParquetSource(path).withProjection("name").toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john")) } it should "return all data when no projection is set" in { val rows = ParquetSource(path).toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner")) } private def cleanUpResidualParquetTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".pq")) || (name.startsWith(".test_") && name.endsWith(".pq.crc")) } }).foreach(_.delete()) } }
Example 49
Source File: AvroAndParquetCrossCompatibilityTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} // tests that avro source/sink and avro parquet source/sink can write/read each others files class AvroAndParquetCrossCompatibilityTest extends FlatSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroParquetSource and ParquetSource" should "be compatible" in { val path = new Path("cross.pq") if (fs.exists(path)) fs.delete(path, false) val structType = StructType( Field("name", StringType, nullable = false), Field("location", StringType, nullable = false) ) val ds = DataStream.fromValues( structType, Seq( Vector("clint eastwood", "carmel"), Vector("elton john", "pinner") ) ) ds.to(ParquetSink(path)) AvroParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) ds.to(AvroParquetSink(path)) ParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) } }
Example 50
Source File: ParquetSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetSpeedTest extends App with Timed { ParquetLogMute() val size = 2000000 val schema = StructType("a", "b", "c", "d", "e") val createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val path = new Path("parquet_speed.pq") fs.delete(path, false) new File(path.toString).deleteOnExit() timed("Insertion") { ds.to(AvroParquetSink(path).withOverwrite(true)) } while (true) { timed("Reading with ParquetSource") { val actual = ParquetSource(path).toDataStream().size assert(actual == size) } println("") println("---------") println("") Thread.sleep(2000) timed("Reading with AvroParquetSource") { val actual = AvroParquetSource(path).toDataStream().size assert(actual == size) } } }
Example 51
Source File: ParquetMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetMultipleFileSpeedTest extends App with Timed { ParquetLogMute() val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("parquet-speed-test") new File(dir.toString).mkdirs() new File(dir.toString).listFiles().foreach(_.delete) timed("Insertion") { val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size)) ds.to(ParquetSink(new Path("parquet-speed-test/parquet_speed.pq")), count) } for (_ <- 1 to 25) { assert(count == FilePattern("parquet-speed-test/*").toPaths().size) timed("Reading with ParquetSource") { val actual = ParquetSource("parquet-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 52
Source File: AvroParquetSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroParquetSinkTest extends WordSpec with Matchers { ParquetLogMute() private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path("test.pq") "ParquetSink" should { "write schema" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) val people = ParquetSource(path) people.schema shouldBe StructType( Field("name", StringType, false), Field("job", StringType, false), Field("location", StringType, false) ) fs.delete(path, false) } "write data" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) AvroParquetSource(path).toDataStream().toSet.map(_.values) shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) fs.delete(path, false) } "support overwrite" in { val path = new Path("overwrite_test.pq") fs.delete(path, false) val schema = StructType(Field("a", StringType)) val ds = DataStream.fromRows(schema, Row(schema, Vector("x")), Row(schema, Vector("y")) ) ds.to(AvroParquetSink(path)) ds.to(AvroParquetSink(path).withOverwrite(true)) fs.delete(path, false) } } }
Example 53
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 54
Source File: CsvSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import java.nio.file.Paths import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.scalatest.{Matchers, WordSpec} class CsvSourceTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "CsvSource" should { "read schema" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).schema shouldBe StructType( Field("a", StringType, true), Field("b", StringType, true), Field("c", StringType, true) ) } "support null cell value option as null" in { val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI() val path = Paths.get(file) CsvSource(path).withNullValue(null).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", null, "3")) } "support null cell value replacement value" in { val file = getClass.getResource("/io/eels/component/csv/csvwithempty.csv").toURI() val path = Paths.get(file) CsvSource(path).withNullValue("foo").toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "foo", "3")) } "read from path" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().size shouldBe 3 CsvSource(path).withHeader(Header.None).toDataStream().size shouldBe 4 } "allow specifying manual schema" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) val schema = StructType( Field("test1", StringType, true), Field("test2", StringType, true), Field("test3", StringType, true) ) CsvSource(path).withSchema(schema).toDataStream().schema shouldBe schema } "support reading header" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6")) } "support skipping header" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g"), Vector("1", "2", "3"), Vector("4", "5", "6")) } "support delimiters" in { val file = getClass.getResource("/io/eels/component/csv/psv.psv").toURI() val path = Paths.get(file) CsvSource(path).withDelimiter('|').toDataStream().collect.map(_.values).toSet shouldBe Set(Vector("e", "f", "g")) CsvSource(path).withDelimiter('|').withHeader(Header.None).toDataStream().toSet.map(_.values) shouldBe Set(Vector("a", "b", "c"), Vector("e", "f", "g")) } "support comments for headers" in { val file = getClass.getResource("/io/eels/component/csv/comments.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType( Field("a", StringType, true), Field("b", StringType, true), Field("c", StringType, true) ) CsvSource(path).withHeader(Header.FirstComment).toDataStream().toSet.map(_.values) shouldBe Set(Vector("1", "2", "3"), Vector("e", "f", "g"), Vector("4", "5", "6")) } "terminate if asking for first comment but no comments" in { val file = getClass.getResource("/io/eels/component/csv/csvtest.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstComment).schema shouldBe StructType( 
Field("", StringType, true) ) } "support skipping corrupt rows" ignore { val file = getClass.getResource("/io/eels/component/csv/corrupt.csv").toURI() val path = Paths.get(file) CsvSource(path).withHeader(Header.FirstRow).toDataStream().toVector.map(_.values) shouldBe Vector(Vector("1", "2", "3")) } } }
Example 55
Source File: Main.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object Main extends App { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf // the first parameter determines the command to run, just like in git, eg git pull, or in hadoop, eg hadoop fs val command = args.head val params = args.tail command match { case "schema" => ShowSchemaMain(params) case "stream" => StreamMain(params) case "apply-spec" => ApplySpecMain(params) case "fetch-spec" => FetchSpecMain(params) case "analyze" => AnalyzeMain(params) case other => System.err.println(s"Unknown command $other") } } case class Options(from: String = "", to: String = "", workerThreads: Int = 1, sourceIOThreads: Int = 1)
Example 56
Source File: FetchSpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object FetchSpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel fetch-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => val spec = hive.spec val json = HiveSpec.writeAsJson(spec.copy(tables = spec.tables.filter(_.tableName == hive.tableName))) println(json) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null) }
Example 57
Source File: ApplySpecMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import java.nio.file.{Path, Paths} import io.eels.{Constants, SourceParser} import io.eels.component.hive.{HiveOps, HiveSource, HiveSpec} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient object ApplySpecMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf implicit val client = new HiveMetaStoreClient(hiveConf) def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel apply-spec", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[String]("spec") required() action { (schema, o) => o.copy(specPath = Paths.get(schema)) } text "specify path to eel spec" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() source match { case hive: HiveSource => HiveOps.applySpec(HiveSpec(options.specPath), false) case _ => sys.error(s"Unsupported source $source") } case _ => } } case class Options(source: String = null, specPath: Path = null) }
Example 58
Source File: ShowSchemaMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import io.eels.component.avro.AvroSchemaFn import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object ShowSchemaMain { implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel schema", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(source = source) } text "specify source, eg hive:database:table or parquet:/path/to/file" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val source = builder() val schema = source.schema val avroSchema = AvroSchemaFn.toAvro(schema) out.println(avroSchema) case _ => } } case class Options(source: String = "") }
Example 59
Source File: AnalyzeMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import java.io.PrintStream import io.eels.{Constants, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object AnalyzeMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String], out: PrintStream = System.out): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel analyze", Constants.EelVersion) opt[String]("dataset") required() action { (source, o) => o.copy(source = source) } text "specify dataset, eg hive:database:table" opt[Boolean]("reverse") optional() action { (reverse, o) => o.copy(reverse = reverse) } text "specify reverse ordering of columns, eg most distinct first" } parser.parse(args, Options()) match { case Some(options) => val builder = SourceParser(options.source).getOrElse(sys.error(s"Unsupported source ${options.source}")) val result = builder().counts.toSeq.sortBy(_._2.size) val orderedResults = if (options.reverse) result.reverse else result for ((columnName, columnCounts) <- orderedResults) { println(columnName) for ((value, counts) <- columnCounts) { println(s"\t$value ($counts)") } } case _ => } } case class Options(source: String = null, reverse: Boolean = false) }
Example 60
Source File: StreamMain.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.cli import io.eels.{Constants, Sink, SinkParser, SourceParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf object StreamMain { import scala.concurrent.ExecutionContext.Implicits.global implicit val fs = FileSystem.get(new Configuration) implicit val hiveConf = new HiveConf def apply(args: Seq[String]): Unit = { val parser = new scopt.OptionParser[Options]("eel") { head("eel", Constants.EelVersion) opt[String]("source") required() action { (source, o) => o.copy(from = source) } text "specify source, eg hive:database:table" opt[String]("sink") required() action { (sink, o) => o.copy(to = sink) } text "specify sink, eg hive:database:table" opt[Int]("sourceThreads") optional() action { (threads, options) => options.copy(sourceIOThreads = threads) } text "number of source io threads, defaults to 1" opt[Int]("workerThreads") optional() action { (threads, options) => options.copy(workerThreads = threads) } text "number of worker threads, defaults to 1" } parser.parse(args, Options()) match { case Some(options) => val sourceBuilder = SourceParser(options.from).orNull val source = sourceBuilder() val sinkBuilder = SinkParser(options.to).orNull val sink = sinkBuilder() val result = source.toFrame(options.sourceIOThreads).to(sink) println(s"Completed with $result rows") case _ => } } }
Example 61
Source File: HivePartitionScanner.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.{Config, ConfigFactory} import io.eels.component.hive.partition.PartitionMetaData import io.eels.schema.PartitionConstraint import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} // scans partitions for files, returning the files and the meta data object for each partition class HivePartitionScanner(implicit fs: FileSystem) extends Logging { private val config: Config = ConfigFactory.load() private val missingPartitionAction: String = config.getString("eel.hive.source.missingPartitionAction") def scan(partitions: Seq[PartitionMetaData], constraints: Seq[PartitionConstraint] = Nil): Map[PartitionMetaData, Seq[LocatedFileStatus]] = { logger.debug(s"Scanning ${partitions.size} partitions for applicable files ${partitions.map(_.location).mkString(", ").take(100)}") // first we filter out any partitions not matching the constraints val filteredPartitions = partitions.filter { meta => constraints.forall(_.eval(meta.partition)) } logger.debug(s"Filtered partitions: ${filteredPartitions.map(_.location).mkString(", ")})") // next, we check that the directories that the partitions point to actually exist // this will avoid a situation where a location exists in the metastore but not on disk val exantPartitions = filteredPartitions.filter { partition => if (fs.exists(partition.location)) { true } else { if (missingPartitionAction == "error") { throw new IllegalStateException(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these exceptions set eel.hive.source.missingPartitionAction=warn or eel.hive.source.missingPartitionAction=none") } else if (missingPartitionAction == "warn") { logger.warn(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these warnings set eel.hive.source.missingPartitionAction=none") false } else { false } } } // next we grab all the data files from each of these partitions exantPartitions.map { meta => meta -> HiveFileScanner(meta.location, false) }.toMap } }
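A sketch of how the scanner might be driven once partition metadata has been loaded (for example via HiveOps.partitionsMetaData, as HiveTableFilesFn does below). The empty partition list is a placeholder standing in for real metastore data.
import io.eels.component.hive.HivePartitionScanner
import io.eels.component.hive.partition.PartitionMetaData
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object PartitionScanExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  val partitions: Seq[PartitionMetaData] = Nil // placeholder: fetch from the metastore in real code
  val filesByPartition = new HivePartitionScanner().scan(partitions)
  filesByPartition.foreach { case (meta, files) =>
    println(s"${meta.location} -> ${files.size} files")
  }
}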
Example 62
Source File: HiveFileScanner.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.util.HdfsIterator import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} // given a hadoop path, will look for files inside that path that match the // configured settings for hidden files // does not return directories object HiveFileScanner extends Logging { private val config = ConfigFactory.load() private val ignoreHiddenFiles = config.getBoolean("eel.hive.source.ignoreHiddenFiles") private val hiddenFilePattern = config.getString("eel.hive.source.hiddenFilePattern") // returns true if the given file should be considered based on the config settings private def skip(file: LocatedFileStatus): Boolean = { file.getLen == 0L || ignoreHiddenFiles && file.getPath.getName.matches(hiddenFilePattern) } def apply(path: Path, recursive: Boolean)(implicit fs: FileSystem): Seq[LocatedFileStatus] = { logger.debug(s"Scanning $path, filtering=$ignoreHiddenFiles, pattern=$hiddenFilePattern") val files: List[LocatedFileStatus] = if (fs.exists(path)) { val files = fs.listFiles(path, recursive) HdfsIterator.remote(files) .filter(_.isFile) .filterNot(skip) .toList } else { Nil } logger.debug(s"Scanner found ${files.size} files") files } }
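A minimal sketch of scanning a directory directly; the warehouse path is hypothetical, and an implicit FileSystem must be in scope, as the apply method requires.
import io.eels.component.hive.HiveFileScanner
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FileScanExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  val files = HiveFileScanner(new Path("/user/hive/warehouse/mydb.db/mytable"), recursive = false)
  files.foreach(f => println(s"${f.getPath} (${f.getLen} bytes)"))
}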
Example 63
Source File: HiveTableFilesFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.partition.PartitionMetaData import io.eels.schema.{Partition, PartitionConstraint} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient object HiveTableFilesFn extends Logging { def apply(dbName: String, tableName: String, tableLocation: Path, partitionConstraints: Seq[PartitionConstraint]) (implicit fs: FileSystem, client: IMetaStoreClient): Map[Partition, Seq[LocatedFileStatus]] = { val ops = new HiveOps(client) // when we have no partitions, this will scan just the table folder directly for files def rootScan(): Map[Partition, Seq[LocatedFileStatus]] = { Map(Partition.empty -> HiveFileScanner(tableLocation, false)) } def partitionsScan(partitions: Seq[PartitionMetaData]): Map[Partition, Seq[LocatedFileStatus]] = { new HivePartitionScanner().scan(partitions, partitionConstraints) .map { case (key, value) => key.partition -> value } } // the table may or may not have partitions. // // 1. If we do have partitions then we need to scan the path of each partition // (and each partition may be located anywhere outside of the table root) // // 2. If we do not have partitions then we can simply scan the table root. // we go to the metastore as we need the locations of the partitions not the values val partitions = ops.partitionsMetaData(dbName, tableName) if (partitions.isEmpty && partitionConstraints.nonEmpty) { sys.error("Constraints were used on a table that was not partitioned") } else if (partitions.isEmpty) { logger.debug(s"No partitions for $tableName; performing root table scan") rootScan } else partitionsScan(partitions) } }
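A sketch of resolving every data file for a table with no partition constraints. The database, table and location are made up, and the metastore client is wired up the same way as in the dialect and benchmark examples further down.
import io.eels.component.hive.HiveTableFilesFn
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object TableFilesExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())
  val tableLocation = new Path("/user/hive/warehouse/mydb.db/mytable") // hypothetical
  val filesByPartition = HiveTableFilesFn("mydb", "mytable", tableLocation, Nil)
  filesByPartition.foreach { case (partition, statuses) =>
    println(s"$partition: ${statuses.size} files")
  }
}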
Example 64
Source File: HiveFilePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.io.Using import io.eels.datastream.{Subscription, Publisher, Subscriber} import io.eels.schema.{Partition, StructType} import io.eels.{Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus} class HiveFilePublisher(dialect: HiveDialect, file: LocatedFileStatus, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate], partition: Partition) (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using { require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, s"Use only lower case field names with hive") override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap // the schema we send to the dialect must have any partition fields removed, because those // fields won't exist in the data files. This is because partitions are not always written // and instead inferred from the partition itself. val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name)) val projectionWithoutPartitions = StructType(projectionFields) // since we removed the partition fields from the target schema, we must repopulate them after the read // we also need to throw away the dummy field if we had an empty schema val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate) publisher.subscribe(new Subscriber[Seq[Row]] { override def subscribed(s: Subscription): Unit = subscriber.subscribed(s) override def next(chunk: Seq[Row]): Unit = { val aligned = chunk.map { row => if (projectionFields.isEmpty) { val values = projectionSchema.fieldNames().map(partitionMap.apply) Row(projectionSchema, values.toVector) } else { RowUtils.rowAlign(row, projectionSchema, partitionMap) } } subscriber.next(aligned) } override def completed(): Unit = subscriber.completed() override def error(t: Throwable): Unit = subscriber.error(t) }) } }
Example 65
Source File: DynamicPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import io.eels.util.HdfsMkdir import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient class DynamicPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { def createPartition: Path = this.synchronized { val ops = new HiveOps(client) ops.partitionMetaData(dbName, tableName, partition) match { case Some(meta) => meta.location case _ => val tableLocation = ops.tablePath(dbName, tableName) val partitionPath = new Path(tableLocation, partition.unquoted) ops.createPartitionIfNotExists(dbName, tableName, partition, partitionPath) HdfsMkdir(partitionPath, inheritPermissions) partitionPath } } cache.getOrElseUpdate(partition, createPartition) } }
Example 66
Source File: StaticPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import com.sksamuel.exts.OptionImplicits._ object StaticPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { cache.getOrElseUpdate(partition, { val ops = new HiveOps(client) val meta = ops.partitionMetaData(dbName, tableName, partition).getOrError(s"Unknown partition $partition") meta.location }) } }
Example 67
Source File: HiveStats.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.PartitionConstraint import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import scala.collection.JavaConverters._ trait HiveStats { // total number of records def count: Long = count(Nil) // total number of records in the partitions that match the constraints def count(constraints: Seq[PartitionConstraint]): Long // returns the minimum value of this field def min(field: String): Any = min(field, Nil) // returns the maximum value of this field def max(field: String): Any = max(field, Nil) // returns the minimum value of this field for the partitions that match the constraints def min(field: String, constraints: Seq[PartitionConstraint]): Any // returns the maximum value of this field for the partitions that match the constraints def max(field: String, constraints: Seq[PartitionConstraint]): Any } class ParquetHiveStats(dbName: String, tableName: String, table: HiveTable) (implicit fs: FileSystem, conf: Configuration, client: IMetaStoreClient) extends HiveStats with Logging { private val ops = new HiveOps(client) private def count(path: Path) = { val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala blocks.map(_.getRowCount).sum } override def count(constraints: Seq[PartitionConstraint]): Long = { val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints) .flatMap(_._2) .map(_.getPath).map(count) if (counts.isEmpty) 0 else counts.sum } private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = { def stats[T]: (Any, Any) = { def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b } def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b } val location = new Path(ops.location(dbName, tableName)) val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) => logger.debug(s"Calculating min,max in file $files") files.flatMap { file => val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.map { block => val column = block.getColumns.asScala.find(_.getPath.toDotString == field).getOrError(s"Unknown column $field") val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]] val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]] (min, max) } } }.unzip (min(mins), max(maxes)) } stats[Any] } override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1 override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2 }
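The trait is easiest to read through its call sites. This is an API sketch only: obtaining a concrete ParquetHiveStats needs a HiveTable plus filesystem and metastore wiring that is left out here, and the field name is hypothetical.
import io.eels.component.hive.HiveStats

object StatsSketch {
  def report(stats: HiveStats): Unit = {
    println(s"rows = ${stats.count}")              // across all partitions
    println(s"min  = ${stats.min("event_date")}")  // hypothetical field
    println(s"max  = ${stats.max("event_date")}")
  }
}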
Example 68
Source File: ParquetHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import java.util.concurrent.atomic.AtomicInteger import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream} import io.eels.component.parquet._ import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute} import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat} import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using { override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName override def input(path: Path, ignore: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { val client = new HiveMetaStoreClient(new HiveConf) val ops = new HiveOps(client) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { // convert the eel projection schema into a parquet schema which will be used by the native parquet reader try { val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema) using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader => val subscription = new Subscription { override def cancel(): Unit = reader.close() } subscriber.subscribed(subscription) ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path new HiveOutputStream { ParquetLogMute() private val _records = new AtomicInteger(0) logger.debug(s"Creating parquet writer at $path") private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf) override def write(row: Row) { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) _records.incrementAndGet() } override def close(): Unit = { logger.debug(s"Closing hive parquet writer $path") writer.close() // after the files are closed, we should set permissions if we've been asked to, this allows // all the files we create to stay consistent permission.foreach(fs.setPermission(path, _)) } override def records: Int = _records.get() override def path: Path = path_x } } }
Example 69
Source File: OrcHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import com.sksamuel.exts.Logging import io.eels.component.hive.{HiveDialect, HiveOutputStream} import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter} import io.eels.datastream.{Publisher, Subscriber} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde} import scala.math.BigDecimal.RoundingMode.RoundingMode case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging { override val serde: String = classOf[OrcSerde].getCanonicalName override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName override def input(path: Path, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber) } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path val writer = new OrcWriter(path, schema, options) new HiveOutputStream { override def write(row: Row): Unit = { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) } override def close(): Unit = { writer.close() permission.foreach(fs.setPermission(path, _)) } override def records: Int = writer.records override def path: Path = path_x } } }
Example 70
Source File: HiveDatabase.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.collection.JavaConverters._ class HiveContext(implicit fs: FileSystem, client: IMetaStoreClient) { def databases: Seq[HiveDatabase] = client.getAllDatabases.asScala.map { dbName => HiveDatabase(dbName) } } case class HiveDatabase(dbName: String)(implicit fs: FileSystem, client: IMetaStoreClient) { def tables(): List[HiveSource] = { val tables = client.getAllTables(dbName).asScala tables.map { it => HiveSource(dbName, it) }.toList } def table(tableName: String): HiveSource = { val exists = client.tableExists(dbName, tableName) if (!exists) throw new IllegalArgumentException(s"$dbName.$tableName does not exist") HiveSource(dbName, tableName) } def location: String = client.getDatabase(dbName).getLocationUri }
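A short sketch of browsing the metastore through these wrappers; the database and table names are hypothetical.
import io.eels.component.hive.{HiveContext, HiveDatabase}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object MetastoreBrowseExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())
  new HiveContext().databases.foreach(db => println(db.dbName))
  val orders = HiveDatabase("sales_db").table("orders") // throws if the table does not exist
  println(orders)
}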
Example 71
Source File: HiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect} import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.api.Table import scala.math.BigDecimal.RoundingMode.RoundingMode trait HiveDialect extends Logging { def serde: String def inputFormat: String def outputFormat: String def output(schema: StructType, // schema without partition information path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException } object HiveDialect extends Logging { def apply(format: String): HiveDialect = format match { case input if input.contains("ParquetInputFormat") => ParquetHiveDialect() case input if input.contains("OrcInputFormat") => OrcHiveDialect() //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect // "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format") } def apply(table: Table): HiveDialect = { val format = table.getSd.getInputFormat logger.debug(s"Table format is $format") val dialect = HiveDialect(format) logger.debug(s"HiveDialect is $dialect") dialect } }
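A sketch of the two factory entry points: resolving a dialect from a metastore Table, and from a raw input-format class name. The database and table are hypothetical.
import io.eels.component.hive.HiveDialect
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat

object DialectExample extends App {
  val client = new HiveMetaStoreClient(new HiveConf())
  val table = client.getTable("mydb", "mytable")       // hypothetical table
  val fromTable = HiveDialect(table)                   // inspects table.getSd.getInputFormat
  val fromFormat = HiveDialect(classOf[MapredParquetInputFormat].getCanonicalName)
  println(s"$fromTable / $fromFormat")
}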
Example 72
Source File: HivePartitionPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.datastream.{Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.control.NonFatal class HivePartitionPublisher(dbName: String, tableName: String, projectionSchema: StructType, partitionKeys: List[String], // partition keys for this table, used to map the partition values back to a map dialect: HiveDialect // used to open up the files to check they exist if checkDataForPartitionOnlySources is true ) (implicit fs: FileSystem, client: IMetaStoreClient) extends Publisher[Seq[Row]] with Logging { private val config = ConfigFactory.load() // if this is true, then we will still check that some files exist for each partition, to avoid // a situation where the partitions have been created in the hive metastore, but no actual // data has been written using those yet. private val partitionPartFileCheck = config.getBoolean("eel.hive.source.checkDataForPartitionOnlySources") logger.info(s"eel.hive.source.checkDataForPartitionOnlySources=$partitionPartFileCheck") // returns true if the partition exists on disk private def isPartitionPhysical(part: org.apache.hadoop.hive.metastore.api.Partition): Boolean = { val location = new Path(part.getSd.getLocation) logger.debug(s"Checking that partition $location has been created on disk...") try { val exists = fs.exists(location) if (exists) { logger.debug("...exists") } else { logger.debug("...not found") } exists } catch { case NonFatal(e) => logger.warn(s"Error reading $location", e) false } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = client.synchronized { try { import scala.collection.JavaConverters._ // each row will contain just the values from the metastore val rows = client.listPartitions(dbName, tableName, Short.MaxValue).asScala.filter { part => !partitionPartFileCheck || isPartitionPhysical(part) }.map { part => // the partition values are assumed to be the same order as the supplied partition keys // first we build a map of the keys to values, then use that map to return a Row with // values in the order set by the fieldNames parameter val map = partitionKeys.zip(part.getValues.asScala).toMap Row(projectionSchema, projectionSchema.fieldNames.map(map(_)).toVector) } logger.debug(s"After scanning partitions and files we have ${rows.size} rows") subscriber.subscribed(Subscription.empty) rows.iterator.grouped(10).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 73
Source File: ParquetVsOrcSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.io.File import java.math.MathContext import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.orc.{OrcSink, OrcSource} import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.util.Random object ParquetVsOrcSpeedTest extends App with Timed { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val size = 5000000 val structType = StructType( Field("name", StringType), Field("age", IntType.Signed), Field("height", DoubleType), Field("amazing", BooleanType), Field("fans", LongType.Signed), Field("rating", DecimalType(4, 2)) ) def iter: Iterator[Vector[Any]] = Iterator.continually(Vector( Random.nextString(10), Random.nextInt(), Random.nextDouble(), Random.nextBoolean(), Random.nextLong(), BigDecimal(Random.nextDouble(), new MathContext(4)).setScale(2, RoundingMode.UP) )) def ds: DataStream = DataStream.fromIterator(structType, iter.take(size).map(Row(structType, _))) val ppath = new Path("parquet_speed.pq") fs.delete(ppath, false) val opath = new Path("orc_speed.orc") fs.delete(opath, false) new File(ppath.toString).deleteOnExit() new File(opath.toString).deleteOnExit() timed("Orc Insertion") { ds.to(OrcSink(opath)) } timed("Parquet Insertion") { ds.to(ParquetSink(ppath)) } while (true) { timed("Reading with OrcSource") { val actual = OrcSource(opath).toDataStream().size assert(actual == size, s"$actual != $size") } timed("Reading with ParquetSource") { val actual = ParquetSource(ppath).toDataStream().size assert(actual == size, s"$actual != $size") } } }
Example 74
Source File: HiveBenchmarkApp.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.util.UUID import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import scala.util.Random object HiveBenchmarkApp extends App with Timed { val states = List( "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming").map(_.replace(' ', '_').toLowerCase) import HiveConfig._ val schema = StructType("id", "state") val rows = List.fill(1000000)(List(UUID.randomUUID.toString, states(Random.nextInt(50)))) logger.info(s"Generated ${rows.size} rows") new HiveOps(client).createTable( "sam", "people", schema, List("state"), overwrite = true ) logger.info("Table created") val sink = HiveSink("sam", "people") DataStream.fromValues(schema, rows).to(sink) logger.info("Write complete") while (true) { timed("datastream took") { val result = HiveSource("sam", "people").toDataStream().collect println(result.size) } } }
Example 75
Source File: OrcSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc._ import scala.collection.JavaConverters._ object OrcSource { def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(path)) def apply(str: String)(implicit fs: FileSystem, conf: Configuration): OrcSource = apply(FilePattern(str)) } case class OrcSource(pattern: FilePattern, projection: Seq[String] = Nil, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Using { override def parts(): Seq[Publisher[Seq[Row]]] = pattern.toPaths().map(new OrcPublisher(_, projection, predicate)) def withPredicate(predicate: Predicate): OrcSource = copy(predicate = predicate.some) def withProjection(first: String, rest: String*): OrcSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): OrcSource = { require(fields.nonEmpty) copy(projection = fields.toList) } override def schema: StructType = { val reader = OrcFile.createReader(pattern.toPaths().head, new ReaderOptions(conf)) val schema = reader.getSchema OrcSchemaFns.fromOrcType(schema).asInstanceOf[StructType] } private def reader() = { val options = new ReaderOptions(conf) OrcFile.createReader(pattern.toPaths().head, options) } def count(): Long = reader().getNumberOfRows def statistics(): Seq[ColumnStatistics] = reader().getStatistics.toVector def stripes(): Seq[StripeInformation] = reader().getStripes.asScala def stripeStatistics(): Seq[StripeStatistics] = reader().getStripeStatistics.asScala } class OrcPublisher(path: Path, projection: Seq[String], predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val reader = OrcFile.createReader(path, new ReaderOptions(conf)) val fileSchema = OrcSchemaFns.fromOrcType(reader.getSchema).asInstanceOf[StructType] val iterator: Iterator[Row] = OrcBatchIterator(reader, fileSchema, projection, predicate).flatten val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) iterator.grouped(DataStream.DefaultBatchSize).takeWhile(_ => running.get).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
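A sketch of reading an ORC file with a projection and a predicate; the path and field names are hypothetical, and toDataStream comes from the Source trait, used the same way in OrcPredicateTest further down.
import io.eels.Predicate
import io.eels.component.orc.OrcSource
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcReadExample extends App {
  implicit val conf: Configuration = new Configuration()
  implicit val fs: FileSystem = FileSystem.get(conf)
  val source = OrcSource(new Path("/data/people.orc"))
    .withProjection("name", "age")
    .withPredicate(Predicate.gt("age", 30L))
  println(s"rows in file: ${source.count()}")
  source.toDataStream().collect.foreach(println)
}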
Example 76
Source File: OrcSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.config.ConfigSupport import com.typesafe.config.ConfigFactory import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.orc.OrcFile.{CompressionStrategy, EncodingStrategy} import org.apache.orc.OrcProto.CompressionKind case class OrcWriteOptions(overwrite: Boolean = false, compressionKind: CompressionKind, compressionStrategy: CompressionStrategy, compressionBufferSize: Option[Int], encodingStrategy: Option[EncodingStrategy], bloomFilterColumns: Seq[String] = Nil, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None, rowIndexStride: Option[Int] = None) { def withCompressionKind(kind: CompressionKind): OrcWriteOptions = copy(compressionKind = kind) def withCompressionStrategy(strategy: CompressionStrategy): OrcWriteOptions = copy(compressionStrategy = strategy) def withCompressionBufferSize(size: Int): OrcWriteOptions = copy(compressionBufferSize = size.some) def withEncodingStrategy(strategy: EncodingStrategy): OrcWriteOptions = copy(encodingStrategy = strategy.some) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcWriteOptions = copy(bloomFilterColumns = bloomFilterColumns) def withRowIndexStride(stride: Int): OrcWriteOptions = copy(rowIndexStride = stride.some) def withOverwrite(overwrite: Boolean): OrcWriteOptions = copy(overwrite = overwrite) def withPermission(permission: FsPermission): OrcWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): OrcWriteOptions = copy(inheritPermissions = inheritPermissions.some) } object OrcWriteOptions extends ConfigSupport { // creates a config from the typesafe reference.confs def apply(): OrcWriteOptions = { val config = ConfigFactory.load() OrcWriteOptions( false, CompressionKind valueOf config.getString("eel.orc.writer.compression-kind"), CompressionStrategy valueOf config.getString("eel.orc.writer.compression-strategy"), config.getIntOpt("eel.orc.writer.compression-buffer-size"), config.getStringOpt("eel.orc.writer.encoding-strategy").map(EncodingStrategy.valueOf) ) } } case class OrcSink(path: Path, options: OrcWriteOptions = OrcWriteOptions()) (implicit fs: FileSystem, conf: Configuration) extends Sink with Logging { // -- convenience options -- def withCompressionKind(kind: CompressionKind): OrcSink = copy(options = options.copy(compressionKind = kind)) def withCompressionStrategy(strategy: CompressionStrategy): OrcSink = copy(options = options.copy(compressionStrategy = strategy)) def withCompressionBufferSize(size: Int): OrcSink = copy(options = options.copy(compressionBufferSize = size.some)) def withEncodingStrategy(strategy: EncodingStrategy): OrcSink = copy(options = options.copy(encodingStrategy = strategy.some)) def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcSink = copy(options = options.copy(bloomFilterColumns = bloomFilterColumns)) def withRowIndexStride(stride: Int): OrcSink = copy(options = options.copy(rowIndexStride = stride.some)) def withOverwrite(overwrite: Boolean): OrcSink = copy(options = options.copy(overwrite = overwrite)) def withPermission(permission: FsPermission): OrcSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): OrcSink = 
copy(options = options.copy(inheritPermissions = inheritPermissions.some)) override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = new OrcWriter(path, schema, options) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } }
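A sketch that writes a tiny in-memory DataStream to ORC using a couple of the fluent options, in the same style as the speed tests below; the schema and values are made up.
import io.eels.component.orc.OrcSink
import io.eels.datastream.DataStream
import io.eels.schema.{Field, LongType, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object OrcWriteExample extends App {
  implicit val conf: Configuration = new Configuration()
  implicit val fs: FileSystem = FileSystem.getLocal(conf)
  val schema = StructType(Field("name", StringType), Field("age", LongType.Signed))
  val ds = DataStream.fromValues(schema, Seq(Vector("sam", 37L), Vector("laura", 24L)))
  ds.to(OrcSink(new Path("people.orc")).withOverwrite(true).withRowIndexStride(1000))
}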
Example 77
Source File: OrcMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object OrcMultipleFileSpeedTest extends App with Timed { val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("orc-speed-test") new File(dir.toString).mkdirs() timed("Insertion") { val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) new File(dir.toString).listFiles().foreach(_.delete) ds.to(OrcSink(new Path("orc-speed-test/orc_speed.pq")).withOverwrite(true), count) } for (_ <- 1 to 25) { assert(count == FilePattern("orc-speed-test/*").toPaths().size) timed("Reading with OrcSource") { val actual = OrcSource("orc-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 78
Source File: OrcPredicateTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.io.{File, FilenameFilter} import io.eels.Predicate import io.eels.datastream.DataStream import io.eels.schema.{Field, LongType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} class OrcPredicateTest extends FlatSpec with Matchers with BeforeAndAfterAll { cleanUpResidualOrcTestFiles val schema = StructType( Field("name", StringType, nullable = true), Field("city", StringType, nullable = true), Field("age", LongType.Signed, nullable = true) ) val values = Vector.fill(1000) { Vector("sam", "middlesbrough", 37) } ++ Vector.fill(1000) { Vector("laura", "iowa city", 24) } val ds = DataStream.fromValues(schema, values) implicit val conf = new Configuration() implicit val fs = FileSystem.get(new Configuration()) val path = new Path("test.orc") if (fs.exists(path)) fs.delete(path, false) new File(path.toString).deleteOnExit() ds.to(OrcSink(path).withRowIndexStride(1000)) override protected def afterAll(): Unit = fs.delete(path, false) "OrcSource" should "support string equals predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support gt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.gt("age", 30L)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L)) } it should "support lt predicates" in { conf.set("eel.orc.predicate.row.filter", "false") val rows = OrcSource(path).withPredicate(Predicate.lt("age", 30)).toDataStream().collect rows.map(_.values).toSet shouldBe Set(Vector("laura", "iowa city", 24L)) } it should "enable row level filtering with predicates by default" in { conf.set("eel.orc.predicate.row.filter", "true") val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect rows.head.schema shouldBe schema rows.head.values shouldBe Vector("sam", "middlesbrough", 37L) } private def cleanUpResidualOrcTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".orc")) || (name.startsWith(".test_") && name.endsWith(".orc.crc")) } }).foreach(_.delete()) } }
Example 79
Source File: InputFormatConf.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.io.{ LongWritable, Text, Writable } import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader } import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat } import scala.collection.immutable trait InputFormatConf[K, V] extends Serializable { type IF <: InputFormat[K, V] type Split <: InputSplit with Writable type KExtract <: Extract[K] type VExtract <: Extract[V] def kExtract: KExtract def vExtract: VExtract def makeInputFormat(): IF // I'm unsure if we should WriSer them for them def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]] // TODO do we want to require typing of the RecordReader as well? final def createRecordReader(hadoopConf: Configuration, split: Split, inputFormat: IF = makeInputFormat()): RecordReader[K, V] = { val tac = ConfOnlyTAC(hadoopConf) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) recordReader } } case class TextInputFormatConf(file: String, partitions: Int) extends InputFormatConf[LongWritable, Text] { type IF = TextInputFormat type Split = FileSplit // TODO now that we figured out what's up, see if we can't eliminate the need for this... val internalK = Extract.unit[LongWritable] val internalV = Extract.text type KExtract = internalK.type type VExtract = internalV.type override val kExtract: KExtract = internalK override val vExtract: VExtract = internalV def makeInputFormat() = new TextInputFormat() def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = { val job = Job.getInstance(hadoopConf) FileInputFormat.setInputPaths(job, file) val path = new Path(file) val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen val size_per = math.round(len / partitions.toDouble) ((0 until partitions - 1).map { p => new FileSplit(path, size_per * p, size_per, null) } :+ { val fin = size_per * (partitions - 1) new FileSplit(path, fin, len - fin, null) }).map(WriSer(_)) } } // TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf object CSVInputFormatConf { def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract } = new InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract override val kExtract: KExtract = ifc.kExtract override val vExtract: VExtract = ifc.vExtract override def makeInputFormat() = ifc.makeInputFormat() override def makeSplits(hadoopConf: Configuration) = { val splits = ifc.makeSplits(hadoopConf) splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) => val rr = createRecordReader(hadoopConf, head) require(rr.nextKeyValue, "csv has no header, first line was empty") val afterHeader = rr.getCurrentKey.get require(rr.nextKeyValue, "first split is empty") WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +: splits.tail } } } }
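A sketch of driving the text variant directly: build splits for a (hypothetical) local file and read the first record of the first split. The WriSer extractor is used the same way as in CSVInputFormatConf above; its exact package location is an assumption here.
import org.apache.hadoop.conf.Configuration
import com.twosigma.flint.hadoop.{TextInputFormatConf, WriSer}

object InputFormatConfExample extends App {
  val hadoopConf = new Configuration()
  val ifc = TextInputFormatConf("/tmp/data.txt", partitions = 4) // hypothetical file
  ifc.makeSplits(hadoopConf).headOption.foreach { case WriSer(split) =>
    val rr = ifc.createRecordReader(hadoopConf, split)
    if (rr.nextKeyValue()) println(s"${rr.getCurrentKey} -> ${rr.getCurrentValue}")
  }
}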
Example 80
Source File: package.scala From amadou with Apache License 2.0 | 5 votes |
package com.mediative.amadou import com.google.api.services.bigquery.model._ import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem import com.google.cloud.hadoop.io.bigquery._ import org.apache.hadoop.fs.{FileSystem, Path} import net.ceedubs.ficus.readers.ValueReader import net.ceedubs.ficus.FicusInstances import org.apache.spark.sql.{Dataset, SparkSession, Encoder} import java.util.concurrent.ThreadLocalRandom import scala.collection.JavaConversions._ package object bigquery extends FicusInstances { object CreateDisposition extends Enumeration { val CREATE_IF_NEEDED, CREATE_NEVER = Value } object WriteDisposition extends Enumeration { val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value } val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz" object TableNotFound { import com.google.api.client.googleapis.json.GoogleJsonResponseException import com.google.api.client.googleapis.json.GoogleJsonError import scala.collection.JavaConverters._ def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match { case error: GoogleJsonResponseException => Some(error.getDetails) .filter(_.getCode == 404) .flatMap(_.getErrors.asScala.find(_.getReason == "notFound")) case _ => None } } def tableHasDataForDate( spark: SparkSession, table: TableReference, date: java.sql.Date, column: String): Boolean = { val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration) bq.hasDataForDate(table, date, column) } def saveAsBigQueryTable( tableRef: TableReference, writeDisposition: WriteDisposition.Value, createDisposition: CreateDisposition.Value): Unit = { val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp" self.write.json(gcsPath) val schemaFields = self.schema.fields.map { field => import org.apache.spark.sql.types._ val fieldType = field.dataType match { case BooleanType => "BOOLEAN" case LongType => "INTEGER" case IntegerType => "INTEGER" case StringType => "STRING" case DoubleType => "FLOAT" case TimestampType => "TIMESTAMP" case _: DecimalType => "INTEGER" } new TableFieldSchema().setName(field.name).setType(fieldType) }.toList val tableSchema = new TableSchema().setFields(schemaFields) bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition) delete(new Path(gcsPath)) } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, conf) fs.delete(path, true) () } } implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] = ValueReader[String].map { _ match { case "month" => BigQueryTable.PartitionByMonth case "day" => BigQueryTable.PartitionByDay case other => sys.error(s"Unknown partition strategy") } } }
Example 81
Source File: MNIST.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.examples import java.io.{BufferedWriter, OutputStreamWriter} import com.github.saurfang.spark.tsne.impl._ import com.github.saurfang.spark.tsne.tree.SPTree import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory object MNIST { private def logger = LoggerFactory.getLogger(MNIST.getClass) def main (args: Array[String]) { val conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[SPTree])) val sc = new SparkContext(conf) val hadoopConf = sc.hadoopConfiguration val fs = FileSystem.get(hadoopConf) val dataset = sc.textFile("data/MNIST/mnist.csv.gz") .zipWithIndex() .filter(_._2 < 6000) .sortBy(_._2, true, 60) .map(_._1) .map(_.split(",")) .map(x => (x.head.toInt, x.tail.map(_.toDouble))) .cache() //logInfo(dataset.collect.map(_._2.toList).toList.toString) //val features = dataset.map(x => Vectors.dense(x._2)) //val scaler = new StandardScaler(true, true).fit(features) //val scaledData = scaler.transform(features) // .map(v => Vectors.dense(v.toArray.map(x => if(x.isNaN || x.isInfinite) 0.0 else x))) // .cache() val data = dataset.flatMap(_._2) val mean = data.mean() val std = data.stdev() val scaledData = dataset.map(x => Vectors.dense(x._2.map(v => (v - mean) / std))).cache() val labels = dataset.map(_._1).collect() val matrix = new RowMatrix(scaledData) val pcaMatrix = matrix.multiply(matrix.computePrincipalComponents(50)) pcaMatrix.rows.cache() val costWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(s".tmp/MNIST/cost.txt"), true))) //SimpleTSNE.tsne(pcaMatrix, perplexity = 20, maxIterations = 200) BHTSNE.tsne(pcaMatrix, maxIterations = 500, callback = { //LBFGSTSNE.tsne(pcaMatrix, perplexity = 10, maxNumIterations = 500, numCorrections = 10, convergenceTol = 1e-8) case (i, y, loss) => if(loss.isDefined) logger.info(s"$i iteration finished with loss $loss") val os = fs.create(new Path(s".tmp/MNIST/result${"%05d".format(i)}.csv"), true) val writer = new BufferedWriter(new OutputStreamWriter(os)) try { (0 until y.rows).foreach { row => writer.write(labels(row).toString) writer.write(y(row, ::).inner.toArray.mkString(",", ",", "\n")) } if(loss.isDefined) costWriter.write(loss.get + "\n") } finally { writer.close() } }) costWriter.close() sc.stop() } }
Example 82
Source File: RMCallbackHandler.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn import java.io.File import java.util.{Collections, List} import org.tianlangstudio.data.hamal.core.{Constants, HamalConf} import org.tianlangstudio.data.hamal.core.HamalConf //import java.util.Collections import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path, FileContext} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.{AMRMClient, NMClient} import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import scala.jdk.CollectionConverters._ //import scala.collection.JavaConverters._ /** * Created by zhuhq on 2016/4/29. */ class RMCallbackHandler(nmClient:NMClient,containerCmd:Container => String,hamalConf: HamalConf,yarnConfiguration: Configuration) extends AMRMClientAsync.CallbackHandler { private val logging = org.slf4j.LoggerFactory.getLogger(classOf[RMCallbackHandler]) override def onContainersCompleted(statuses: List[ContainerStatus]): Unit = { for(containerStatus <- statuses.asScala) { logging.info(s"containerId:${containerStatus} exitStatus:${containerStatus}") } } override def onError(e: Throwable): Unit = { logging.error("on error",e) } override def getProgress: Float = { 0 } override def onShutdownRequest(): Unit = { logging.info("on shutdown request") } override def onNodesUpdated(updatedNodes: List[NodeReport]): Unit = { logging.info("on nodes updated") for(nodeReport <- updatedNodes.asScala) { logging.info(s"node id:${nodeReport} node labels:${nodeReport}"); } } override def onContainersAllocated(containers: List[Container]): Unit = { logging.info("on containers allocated"); for (container:Container <- containers.asScala) { try { // Launch container by create ContainerLaunchContext val ctx = Records.newRecord(classOf[ContainerLaunchContext]); //ctx.setCommands(Collections.singletonList(""" echo "begin";sleep 900;echo "end"; """)) ctx.setCommands(Collections.singletonList(containerCmd(container))) val packagePath = hamalConf.getString(Constants.DATAX_EXECUTOR_FILE,"executor.zip"); val archiveStat = FileSystem.get(yarnConfiguration).getFileStatus(new Path(packagePath)) val packageUrl = ConverterUtils.getYarnUrlFromPath( FileContext.getFileContext.makeQualified(new Path(packagePath))); val packageResource = Records.newRecord[LocalResource](classOf[LocalResource]) packageResource.setResource(packageUrl); packageResource.setSize(archiveStat.getLen); packageResource.setTimestamp(archiveStat.getModificationTime); packageResource.setType(LocalResourceType.ARCHIVE); packageResource.setVisibility(LocalResourceVisibility.APPLICATION) ctx.setLocalResources(Collections.singletonMap(Constants.DATAX_EXECUTOR_ARCHIVE_FILE_NAME,packageResource)) logging.info("[AM] Launching container " + container.getId()); nmClient.startContainer(container, ctx); } catch { case ex:Exception => logging.info("[AM] Error launching container " + container.getId() + " " + ex); } } } }
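A sketch of how a handler like this is normally registered with an asynchronous AM-RM client. The NMClient, container command, HamalConf and heartbeat interval are all assumptions standing in for the surrounding ApplicationMaster code.
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.client.api.NMClient
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.tianlangstudio.data.hamal.core.HamalConf
import org.tianlangstudio.data.hamal.yarn.RMCallbackHandler

object AmSketch {
  def register(nmClient: NMClient, hamalConf: HamalConf): AMRMClientAsync[ContainerRequest] = {
    val yarnConf = new YarnConfiguration()
    // the container command is a placeholder; real code would build the executor launch command
    val handler = new RMCallbackHandler(nmClient, _ => "echo hello", hamalConf, yarnConf)
    val amRMClient = AMRMClientAsync.createAMRMClientAsync[ContainerRequest](1000, handler)
    amRMClient.init(yarnConf)
    amRMClient.start()
    amRMClient
  }
}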
Example 83
Source File: GenericMainClass.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark import java.text.SimpleDateFormat import java.util.Date import com.typesafe.config.{Config, ConfigFactory} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ trait GenericMainClass { self: SparkManager => val genericMainClassLogger: Logger = LoggerFactory.getLogger("SparkManager") private def makeFileSystem(session: SparkSession): FileSystem = { if (session.sparkContext.isLocal) { FileSystem.getLocal(session.sparkContext.hadoopConfiguration) } else { FileSystem.get(session.sparkContext.hadoopConfiguration) } } // scalastyle:off private def getGlobalConfig: Config = { genericMainClassLogger.debug("system environment vars") for ((k, v) <- System.getenv().asScala.toSeq.sortBy(_._1)) genericMainClassLogger.debug(s"$k -> $v") genericMainClassLogger.debug("system properties") for ((k, v) <- System.getProperties.asScala.toSeq.sortBy(_._1)) genericMainClassLogger.debug(s"$k -> $v") ConfigFactory.load() } // scalastyle:on }
Example 84
Source File: SchemaManagerSparkApp.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark import java.nio.ByteOrder import com.typesafe.config.{Config, ConfigFactory} import it.agilelab.darwin.app.spark.classes._ import it.agilelab.darwin.manager.AvroSchemaManagerFactory import org.apache.avro.reflect.ReflectData import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} object SchemaManagerSparkApp extends GenericMainClass with SparkManager { val mainLogger: Logger = LoggerFactory.getLogger("SchemaManagerSparkApp") val endianness: ByteOrder = ByteOrder.BIG_ENDIAN override protected def runJob(settings: Config)(implicit fs: FileSystem, sparkSession: SparkSession): Int = { import sparkSession.implicits._ val ds = sparkSession.createDataset(sparkSession.sparkContext.parallelize(1 to 1000, 20)) mainLogger.info("Registering schemas") // val reflections = new Reflections("it.agilelab.darwin.app.spark.classes") // val annotationClass: Class[AvroSerde] = classOf[AvroSerde] // val classes = reflections.getTypesAnnotatedWith(annotationClass).asScala.toSeq // .filter(c => !c.isInterface && !Modifier.isAbstract(c.getModifiers)) // val schemas = classes.map(c => ReflectData.get().getSchema(Class.forName(c.getName))) val schemas = Seq(ReflectData.get().getSchema(classOf[Menu]), ReflectData.get().getSchema(classOf[MenuItem]), ReflectData.get().getSchema(classOf[Food]), ReflectData.get().getSchema(classOf[Order]), ReflectData.get().getSchema(classOf[Price])) val conf = ConfigFactory.load() val manager = AvroSchemaManagerFactory.initialize(conf) val registeredIDs: Seq[Long] = manager.registerAll(schemas).map(_._1) mainLogger.info("Schemas registered") mainLogger.info("Getting ID for a schema") manager.getId(ReflectData.get().getSchema(classOf[Menu])) mainLogger.info("ID retrieved for the schema") mainLogger.info("Get Schema from ID") val d2 = ds.map { x => AvroSchemaManagerFactory.initialize(conf).getSchema(registeredIDs(x % registeredIDs.size)) x } d2.count() mainLogger.info("All schemas obtained") 10 } override protected def handleException(exception: Throwable, applicationSettings: Config): Unit = { mainLogger.error(exception.getMessage) } }
Example 85
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 86
Source File: QueryPartitionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.internal.config._ import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ private def queryWhenPathNotExist(): Unit = { withTempView("testData") { withTable("table_with_partition", "createAndInsertTest") { withTempDir { tmpDir => val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData).union(testData)) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData)) } } } } test("SPARK-5068: query data when path doesn't exist") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") { queryWhenPathNotExist() } } test("Replace spark.sql.hive.verifyPartitionPath by spark.files.ignoreMissingFiles") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false") { sparkContext.conf.set(IGNORE_MISSING_FILES.key, "true") queryWhenPathNotExist() } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 87
Source File: ParquetFileFormatSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(spark.sessionState.newHadoopConf()) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( spark.sessionState.newHadoopConf(), fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[SparkException] { testReadFooters(false) }.getCause assert(exception.getMessage().contains("Could not read footer for file")) } }
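The test above collects its inputs with FileSystem.listStatus before handing them to the footer reader. A stripped-down sketch of that listing step; the object name and directories are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object ListParquetParts {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    val dirs = Seq(new Path("/tmp/first"), new Path("/tmp/second")) // hypothetical output dirs
    // listStatus returns one FileStatus per direct child; flatten the per-directory arrays
    val statuses: Seq[FileStatus] =
      dirs.flatMap(dir => if (fs.exists(dir)) fs.listStatus(dir).toSeq else Seq.empty)
    statuses
      .filter(s => s.isFile && s.getPath.getName.endsWith(".parquet"))
      .foreach(s => println(s"${s.getPath} (${s.getLen} bytes)"))
  }
}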
Example 88
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 89
Source File: UnsplittableSequenceFileInputFormat.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop.splits import java.io.IOException import java.util import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath } import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat } import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.input import scala.collection.JavaConverters._ override def listStatus(job: JobContext): util.List[FileStatus] = super .listStatus(job) .asScala .sortBy { _.getPath.getName match { case PartFileBasename(idx) ⇒ idx case basename ⇒ throw new IllegalArgumentException(s"Bad partition file: $basename") } } .asJava }
Example 90
Source File: HadoopBundleFileSystem.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.io.File import java.net.URI import java.nio.file.{Files, Paths} import com.typesafe.config.Config import ml.combust.bundle.fs.BundleFileSystem import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Try import scala.collection.JavaConverters._ object HadoopBundleFileSystem { lazy val defaultSchemes: Seq[String] = Seq("hdfs") def createHadoopConfiguration(config: Config): Configuration = { val options: Map[String, String] = if(config.hasPath("options")) { config.getConfig("options").entrySet().asScala.map { entry => (entry.getKey, entry.getValue.unwrapped().toString) }.toMap } else { Map() } val c = new Configuration() for ((key, value) <- options) { c.set(key, value) } c } def createSchemes(config: Config): Seq[String] = if (config.hasPath("schemes")) { config.getStringList("schemes").asScala } else { Seq("hdfs") } } class HadoopBundleFileSystem(fs: FileSystem, override val schemes: Seq[String] = HadoopBundleFileSystem.defaultSchemes) extends BundleFileSystem { def this(config: Config) = { this(FileSystem.get(HadoopBundleFileSystem.createHadoopConfiguration(config)), HadoopBundleFileSystem.createSchemes(config)) } override def load(uri: URI): Try[File] = Try { val tmpDir = Files.createTempDirectory("hdfs-bundle") val tmpFile = Paths.get(tmpDir.toString, "bundle.zip") fs.copyToLocalFile(new Path(uri.toString), new Path(tmpFile.toString)) tmpFile.toFile } override def save(uri: URI, localFile: File): Unit = { fs.copyFromLocalFile(new Path(localFile.toString), new Path(uri.toString)) } }
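Both load and save above come down to the two FileSystem copy helpers, copyToLocalFile and copyFromLocalFile. A small sketch of that round trip with invented paths; here the filesystem is resolved from the remote path itself rather than from a preconfigured Configuration:

import java.nio.file.Files
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object BundleCopyRoundTrip {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val remote = new Path("hdfs:///models/bundle.zip")           // hypothetical remote bundle
    val fs: FileSystem = remote.getFileSystem(conf)              // filesystem for that scheme
    val localDir = Files.createTempDirectory("bundle-roundtrip") // scratch space on the driver

    // Download: remote -> local working copy
    val localCopy = new Path(localDir.resolve("bundle.zip").toString)
    fs.copyToLocalFile(remote, localCopy)

    // Upload: local file -> (another) remote location
    val target = new Path("hdfs:///models/bundle-copy.zip")
    fs.copyFromLocalFile(localCopy, target)
    println(s"copied $remote -> $localCopy -> $target")
  }
}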
Example 91
Source File: HadoopBundleFileSystemSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.bundle.hdfs import java.net.URI import java.nio.file.{Files, Paths} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.FunSpec class HadoopBundleFileSystemSpec extends FunSpec { private val fs = FileSystem.get(new Configuration()) private val bundleFs = new HadoopBundleFileSystem(fs) describe("scheme") { it("returns hdfs") { assert(bundleFs.schemes == Seq("hdfs")) } } describe("load") { it("loads a file from hadoop and saves to a local file") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val loadedFile = bundleFs.load(testFile.toUri).get val contents = new String(Files.readAllBytes(loadedFile.toPath)) assert(contents == "HELLO") } } describe("save") { it("saves local file to HDFS") { val testFile = Files.createTempFile("HadoopBundleFileSystemSpec", ".txt") Files.write(testFile.toAbsolutePath, "HELLO".getBytes()) val tmpDir = Files.createTempDirectory("HadoopBundleFileSystemSpec") val tmpFile = new URI(s"file://$tmpDir/test.txt") bundleFs.save(tmpFile, testFile.toFile) val contents = new String(Files.readAllBytes(Paths.get(tmpFile))) assert(contents == "HELLO") } } }
Example 92
Source File: SparkBundleContext.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle import ml.bundle.hdfs.HadoopBundleFileSystem import ml.combust.bundle.{BundleRegistry, HasBundleRegistry} import ml.combust.mleap.ClassLoaderUtil import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.DataFrame object SparkBundleContext { val DEFAULT_REGISTRY_KEY: String = "ml.combust.mleap.spark.registry.default" implicit lazy val defaultContext: SparkBundleContext = SparkBundleContext(None, Some(classOf[SparkBundleContext].getClassLoader)) def apply(dataset: Option[DataFrame] = None): SparkBundleContext = apply(dataset, None) def apply(dataset: Option[DataFrame], clOption: Option[ClassLoader]): SparkBundleContext = { val cl = clOption.getOrElse(ClassLoaderUtil.findClassLoader(classOf[SparkBundleContext].getCanonicalName)) apply(dataset, BundleRegistry(DEFAULT_REGISTRY_KEY, Some(cl))) } } case class SparkBundleContext(dataset: Option[DataFrame], override val bundleRegistry: BundleRegistry) extends HasBundleRegistry { def withDataset(dataset: DataFrame): SparkBundleContext = withDataset(dataset, registerHdfs = true) def withDataset(dataset: DataFrame, registerHdfs: Boolean): SparkBundleContext = { val bundleRegistry2 = if (registerHdfs) { bundleRegistry.registerFileSystem( new HadoopBundleFileSystem(FileSystem.get( dataset.sqlContext.sparkSession.sparkContext.hadoopConfiguration))) } else { bundleRegistry } copy(dataset = Some(dataset), bundleRegistry = bundleRegistry2) } }
Example 93
Source File: TestSpec.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.io.ByteArrayInputStream import java.nio.file.Files import com.coxautodata.objects.SerializableFileStatus import com.coxautodata.utils.FileListing import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} trait TestSpec extends FunSpec with Matchers with BeforeAndAfterEach { var testingBaseDir: java.nio.file.Path = _ var testingBaseDirName: String = _ var testingBaseDirPath: Path = _ var localFileSystem: LocalFileSystem = _ override def beforeEach(): Unit = { super.beforeEach() testingBaseDir = Files.createTempDirectory("test_output") testingBaseDirName = testingBaseDir.toString localFileSystem = FileSystem.getLocal(new Configuration()) testingBaseDirPath = localFileSystem.makeQualified(new Path(testingBaseDirName)) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteDirectory(testingBaseDir.toFile) } def createFile(relativePath: Path, content: Array[Byte]): SerializableFileStatus = { val path = new Path(testingBaseDirPath, relativePath) localFileSystem.mkdirs(path.getParent) val in = new ByteArrayInputStream(content) val out = localFileSystem.create(path) IOUtils.copy(in, out) in.close() out.close() SerializableFileStatus(localFileSystem.getFileStatus(path)) } def fileStatusToResult(f: SerializableFileStatus): FileListing = { FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) } }
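The fixture above uses FileSystem.getLocal plus makeQualified so that Path-based code can be exercised against a plain local directory. A compact sketch of that setup, writing one small file through the FileSystem API; names and contents are invented:

import java.io.ByteArrayInputStream
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object LocalFsFixture {
  def main(args: Array[String]): Unit = {
    val baseDir = Files.createTempDirectory("fs_fixture")
    val localFs = FileSystem.getLocal(new Configuration())
    // makeQualified adds the file:// scheme so the path is unambiguous
    val basePath = localFs.makeQualified(new Path(baseDir.toString))

    val file = new Path(basePath, "data/part-00000.txt")
    localFs.mkdirs(file.getParent)
    val out = localFs.create(file)
    val in = new ByteArrayInputStream("hello".getBytes("UTF-8"))
    try IOUtils.copy(in, out) finally { in.close(); out.close() }

    println(localFs.getFileStatus(file).getLen) // 5
  }
}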
Example 94
Source File: GeoJSONRelation.scala From magellan with Apache License 2.0 | 5 votes |
package magellan import magellan.mapreduce.WholeFileInputFormat import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.io.{NullWritable, Text} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.json4s._ import org.json4s.jackson.JsonMethods._ case class GeoJSONRelation( path: String, parameters: Map[String, String]) (@transient val sqlContext: SQLContext) extends SpatialRelation { protected override def _buildScan(): RDD[Array[Any]] = { val conf = sc.hadoopConfiguration FileSystem.getLocal(conf) sc.newAPIHadoopFile( path, classOf[WholeFileInputFormat], classOf[NullWritable], classOf[Text]).flatMap { case (k, v) => val line = v.toString() parseShapeWithMeta(line) }.map { case (shape: Shape, meta: Option[Map[String, String]]) => Array(shape, meta) } } private def parseShapeWithMeta(line: String) = { val tree = parse(line) implicit val formats = org.json4s.DefaultFormats val result = tree.extract[GeoJSON] result.features.flatMap { f => f.geometry.shapes.map(shape => (shape, f.properties)) } } }
Example 95
Source File: WholeFileReader.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import java.io.InputStream import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} class WholeFileReader extends RecordReader[NullWritable, Text] { private val key = NullWritable.get() private val value = new Text() private var split: FileSplit = _ private var conf: Configuration = _ private var path: Path = _ private var done: Boolean = false override def getProgress: Float = ??? override def nextKeyValue(): Boolean = { if (done){ false } else { val fs = path.getFileSystem(conf) var is: FSDataInputStream = null var in: InputStream = null var decompressor: Decompressor = null try { is = fs.open(split.getPath) val codec = new CompressionCodecFactory(conf).getCodec(path) if (codec != null) { decompressor = CodecPool.getDecompressor(codec) in = codec.createInputStream(is, decompressor) } else { in = is } val result = IOUtils.toByteArray(in) value.clear() value.set(result) done = true true } finally { if (in != null) { IOUtils.closeQuietly(in) } if (decompressor != null) { CodecPool.returnDecompressor(decompressor) } } } } override def getCurrentValue: Text = value override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): Unit = { this.split = inputSplit.asInstanceOf[FileSplit] this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) this.path = this.split.getPath } override def getCurrentKey: NullWritable = key override def close() {} }
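The record reader above demonstrates the usual codec-aware open: get the FileSystem from the Path itself, then let CompressionCodecFactory decide whether the raw stream needs a decompressing wrapper. A reduced sketch of only that part (it skips the decompressor pooling done above, and the file path is a placeholder):

import java.io.InputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.CompressionCodecFactory

object CodecAwareOpen {
  // Returns a stream that is already decompressed if the file extension maps to a codec
  def open(path: Path, conf: Configuration): InputStream = {
    val fs = path.getFileSystem(conf) // resolve the right FileSystem for this path's scheme
    val raw = fs.open(path)
    val codec = new CompressionCodecFactory(conf).getCodec(path)
    if (codec != null) codec.createInputStream(raw) else raw
  }

  def main(args: Array[String]): Unit = {
    val in = open(new Path("/tmp/sample.json.gz"), new Configuration()) // hypothetical file
    try println(in.read()) finally in.close()
  }
}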
Example 96
Source File: ShxReaderSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.mapreduce import magellan.TestSparkContext import magellan.io.PolygonReader import org.apache.commons.io.EndianUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text} import org.scalatest.FunSuite class ShxReaderSuite extends FunSuite with TestSparkContext { test("Read shx file") { val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath val conf = new Configuration() conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000") val data = sc.newAPIHadoopFile( path, classOf[ShxInputFormat], classOf[Text], classOf[ArrayWritable], conf ).map { case (txt: Text, splits: ArrayWritable) => val fileName = txt.toString val s = splits.get() val size = s.length var i = 0 val v = Array.fill(size)(0L) while (i < size) { v.update(i, s(i).asInstanceOf[LongWritable].get()) i += 1 } (fileName, v) } assert(data.count() === 1) val (fileName, splits) = data.first() assert(fileName === "tl_2016_us_state") // the offsets should be correct val firstOffset = splits(0) val secondOffset = splits(1) // skipping to the first offset in the Shapefile should allow me to read the first polygon val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath val fs = FileSystem.get(sc.hadoopConfiguration) var dis = fs.open(new Path(shpFilePath)) // skip firstOffset # of bytes dis.seek(firstOffset) // skip record number assert(dis.readInt() === 1) // read content length var contentLength = 16 * (dis.readInt() + 4) // extract the shape type var shapeType = EndianUtils.swapInteger(dis.readInt()) // expect a Polygon assert(shapeType === 5) // the first polygon's content should follow from here val polygonReader = new PolygonReader() val polygon = polygonReader.readFields(dis) assert(polygon != null) // seek to the second offset dis.seek(secondOffset) assert(dis.readInt() === 2) } }
Example 97
Source File: WorkbookReader.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.InputStream import com.crealytics.spark.excel.Utils.MapIncluding import com.github.pjfanning.xlsx.StreamingReader import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} trait WorkbookReader { protected def openWorkbook(): Workbook def withWorkbook[T](f: Workbook => T): T = { val workbook = openWorkbook() val res = f(workbook) workbook.close() res } def sheetNames: Seq[String] = { withWorkbook( workbook => for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { workbook.getSheetAt(sheetIx).getSheetName() } ) } } object WorkbookReader { val WithLocationMaxRowsInMemoryAndPassword = MapIncluding(Seq("path"), optionally = Seq("maxRowsInMemory", "workbookPassword")) def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { def readFromHadoop(location: String) = { val path = new Path(location) FileSystem.get(path.toUri, hadoopConfiguration).open(path) } parameters match { case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(Some(maxRowsInMemory), passwordOption)) => new StreamingWorkbookReader(readFromHadoop(location), passwordOption, maxRowsInMemory.toInt) case WithLocationMaxRowsInMemoryAndPassword(Seq(location), Seq(None, passwordOption)) => new DefaultWorkbookReader(readFromHadoop(location), passwordOption) } } } class DefaultWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String]) extends WorkbookReader { protected def openWorkbook(): Workbook = workbookPassword .fold(WorkbookFactory.create(inputStreamProvider))( password => WorkbookFactory.create(inputStreamProvider, password) ) } class StreamingWorkbookReader(inputStreamProvider: => InputStream, workbookPassword: Option[String], maxRowsInMem: Int) extends WorkbookReader { override protected def openWorkbook(): Workbook = { val builder = StreamingReader .builder() .rowCacheSize(maxRowsInMem) .bufferSize(4096) workbookPassword .fold(builder)(password => builder.password(password)) .open(inputStreamProvider) } }
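readFromHadoop above is worth isolating: because the FileSystem is resolved from the path's own URI, the same reader works for hdfs://, s3a:// or plain local file:// locations. A minimal standalone version; the object name and location are examples, not part of the project:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

object OpenAnywhere {
  def openStream(location: String, conf: Configuration): FSDataInputStream = {
    val path = new Path(location)
    // The scheme in `location` decides which FileSystem implementation is used
    FileSystem.get(path.toUri, conf).open(path)
  }

  def main(args: Array[String]): Unit = {
    val in = openStream("file:///tmp/report.xlsx", new Configuration()) // hypothetical file
    try println(s"first byte: ${in.read()}") finally in.close()
  }
}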
Example 98
Source File: ExcelFileSaver.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.BufferedOutputStream import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT} import com.norbitltd.spoiwo.model._ import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._ import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.poi.ss.util.CellRangeAddress import org.apache.poi.xssf.usermodel.XSSFWorkbook import org.apache.spark.sql.{DataFrame, SaveMode} import scala.collection.JavaConverters._ object ExcelFileSaver { final val DEFAULT_SHEET_NAME = "Sheet1" final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm" final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000" } class ExcelFileSaver( fs: FileSystem, location: Path, dataFrame: DataFrame, saveMode: SaveMode, dataLocator: DataLocator, header: Boolean = true ) { def save(): Unit = { def sheet(workbook: XSSFWorkbook) = { val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None val dataRows = dataFrame .toLocalIterator() .asScala .map(_.toSeq) dataLocator.toSheet(headerRow, dataRows, workbook) } val fileAlreadyExists = fs.exists(location) def writeToWorkbook(workbook: XSSFWorkbook): Unit = { Workbook(sheet(workbook)).writeToExisting(workbook) autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write) } (fileAlreadyExists, saveMode) match { case (false, _) | (_, SaveMode.Overwrite) => if (fileAlreadyExists) { fs.delete(location, true) } writeToWorkbook(new XSSFWorkbook()) case (true, SaveMode.ErrorIfExists) => sys.error(s"path $location already exists.") case (true, SaveMode.Ignore) => () case (true, SaveMode.Append) => val inputStream: FSDataInputStream = fs.open(location) val workbook = new XSSFWorkbook(inputStream) inputStream.close() writeToWorkbook(workbook) } } def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = { try { fun(closeable) } finally { closeable.close() } } }
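The save method above is a compact catalogue of the FileSystem calls behind SaveMode semantics: exists, delete, create and open. A condensed sketch of the same branching for a plain byte payload; the Mode ADT, paths and content are invented for illustration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SaveModes {
  sealed trait Mode
  case object Overwrite extends Mode; case object ErrorIfExists extends Mode; case object Ignore extends Mode

  def save(fs: FileSystem, target: Path, bytes: Array[Byte], mode: Mode): Unit = {
    val alreadyThere = fs.exists(target)
    (alreadyThere, mode) match {
      case (true, ErrorIfExists) => sys.error(s"path $target already exists.")
      case (true, Ignore)        => () // silently keep the existing file
      case _ =>
        if (alreadyThere) fs.delete(target, true) // recursive delete, like the Overwrite branch above
        val out = fs.create(target)
        try out.write(bytes) finally out.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    save(fs, new Path("/tmp/savemode-demo.bin"), Array[Byte](1, 2, 3), Overwrite)
  }
}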
Example 99
Source File: ParquetWriterTask.scala From gearpump-examples with Apache License 2.0 | 5 votes |
package io.gearpump.examples.kafka_hdfs_pipeline import org.apache.avro.Schema import io.gearpump.Message import io.gearpump.cluster.UserConfig import io.gearpump.examples.kafka_hdfs_pipeline.ParquetWriterTask._ import io.gearpump.streaming.task.{StartTime, Task, TaskContext} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.parquet.avro.AvroParquetWriter import scala.util.{Failure, Success, Try} class ParquetWriterTask(taskContext : TaskContext, config: UserConfig) extends Task(taskContext, config) { val outputFileName = taskContext.appName + ".parquet" val absolutePath = Option(getHdfs + config.getString(PARQUET_OUTPUT_DIRECTORY).getOrElse("/parquet") + "/" + outputFileName).map(deleteFile(_)).get val outputPath = new Path(absolutePath) var parquetWriter = new AvroParquetWriter[SpaceShuttleRecord](outputPath, SpaceShuttleRecord.SCHEMA$) def getYarnConf = new YarnConfiguration def getFs = FileSystem.get(getYarnConf) def getHdfs = new Path(getFs.getHomeDirectory, "gearpump") private def deleteFile(fileName: String): String = { val file = new Path(fileName) getFs.exists(file) match { case true => getFs.delete(file,false) case false => } fileName } override def onStart(startTime: StartTime): Unit = { LOG.info(s"ParquetWriter.onStart $absolutePath") } override def onNext(msg: Message): Unit = { Try({ parquetWriter.write(msg.msg.asInstanceOf[SpaceShuttleRecord]) }) match { case Success(ok) => case Failure(throwable) => LOG.error(s"failed ${throwable.getMessage}") } } override def onStop(): Unit = { LOG.info("ParquetWriter.onStop") parquetWriter.close() } } object ParquetWriterTask { val PARQUET_OUTPUT_DIRECTORY = "parquet.output.directory" val PARQUET_WRITER = "parquet.writer" }
Example 100
Source File: ParquetWriterTaskSpec.scala From gearpump-examples with Apache License 2.0 | 5 votes |
package io.gearpump.examples.kafka_hdfs_pipeline import akka.actor.ActorSystem import org.apache.avro.Schema import io.gearpump.Message import io.gearpump.cluster.UserConfig import io.gearpump.streaming.MockUtil import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter} import org.apache.parquet.hadoop.ParquetReader import org.apache.parquet.hadoop.api.ReadSupport import org.mockito.Mockito import org.mockito.Mockito._ import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec} class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll { implicit var system: ActorSystem = ActorSystem("PipeLineSpec") val context = MockUtil.mockTaskContext val appName = "KafkaHdfsPipeLine" when(context.appName).thenReturn(appName) val fs = FileSystem.get(new YarnConfiguration) val homeDir = fs.getHomeDirectory.toUri.getPath val parquetDir = new Path(homeDir, "gearpump") + "/parquet/" val parquetPath = parquetDir + appName + ".parquet" val parquetCrc = parquetDir + "." + appName + ".parquet.crc" val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]]) val anomaly = 0.252 val now = System.currentTimeMillis val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet") override def afterAll(): Unit = { List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete) system.shutdown() } property("ParquetWriterTask should initialize with local parquet file opened for writing") { val parquetWriterTask = new ParquetWriterTask(context, userConfig) val path = parquetWriterTask.absolutePath.stripPrefix("file:") assert(parquetPath.equals(path)) parquetWriterTask.onStop } property("ParquetWriterTask should write records to a parquet file") { val message = Message(SpaceShuttleRecord(now, anomaly), now) val parquetWriterTask = new ParquetWriterTask(context, userConfig) parquetWriterTask.parquetWriter = parquetWriter parquetWriterTask.onNext(message) verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord]) parquetWriterTask.onStop } property("ParquetWriterTask should have verifiable written record") { val message = Message(SpaceShuttleRecord(now, anomaly), now) val parquetWriterTask = new ParquetWriterTask(context, userConfig) parquetWriterTask.onNext(message) parquetWriterTask.onStop val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath)) val record = reader.read() assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly) assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts) } }
Example 101
Source File: HdfsUtilsTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.serving.core.utils import java.io.{FileNotFoundException, InputStream} import org.apache.hadoop.fs.{FileSystem, _} import org.junit.runner.RunWith import org.mockito.Mockito._ import org.scalatest._ import org.scalatest.junit.JUnitRunner import org.scalatest.mock.MockitoSugar import scala.util.{Failure, Try} @RunWith(classOf[JUnitRunner]) class HdfsUtilsTest extends FlatSpec with ShouldMatchers with MockitoSugar { val fileSystem: FileSystem = mock[FileSystem] val utils = new HdfsUtils(fileSystem, "stratio") "hdfs utils" should "getfiles from a path" in { val expected = Array(mock[FileStatus]) when(fileSystem.listStatus(new Path("myTestPath"))).thenReturn(expected) val result = utils.getFiles("myTestPath") result should be(expected) } it should "return single file as inputStream" in { val expected: InputStream = mock[FSDataInputStream] when(fileSystem.open(new Path("testFile"))).thenReturn(expected.asInstanceOf[FSDataInputStream]) val result: InputStream = utils.getFile("testFile") result should be(expected) } it should "write" in { val result = Try(utils.write("from", "to", true)) match { case Failure(ex: Throwable) => ex } result.isInstanceOf[FileNotFoundException] should be(true) } it should "write without override" in { val result = Try(utils.write("from", "to", false)) match { case Failure(ex: Throwable) => ex } result.isInstanceOf[FileNotFoundException] should be(true) } }
Example 102
Source File: SharedOapContext.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test.oap import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{OapExtensions, SparkSession} import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, SparkPlan} import org.apache.spark.sql.execution.datasources.oap.{IndexType, OapFileFormat} import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.{OapDriverRuntime, OapRuntime} import org.apache.spark.sql.test.OapSharedSQLContext trait SharedOapContext extends SharedOapContextBase { protected override def createSparkSession: SparkSession = { SparkSession.cleanupAnyExistingSession() val session = SparkSession.builder() .master("local[2]") .appName("test-oap-context") .config(oapSparkConf).getOrCreate() OapRuntime.getOrCreate.asInstanceOf[OapDriverRuntime].setTestSession(session) session } protected def withFileSystem(f: FileSystem => Unit): Unit = { var fs: FileSystem = null try { fs = FileSystem.get(configuration) f(fs) } finally { if (fs != null) { fs.close() } } } } case class TestPartition(key: String, value: String) case class TestIndex( tableName: String, indexName: String, partitions: TestPartition*)
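withFileSystem above is the loan pattern: obtain a FileSystem, hand it to the caller's function, and guarantee close() in a finally block. A self-contained version of the same idea; the object name and the usage in main are mine, not from OAP. Note that FileSystem.get normally returns a cached instance per scheme and configuration, so closing it eagerly is best reserved for test code like this.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object FsLoan {
  // Run `f` against a freshly obtained FileSystem and always close it afterwards
  def withFileSystem[T](conf: Configuration)(f: FileSystem => T): T = {
    val fs = FileSystem.get(conf)
    try f(fs) finally fs.close()
  }

  def main(args: Array[String]): Unit = {
    val home = withFileSystem(new Configuration()) { fs =>
      fs.getHomeDirectory // e.g. hdfs://nn/user/<name> or file:/home/<name>
    }
    println(home)
  }
}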
Example 103
Source File: SeqFileStreamProcessor.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import java.util.concurrent.TimeUnit import scala.concurrent.duration.FiniteDuration import akka.actor.Cancellable import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile._ import org.apache.hadoop.io.{SequenceFile, Text} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.examples.fsio.SeqFileStreamProcessor._ import org.apache.gearpump.streaming.task.{Task, TaskContext} class SeqFileStreamProcessor(taskContext: TaskContext, config: UserConfig) extends Task(taskContext, config) { import taskContext.taskId val outputPath = new Path(config.getString(OUTPUT_PATH).get + File.separator + taskId) var writer: SequenceFile.Writer = null val textClass = new Text().getClass val key = new Text() val value = new Text() val hadoopConf = config.hadoopConf private var msgCount: Long = 0 private var snapShotKVCount: Long = 0 private var snapShotTime: Long = 0 private var scheduler: Cancellable = null override def onStart(startTime: Instant): Unit = { val fs = FileSystem.get(hadoopConf) fs.deleteOnExit(outputPath) writer = SequenceFile.createWriter(hadoopConf, Writer.file(outputPath), Writer.keyClass(textClass), Writer.valueClass(textClass)) scheduler = taskContext.schedule(new FiniteDuration(5, TimeUnit.SECONDS), new FiniteDuration(5, TimeUnit.SECONDS))(reportStatus()) snapShotTime = System.currentTimeMillis() LOG.info("sequence file bolt initiated") } override def onNext(msg: Message): Unit = { val kv = msg.value.asInstanceOf[String].split("\\+\\+") if (kv.length >= 2) { key.set(kv(0)) value.set(kv(1)) writer.append(key, value) } msgCount += 1 } override def onStop(): Unit = { if (scheduler != null) { scheduler.cancel() } writer.close() LOG.info("sequence file bolt stopped") } private def reportStatus() = { val current: Long = System.currentTimeMillis() LOG.info(s"Task $taskId Throughput: ${ (msgCount - snapShotKVCount, (current - snapShotTime) / 1000) } (KVPairs, second)") snapShotKVCount = msgCount snapShotTime = current } } object SeqFileStreamProcessor { val OUTPUT_PATH = "outputpath" }
Example 104
Source File: SeqFileStreamProducer.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import org.apache.gearpump.streaming.source.Watermark import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile._ import org.apache.hadoop.io.{SequenceFile, Text} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.examples.fsio.HadoopConfig._ import org.apache.gearpump.streaming.examples.fsio.SeqFileStreamProducer._ import org.apache.gearpump.streaming.task.{Task, TaskContext} class SeqFileStreamProducer(taskContext: TaskContext, config: UserConfig) extends Task(taskContext, config) { import taskContext.output val value = new Text() val key = new Text() var reader: SequenceFile.Reader = _ val hadoopConf = config.hadoopConf val fs = FileSystem.get(hadoopConf) val inputPath = new Path(config.getString(INPUT_PATH).get) override def onStart(startTime: Instant): Unit = { reader = new SequenceFile.Reader(hadoopConf, Reader.file(inputPath)) self ! Start LOG.info("sequence file spout initiated") } override def onNext(msg: Message): Unit = { if (reader.next(key, value)) { output(Message(key + "++" + value)) } else { reader.close() reader = new SequenceFile.Reader(hadoopConf, Reader.file(inputPath)) } self ! Continue } override def onStop(): Unit = { reader.close() } } object SeqFileStreamProducer { def INPUT_PATH: String = "inputpath" val Start = Watermark(Instant.now) val Continue = Watermark(Instant.now) }
Example 105
Source File: SeqFileStreamProcessorSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.io.File import java.time.Instant import scala.collection.mutable.ArrayBuffer import akka.actor.ActorSystem import akka.testkit.TestProbe import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Reader import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.{TestUtil, UserConfig} import org.apache.gearpump.streaming.task.TaskId import org.apache.gearpump.streaming.{MockUtil, Processor} class SeqFileStreamProcessorSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val outputDirectory = "SeqFileStreamProcessor_Test" val sequenceFilePath = new Path(outputDirectory + File.separator + TaskId(0, 0)) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { implicit val system1 = ActorSystem("SeqFileStreamProcessor", TestUtil.DEFAULT_CONFIG) val system2 = ActorSystem("Reporter", TestUtil.DEFAULT_CONFIG) val watcher = TestProbe()(system1) val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProcessor.OUTPUT_PATH, outputDirectory)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val processorDescription = Processor.ProcessorToProcessorDescription(id = 0, Processor[SeqFileStreamProcessor](1)) val taskId = TaskId(0, 0) when(context.taskId).thenReturn(taskId) val processor = new SeqFileStreamProcessor(context, conf) processor.onStart(Instant.EPOCH) forAll(kvGenerator) { kv => val (key, value) = kv kvPairs.append((key, value)) processor.onNext(Message(key + "++" + value)) } processor.onStop() } property("SeqFileStreamProcessor should write the key-value pairs to a sequence file") { val reader = new SequenceFile.Reader(hadoopConf, Reader.file(sequenceFilePath)) kvPairs.foreach { kv => val (key, value) = kv if (value.length > 0 && reader.next(_key, _value)) { assert(_key.toString == key && _value.toString == value) } } reader.close() } after { fs.deleteOnExit(new Path(outputDirectory)) } }
Example 106
Source File: SeqFileStreamProducerSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.fsio import java.time.Instant import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.SequenceFile.Writer import org.apache.hadoop.io.{SequenceFile, Text} import org.mockito.Mockito._ import org.scalacheck.Gen import org.scalatest.prop.PropertyChecks import org.scalatest.{BeforeAndAfter, Matchers, PropSpec} import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.MockUtil import org.apache.gearpump.streaming.MockUtil._ class SeqFileStreamProducerSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfter { val kvPairs = new ArrayBuffer[(String, String)] val inputFile = "SeqFileStreamProducer_Test" val sequenceFilePath = new Path(inputFile) val hadoopConf = new Configuration() val fs = FileSystem.get(hadoopConf) val textClass = new Text().getClass val _key = new Text() val _value = new Text() val kvGenerator = for { key <- Gen.alphaStr value <- Gen.alphaStr } yield (key, value) before { fs.deleteOnExit(sequenceFilePath) val writer = SequenceFile.createWriter(hadoopConf, Writer.file(sequenceFilePath), Writer.keyClass(textClass), Writer.valueClass(textClass)) forAll(kvGenerator) { kv => _key.set(kv._1) _value.set(kv._2) kvPairs.append((kv._1, kv._2)) writer.append(_key, _value) } writer.close() } property("SeqFileStreamProducer should read the key-value pairs from " + "a sequence file and deliver them") { val conf = HadoopConfig(UserConfig.empty.withString(SeqFileStreamProducer.INPUT_PATH, inputFile)).withHadoopConf(new Configuration()) val context = MockUtil.mockTaskContext val producer = new SeqFileStreamProducer(context, conf) producer.onStart(Instant.EPOCH) producer.onNext(Message("start")) val expected = kvPairs.map(kv => kv._1 + "++" + kv._2).toSet verify(context).output(argMatch[Message](msg => expected.contains(msg.value.asInstanceOf[String]))) } after { fs.deleteOnExit(sequenceFilePath) } }
Example 107
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import java.io.{ BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter} import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons { var hdfsCluster: HDFSCluster = null override def beforeAll(): Unit = { super.beforeAll() hdfsCluster = new HDFSCluster hdfsCluster.startHDFS() } test("get the namenode uri") { val nameNodeURI = hdfsCluster.getNameNodeURI() assert(nameNodeURI == "hdfs://localhost:8020") } test("read and write from spark to hdfs") { val list = List(1, 2, 3, 4, 5) val numRDD: RDD[Int] = sc.parallelize(list) val path = hdfsCluster.getNameNodeURI() + "/myRDD" numRDD.saveAsTextFile(path) val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt) assertRDDEquals(numRDD, loadedRDD) } test("test creating local file to hdfs") { val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile") val fs = FileSystem.get(path.toUri, new Configuration()) val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path))) val writtenString = "hello, it's me" writer.write(writtenString) writer.close() val reader = new BufferedReader(new InputStreamReader(fs.open(path))) val readString = reader.readLine() reader.close() assert(writtenString == readString) } override def afterAll() { hdfsCluster.shutdownHDFS() super.afterAll() } }
Example 108
Source File: StreamMetadata.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
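The write helper above follows a common pattern: fs.create, wrap the stream in a writer, and make sure the underlying output is released even if serialization fails. A simplified sketch that writes a plain string instead of json4s output; the object name, content and target path are invented:

import java.io.OutputStreamWriter
import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}

object WriteMetadataFile {
  def write(content: String, file: Path, conf: Configuration): Unit = {
    val fs = FileSystem.get(conf)
    var output: FSDataOutputStream = null
    try {
      output = fs.create(file)
      val writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)
      writer.write(content)
      writer.close() // flushes and closes the wrapped FSDataOutputStream as well
    } finally {
      IOUtils.closeQuietly(output) // releases the stream if create() or the write failed
    }
  }

  def main(args: Array[String]): Unit =
    write("""{"id":"demo","name":"example"}""", new Path("/tmp/metadata"), new Configuration())
}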
Example 109
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 110
Source File: ExecutorSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import java.util.concurrent.ThreadPoolExecutor import scala.collection.JavaConverters._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
Example 111
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.AkkaUtils private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), SparkEnv.driverActorSystemName, sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port"), CoarseGrainedSchedulerBackend.ACTOR_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 112
Source File: ExecutorSource.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import scala.collection.JavaConversions._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(val executor: Executor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics().filter(s => s.getScheme.equals(scheme)).headOption private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = executor.threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
Example 113
Source File: SessionDataFileHDFSWriter.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream.sessionization import java.io.BufferedWriter import java.io.FileWriter import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.conf.Configuration import java.io.OutputStreamWriter import org.apache.hadoop.fs.Path import java.util.Random object SessionDataFileHDFSWriter { val eol = System.getProperty("line.separator"); def main(args: Array[String]) { if (args.length == 0) { println("SessionDataFileWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}"); return; } val conf = new Configuration conf.addResource(new Path("/etc/hadoop/conf/core-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")) val fs = FileSystem.get(new Configuration) val rootTempDir = args(0) val rootDistDir = args(1) val files = args(2).toInt val loops = args(3).toInt val waitBetweenFiles = args(4).toInt val r = new Random for (f <- 1 to files) { val rootName = "/weblog." + System.currentTimeMillis() val tmpPath = new Path(rootTempDir + rootName + ".tmp") val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath))) print(f + ": [") val randomLoops = loops + r.nextInt(loops) for (i <- 1 to randomLoops) { writer.write(SessionDataGenerator.getNextEvent + eol) if (i%100 == 0) { print(".") } } println("]") writer.close val distPath = new Path(rootDistDir + rootName + ".dat") fs.rename(tmpPath, distPath) Thread.sleep(waitBetweenFiles) } println("Done") } }
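The generator above relies on the classic write-then-rename trick: produce the file under a temporary name and only fs.rename it into the watched directory once it is complete, so a streaming consumer never picks up a half-written file. A minimal sketch of that publish step with placeholder directory names:

import java.io.{BufferedWriter, OutputStreamWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object PublishAtomically {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    val tmp = new Path("/tmp/incoming/.weblog.12345.tmp") // hidden/temporary name
    val dst = new Path("/tmp/incoming/weblog.12345.dat")  // name the consumer watches for

    val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmp)))
    try (1 to 100).foreach(i => writer.write(s"event $i\n")) finally writer.close()

    // Rename is a metadata-only operation on HDFS, so the file appears "all at once"
    if (!fs.rename(tmp, dst)) sys.error(s"could not rename $tmp to $dst")
  }
}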
Example 114
Source File: spark_algo.scala From mllib_subpackage with Apache License 2.0 | 5 votes |
import org.apache.commons.cli.{Options, PosixParser} import org.apache.spark.SparkContext import org.apache.spark.SparkConf //import org.apache.hadoop.fs import java.util.Date import java.util.Calendar import org.apache.hadoop.fs.FileSystem //import sun.management.FileSystem object spark_algo { def main(args: Array[String]) { // Input Params val parser = new PosixParser( ) val options = new Options( ) options.addOption("a", "algo", true, "algo type; 10. sgd 11. lbfgs") val cl = parser.parse( options, args, true ) val algo = cl.getOptionValue("algo") val conf = new SparkConf() val sc = new SparkContext(conf) sc.getConf.getAll.foreach(println) val configuration = sc.hadoopConfiguration configuration.setBoolean("mapreduce.output.fileoutputformat.compress", false) val fs = FileSystem.get(configuration) val modeltmp = if(algo=="10" || algo=="11" || algo=="12" || algo=="13") { new mllib_lr(sc, fs, args) } else if(algo=="21") { new ftrl(sc, fs, args) } else if(algo=="22") { new ftrl_batch(sc, fs, args) } else if(algo=="31") { new relative(sc, fs, args) } else if(algo=="40") { new mllib_gbdt(sc, fs, args) } else if(algo=="41") { new lambda_mart(sc, fs, args) } else if(algo=="91") { new feature_analyse(sc, fs, args) } else if(algo=="docs_words_analyse") { new docs_words_analyse(sc, fs, args) } val model = modeltmp.asInstanceOf[malgo] model.deal() } }
Example 115
Source File: GzipDecompressor.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo import java.util.concurrent.{Executors, TimeUnit} import com.adidas.analytics.algo.GzipDecompressor.{changeFileExtension, compressedExtension, _} import com.adidas.analytics.algo.core.JobRunner import com.adidas.analytics.config.GzipDecompressorConfiguration import com.adidas.analytics.util.DFSWrapper import com.adidas.analytics.util.DFSWrapper._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.IOUtils import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} import scala.concurrent._ import scala.concurrent.duration._ final class GzipDecompressor protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) extends JobRunner with GzipDecompressorConfiguration { private val hadoopConfiguration: Configuration = spark.sparkContext.hadoopConfiguration private val fileSystem: FileSystem = dfs.getFileSystem(inputDirectoryPath) override def run(): Unit = { //check if directory exists if (!fileSystem.exists(inputDirectoryPath)){ logger.error(s"Input directory: $inputDirectoryPath does not exist.") throw new RuntimeException(s"Directory $inputDirectoryPath does not exist.") } val compressedFilePaths = fileSystem.ls(inputDirectoryPath, recursive) .filterNot(path => fileSystem.isDirectory(path)) .filter(_.getName.toLowerCase.endsWith(compressedExtension)) if (compressedFilePaths.isEmpty) { logger.warn(s"Input directory $inputDirectoryPath does not contain compressed files. Skipping...") } else { implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(threadPoolSize)) Await.result(Future.sequence( compressedFilePaths.map { compressedFilePath => Future { logger.info(s"Decompressing file: $compressedFilePath") val decompressedFileName = changeFileExtension(compressedFilePath.getName, compressedExtension, outputExtension) val decompressedFilePath = new Path(compressedFilePath.getParent, decompressedFileName) val compressionCodecFactory = new CompressionCodecFactory(hadoopConfiguration) val inputCodec = compressionCodecFactory.getCodec(compressedFilePath) val inputStream = inputCodec.createInputStream(fileSystem.open(compressedFilePath)) val output = fileSystem.create(decompressedFilePath) IOUtils.copyBytes(inputStream, output, hadoopConfiguration) logger.info(s"Finished decompressing file: $compressedFilePath") //Delete the compressed file fileSystem.delete(compressedFilePath, false) logger.info(s"Removed file: $compressedFilePath") } } ), Duration(4, TimeUnit.HOURS)) } } } object GzipDecompressor { private val logger: Logger = LoggerFactory.getLogger(this.getClass) private val compressedExtension: String = ".gz" def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): GzipDecompressor = { new GzipDecompressor(spark, dfs, configLocation) } private def changeFileExtension(fileName: String, currentExt: String, newExt: String): String = { val newFileName = fileName.substring(0, fileName.lastIndexOf(currentExt)) if (newFileName.endsWith(newExt)) newFileName else newFileName + newExt } }
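Stripped of the thread pool, the per-file work above is: look up the codec for the source path, open a decompressing stream, create the target, and move the bytes with IOUtils.copyBytes. A single-file sketch; the paths are examples and error handling is kept minimal:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.CompressionCodecFactory

object DecompressOne {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val src = new Path("/landing/events.csv.gz") // hypothetical compressed input
    val dst = new Path("/landing/events.csv")
    val fs = FileSystem.get(conf)

    val codec = new CompressionCodecFactory(conf).getCodec(src) // resolved from the .gz suffix
    require(codec != null, s"no compression codec found for $src")

    val in = codec.createInputStream(fs.open(src))
    val out = fs.create(dst)
    IOUtils.copyBytes(in, out, conf) // streams the bytes and closes both ends when done
    fs.delete(src, false)            // drop the compressed original, as the job above does
  }
}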
Example 116
Source File: HDFSSupport.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{CommonConfigurationKeysPublic, FileSystem} import org.apache.hadoop.hdfs.MiniDFSCluster import org.slf4j.Logger trait HDFSSupport { private lazy val defaultDataNodesNum: Int = 2 private lazy val defaultPort: Int = 8201 lazy val cluster: MiniDFSCluster = startHDFS(clusterHdfsConf) lazy val fs: FileSystem = cluster.getFileSystem() def logger: Logger def testAppId: String def localTestDir: String def clusterHdfsConf: Option[Configuration] = Option.empty def startHDFS(hadoopConf: Option[Configuration]): MiniDFSCluster = { val appDir = new File(localTestDir, testAppId) val hdfsTestDir = new File(appDir, "hdfs").getAbsoluteFile hdfsTestDir.mkdirs() val clusterConf = hadoopConf.fold(new Configuration())(c => new Configuration(c)) clusterConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsTestDir.getAbsolutePath) clusterConf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, s"hdfs://localhost:$defaultPort/") logger.info(s"Starting test DFS cluster with base directory at ${hdfsTestDir.getAbsolutePath} ...") new MiniDFSCluster.Builder(clusterConf) .numDataNodes(defaultDataNodesNum) .nameNodePort(defaultPort) .format(true) .build() } }
Example 117
Source File: HdfsUtils.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.utils import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import scala.collection.mutable object HdfsUtils { def renameFiles(fromBase: Path, toBase: Path, fs: FileSystem) = { if (fs.exists(fromBase)) { val filesToMove = listFiles(fromBase, fs) println("files to move:") filesToMove foreach (p => println("+++" + p.toString)) filesToMove foreach { file => val relPath = relativize(fromBase, file) val toPath = new Path(toBase, relPath) fs.mkdirs(toPath.getParent) fs.rename(file, toPath) println(" file renamed to: " + toPath.toString) } } } def relativize(base: Path, files: List[Path]) = { files map (file => new Path(base.toUri.relativize(file.toUri).getPath)) } def relativize(base: Path, file: Path): Path = { new Path(base.toUri.relativize(file.toUri).getPath) } def listFiles(path: Path, fs: FileSystem): List[Path] = { val statusList = mutable.MutableList[FileStatus]() traverse(path, statusList, fs) statusList.map(status => new Path(status.getPath.toUri.getPath)).toList } private def traverse(path: Path, list: mutable.MutableList[FileStatus], fs: FileSystem): Unit = { fs.listStatus(path) foreach { status => if (!status.isDirectory) { list += status } else { traverse(status.getPath, list, fs) } } } }
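The utilities above build a recursive file listing by walking listStatus and then re-anchor each file under a new base via URI relativize before fs.rename. The sketch below does the same walk with FileSystem.listFiles, which iterates the subtree directly; paths and the object name are illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path, RemoteIterator}
import scala.collection.mutable.ListBuffer

object MoveTree {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    val fromBase = new Path("/staging/run-42") // hypothetical source tree
    val toBase = new Path("/published/run-42")

    // listFiles(recursive = true) walks the whole subtree and yields only files
    val files = ListBuffer.empty[Path]
    val it: RemoteIterator[LocatedFileStatus] = fs.listFiles(fromBase, true)
    while (it.hasNext) files += it.next().getPath

    files.foreach { file =>
      val rel = fromBase.toUri.relativize(file.toUri).getPath // path relative to fromBase
      val target = new Path(toBase, rel)
      fs.mkdirs(target.getParent)
      fs.rename(file, target)
    }
  }
}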
Example 118
Source File: TimePartitioningWriter.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.writers import java.io.IOException import com.typesafe.config.Config import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.joda.time.format.DateTimeFormat import org.slf4j.LoggerFactory import yamrcraft.etlite.EtlException import yamrcraft.etlite.transformers.Message import yamrcraft.etlite.utils.ConfigConversions._ import scala.collection.mutable class TimePartitioningWriter[T](config: Config, jobId: Long, partitionId: Int, writerFactory: (String, String) => Writer[T]) extends Writer[Message[T]] { val logger = LoggerFactory.getLogger(this.getClass) // config settings val workingFolder: String = config.getString("working-folder") val outputFolder: String = config.getString("output-folder") val partitionPattern: String = config.getString("partition.pattern") val folderMapping: Map[String, String] = config.getConfig("record-name-to-folder-mapping").asMap val fs = FileSystem.get(new Configuration()) val partitionFormat = DateTimeFormat.forPattern(partitionPattern) val partitionsWriters = mutable.Map[String, Writer[T]]() @throws(classOf[EtlException]) @throws(classOf[IOException]) override def write(event: Message[T]): Unit = { val timestamp = event.msgTimestamp val baseFolder = folderMapping.getOrElse(event.msgType, event.msgType) val writer = writerFor(baseFolder, timestamp) writer.write(event.msg) } override def commit() = { // close all writers partitionsWriters foreach { case (file, writer) => writer.commit() } } @throws(classOf[EtlException]) private def writerFor(baseFolder: String, timestamp: Long): Writer[T] = { val relativeFileName = new Path(s"$baseFolder/${partitionFormat.print(timestamp)}/events_${baseFolder}_job${jobId}_part$partitionId") val tempFile = new Path(workingFolder, relativeFileName) val outputFile = new Path(outputFolder, relativeFileName) partitionsWriters.getOrElseUpdate(tempFile.toString, writerFactory(tempFile.toString, outputFile.toString)) } }
Example 119
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery import com.google.api.services.bigquery.model.{TableReference, TableSchema} import com.google.cloud.hadoop.io.bigquery._ import com.google.gson._ import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, NullWritable} import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.DataFrame import org.slf4j.LoggerFactory import scala.util.Random def saveAsBigQueryTable(fullyQualifiedOutputTableId: String, isPartitionedByDay: Boolean = false, timePartitionExpiration: Long = 0, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null): Unit = { val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf) val gcsPath = writeDFToGoogleStorage(adaptedDf,destinationTable,bigQuerySchema) bq.load(destinationTable, bigQuerySchema, gcsPath, isPartitionedByDay, timePartitionExpiration, writeDisposition, createDisposition) delete(new Path(gcsPath)) } def writeDFToGoogleStorage(adaptedDf: DataFrame, destinationTable: TableReference, bqSchema: TableSchema): String = { val tableName = BigQueryStrings.toString(destinationTable) BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString()) hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName) val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp" if(hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) { hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath) } logger.info(s"Loading $gcsPath into $tableName") adaptedDf .toJSON .rdd .map(json => (null, jsonParser.parse(json))) .saveAsNewAPIHadoopFile(gcsPath, classOf[GsonBigQueryInputFormat], classOf[LongWritable], classOf[TextOutputFormat[NullWritable, JsonObject]], hadoopConf) gcsPath } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, hadoopConf) fs.delete(path, true) } }
Example 120
Source File: SentencePieceWrapper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow.sentencepiece import java.io.File import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession class SentencePieceWrapper( var sppModel: Array[Byte] ) extends Serializable { @transient private var mspp: SentencePieceProcessor = _ def getSppModel: SentencePieceProcessor = { if (mspp == null){ val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(sppModel) mspp = spp } mspp } } object SentencePieceWrapper { def read( path: String ): SentencePieceWrapper = { val byteArray = Files.readAllBytes(Paths.get(path)) val sppWrapper = new SentencePieceWrapper(byteArray) val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(byteArray) sppWrapper.mspp = spp sppWrapper } } trait WriteSentencePieceModel { def writeSentencePieceModel( path: String, spark: SparkSession, spp: SentencePieceWrapper, suffix: String, filename:String ): Unit = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp folder val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + suffix) .toAbsolutePath.toString val sppFile = Paths.get(tmpFolder, filename).toString // 2. Save Tensorflow state FileUtils.writeByteArrayToFile(new File(sppFile), spp.sppModel) // 3. Copy to dest folder fs.copyFromLocalFile(new Path(sppFile), new Path(path)) // 4. Remove tmp folder FileUtils.deleteDirectory(new File(tmpFolder)) } } trait ReadSentencePieceModel { val sppFile: String def readSentencePieceModel( path: String, spark: SparkSession, suffix: String ): SentencePieceWrapper = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp directory val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12)+ suffix) .toAbsolutePath.toString // 2. Copy to local dir fs.copyToLocalFile(new Path(path, sppFile), new Path(tmpFolder)) val sppModelFilePath = new Path(tmpFolder, sppFile) val byteArray = Files.readAllBytes(Paths.get(sppModelFilePath.toString)) val sppWrapper = new SentencePieceWrapper(byteArray) sppWrapper } }
Example 121
Source File: NerDLPythonReader.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner.dl import java.nio.file.{Files, Paths} import java.util.UUID import com.johnsnowlabs.ml.tensorflow.{DatasetEncoderParams, NerDatasetEncoder, TensorflowNer, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.ner.Verbose import com.johnsnowlabs.storage.{RocksDBConnection, StorageHelper} import com.johnsnowlabs.util.FileHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession import scala.io.Source object NerDLModelPythonReader { val embeddingsMetaFile = "embeddings.meta" val embeddingsFile = "embeddings" val tagsFile = "tags.csv" val charsFile = "chars.csv" private def readTags(folder: String): List[String] = { Source.fromFile(Paths.get(folder, tagsFile).toString).getLines().toList } private def readChars(folder: String): List[Char] = { val lines = Source.fromFile(Paths.get(folder, charsFile).toString).getLines() lines.toList.head.toCharArray.toList } private def readEmbeddingsHead(folder: String): Int = { val metaFile = Paths.get(folder, embeddingsMetaFile).toString Source.fromFile(metaFile).getLines().toList.head.toInt } private def readEmbeddings( folder: String, spark: SparkSession, embeddingsDim: Int, normalize: Boolean ): RocksDBConnection = { StorageHelper.load( Paths.get(folder, embeddingsFile).toString, spark, "python_tf_model", "python_tf_ref", false ) } def readLocal(folder: String, dim: Int, useBundle: Boolean = false, verbose: Verbose.Level = Verbose.All, tags: Array[String] = Array.empty[String]): TensorflowNer = { val labels = readTags(folder) val chars = readChars(folder) val settings = DatasetEncoderParams(labels, chars, Array.fill(dim)(0f).toList, dim) val encoder = new NerDatasetEncoder(settings) val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags) new TensorflowNer(tf, encoder, 32, verbose) } def read( folder: String, dim: Int, spark: SparkSession, useBundle: Boolean = false, tags: Array[String] = Array.empty[String]): NerDLModel = { val uri = new java.net.URI(folder.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_bundle") .toAbsolutePath.toString fs.copyToLocalFile(new Path(folder), new Path(tmpFolder)) val nerModel = readLocal(tmpFolder, dim, useBundle, tags = tags) FileHelper.delete(tmpFolder) new NerDLModel() .setModelIfNotSet(spark, nerModel.tensorflow) .setDatasetParams(nerModel.encoder.params) } }
Example 122
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.io.File import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.{SparkContext, SparkFiles} import org.apache.spark.sql.SparkSession object StorageHelper { def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString def load( storageSourcePath: String, spark: SparkSession, database: String, storageRef: String, withinStorage: Boolean ): RocksDBConnection = { val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef) val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage) val locator = StorageLocator(database, storageRef, spark) sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext) RocksDBConnection.getOrCreate(locator.clusterFileName) } def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = { val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath) val index = new Path(indexUri) val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val dst = new Path(path+{if (withinStorage) "/storage/" else ""}) save(fs, index, dst) } private def save(fs: FileSystem, index: Path, dst: Path): Unit = { if (!fs.exists(dst)) fs.mkdirs(dst) fs.copyFromLocalFile(false, true, index, dst) } def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = { if (destinationScheme == "file") { copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext) } else { copyIndexToCluster(source, clusterFilePath, sparkContext) } } private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = { if (!new File(SparkFiles.get(dst.getName)).exists()) { val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration) val dstFS = dst.getFileSystem(spark.hadoopConfiguration) if (srcFS.getScheme == "file") { val src = sourcePath dstFS.copyFromLocalFile(false, true, src, dst) } else { FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration) } spark.addFile(dst.toString, recursive = true) } dst.toString } private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = { val fs = source.getFileSystem(context.hadoopConfiguration) if (!fs.exists(destination)) fs.copyFromLocalFile(false, true, source, destination) } }
Example 123
Source File: StorageLocator.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.util.UUID import com.johnsnowlabs.util.ConfigHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession case class StorageLocator(database: String, storageRef: String, sparkSession: SparkSession) { private val fs = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration) private val clusterTmpLocation: String = { val tmpLocation = ConfigHelper.getConfigValue(ConfigHelper.storageTmpDir).map(p => new Path(p)).getOrElse( sparkSession.sparkContext.hadoopConfiguration.get("hadoop.tmp.dir") ).toString+"/"+UUID.randomUUID().toString.takeRight(12)+"_cdx" val tmpLocationPath = new Path(tmpLocation) fs.mkdirs(tmpLocationPath) fs.deleteOnExit(tmpLocationPath) tmpLocation } val clusterFileName: String = { StorageHelper.resolveStorageName(database, storageRef) } val clusterFilePath: Path = { Path.mergePaths(new Path(fs.getUri.toString + clusterTmpLocation), new Path("/"+clusterFileName)) } val destinationScheme: String = { fs.getScheme } } object StorageLocator { def getStorageSerializedPath(path: String, folder: String, withinStorage: Boolean): Path = Path.mergePaths(new Path(path), new Path((if (withinStorage) "/storage/" else "/")+folder)) }
Example 124
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage import java.io.{BufferedReader, FileNotFoundException, InputStreamReader} import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.FileAlreadyExistsException import java.util.UUID import scala.collection.JavaConverters._ import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession protected def writeWithRename( path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = { val fs = path.getFileSystem(getHadoopConfiguration) if (!fs.exists(path.getParent)) { throw new FileNotFoundException(s"No such file or directory: ${path.getParent}") } if (overwrite) { val stream = fs.create(path, true) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) } finally { stream.close() } } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } val tempPath = createTempPath(path) var streamClosed = false // This flag is to avoid double close var renameDone = false // This flag is to save the delete operation in most of cases. val stream = fs.create(tempPath) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) stream.close() streamClosed = true try { if (fs.rename(tempPath, path)) { renameDone = true } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } else { throw new IllegalStateException(s"Cannot rename $tempPath to $path") } } } catch { case _: org.apache.hadoop.fs.FileAlreadyExistsException => throw new FileAlreadyExistsException(path.toString) } } finally { if (!streamClosed) { stream.close() } if (!renameDone) { fs.delete(tempPath, false) } } } } protected def createTempPath(path: Path): Path = { new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp") } override def invalidateCache(): Unit = {} }
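The write-then-rename idea used by writeWithRename can be reproduced in isolation. The sketch below is not Delta's implementation, just a minimal standalone illustration of the same pattern (the object and paths are hypothetical): write to a hidden temp file next to the target, close the stream, then use rename as the commit step and clean up the temp file if the rename fails.

import java.nio.charset.StandardCharsets.UTF_8
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object RenameBasedWrite {

  def writeAtomically(fs: FileSystem, target: Path, lines: Seq[String]): Unit = {
    // hidden temp file in the same directory so the rename stays within one filesystem
    val temp = new Path(target.getParent, s".${target.getName}.${UUID.randomUUID}.tmp")
    val out = fs.create(temp)
    try lines.foreach(l => out.write((l + "\n").getBytes(UTF_8)))
    finally out.close()

    // the rename is the "commit"; on failure, remove the temp file and surface the error
    if (!fs.rename(temp, target)) {
      fs.delete(temp, false)
      throw new IllegalStateException(s"Could not rename $temp to $target")
    }
  }

  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    writeAtomically(fs, new Path("/tmp/demo/_commit"), Seq("action1", "action2"))   // hypothetical path
  }
}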
Example 125
Source File: Job.scala From spark-avro-compactor with Apache License 2.0 | 5 votes |
package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)
    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for paths with the s3 scheme the default block size is 64MB and can be overridden via fs.s3.block.size;
    // for paths with the s3a scheme the default is 32MB and can be overridden via fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | numPartitions: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
}
Example 126
Source File: BinaryFileReader.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark import com.microsoft.ml.spark.core.env.StreamUtilities import com.microsoft.ml.spark.core.schema.BinaryFileSchema import com.microsoft.ml.spark.core.utils.AsyncUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.binary.BinaryFileFormat import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.binary.ConfUtils import org.apache.spark.sql.types.BinaryType import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.Duration object BinaryFileReader { private def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean, visitedSymlinks: Set[Path]): Array[Path] ={ val filteredPaths = fileSystem.listStatus(path).filter(pathFilter) val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath)) val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath) filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath) .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound)) } def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] ={ recursePath(fileSystem, path, pathFilter, Set()) } def readFromPaths(df: DataFrame, pathCol: String, bytesCol: String, concurrency: Int, timeout: Int ): DataFrame = { val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true) val encoder = RowEncoder(outputSchema) val hconf = ConfUtils.getHConf(df) df.mapPartitions { rows => val futures = rows.map {row: Row => Future { val path = new Path(row.getAs[String](pathCol)) val fs = path.getFileSystem(hconf.value) val bytes = StreamUtilities.using(fs.open(path)) {is => IOUtils.toByteArray(is)}.get val ret = Row.merge(Seq(row, Row(bytes)): _*) ret }(ExecutionContext.global) } AsyncUtils.bufferedAwait( futures,concurrency, Duration.fromNanos(timeout*(20^6).toLong))(ExecutionContext.global) }(encoder) } }
Example 127
Source File: IndexedBinaryBlockReader.scala From hail with MIT License | 5 votes |
package is.hail.io import is.hail.annotations.RegionValueBuilder import is.hail.io.fs.{HadoopFS, WrappedSeekableDataInputStream} import org.apache.commons.logging.{Log, LogFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.LongWritable import org.apache.hadoop.mapred._ abstract class KeySerializedValueRecord[K] extends Serializable { var input: Array[Byte] = _ var key: K = _ def setSerializedValue(arr: Array[Byte]) { this.input = arr } def getValue(rvb: RegionValueBuilder, includeGT: Boolean): Unit def setKey(k: K) { this.key = k } def getKey: K = key } abstract class IndexedBinaryBlockReader[T](job: Configuration, split: FileSplit) extends RecordReader[LongWritable, T] { val LOG: Log = LogFactory.getLog(classOf[IndexedBinaryBlockReader[T]].getName) val partitionStart: Long = split.getStart var pos: Long = partitionStart val end: Long = partitionStart + split.getLength val bfis = openFile() def openFile(): HadoopFSDataBinaryReader = { val file: Path = split.getPath val fs: FileSystem = file.getFileSystem(job) val is = fs.open(file) new HadoopFSDataBinaryReader( new WrappedSeekableDataInputStream( HadoopFS.toSeekableInputStream(is))) } def createKey(): LongWritable = new LongWritable() def createValue(): T def getPos: Long = pos def getProgress: Float = { if (partitionStart == end) 0.0f else Math.min(1.0f, (pos - partitionStart) / (end - partitionStart).toFloat) } def close() = bfis.close() }
Example 128
Source File: HadoopFSHelpers.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.api.io.fs

import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import org.apache.hadoop.fs.{FileSystem, Path}
import org.opencypher.morpheus.api.io.util.FileSystemUtils.using

object HadoopFSHelpers {

  implicit class RichHadoopFileSystem(fileSystem: FileSystem) {

    protected def createDirectoryIfNotExists(path: Path): Unit = {
      if (!fileSystem.exists(path)) {
        fileSystem.mkdirs(path)
      }
    }

    def listDirectories(path: String): List[String] = {
      val p = new Path(path)
      createDirectoryIfNotExists(p)
      fileSystem.listStatus(p)
        .filter(_.isDirectory)
        .map(_.getPath.getName)
        .toList
    }

    def deleteDirectory(path: String): Unit = {
      fileSystem.delete(new Path(path), true)
    }

    def readFile(path: String): String = {
      using(new BufferedReader(new InputStreamReader(fileSystem.open(new Path(path)), "UTF-8"))) { reader =>
        def readLines = Stream.cons(reader.readLine(), Stream.continually(reader.readLine))
        readLines.takeWhile(_ != null).mkString
      }
    }

    def writeFile(path: String, content: String): Unit = {
      val p = new Path(path)
      val parentDirectory = p.getParent
      createDirectoryIfNotExists(parentDirectory)
      using(fileSystem.create(p)) { outputStream =>
        using(new BufferedWriter(new OutputStreamWriter(outputStream, "UTF-8"))) { bufferedWriter =>
          bufferedWriter.write(content)
        }
      }
    }
  }
}
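A short usage sketch for the implicit class above, assuming the Morpheus helpers are on the classpath; the path and JSON content are made up for illustration. Importing HadoopFSHelpers._ enriches any FileSystem with readFile, writeFile and listDirectories.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.opencypher.morpheus.api.io.fs.HadoopFSHelpers._

object HadoopFSHelpersUsage {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())

    // write a small text file, creating parent directories as needed
    fs.writeFile("/tmp/graphs/nodes/schema.json", """{"columns": ["id", "name"]}""")   // hypothetical path

    // read it back as a single string and list the directories next to it
    println(fs.readFile("/tmp/graphs/nodes/schema.json"))
    println(fs.listDirectories("/tmp/graphs"))
  }
}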
Example 129
Source File: StreamMetadata.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = FileSystem.get(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 130
Source File: ExecutorSource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConverters._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
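The gauges above read from FileSystem.Statistics, which can also be inspected directly. A minimal standalone sketch (the object name is hypothetical) that prints the per-scheme I/O counters collected in the current JVM:

import org.apache.hadoop.fs.FileSystem

import scala.collection.JavaConverters._

object FileSystemStatsDump {
  def main(args: Array[String]): Unit = {
    // one Statistics entry per filesystem scheme this JVM has touched (hdfs, file, s3a, ...)
    FileSystem.getAllStatistics.asScala.foreach { s =>
      println(s"${s.getScheme}: readBytes=${s.getBytesRead} writtenBytes=${s.getBytesWritten} readOps=${s.getReadOps}")
    }
  }
}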
Example 131
Source File: EventHistoryReporter.scala From sparklens with Apache License 2.0 | 5 votes |
package com.qubole.sparklens.app import java.io.{BufferedInputStream, InputStream} import java.net.URI import com.ning.compress.lzf.LZFInputStream import com.qubole.sparklens.QuboleJobListener import com.qubole.sparklens.common.Json4sWrapper import com.qubole.sparklens.helper.HDFSConfigHelper import net.jpountz.lz4.LZ4BlockInputStream import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf import org.json4s.DefaultFormats import org.xerial.snappy.SnappyInputStream class EventHistoryReporter(file: String, extraConf: List[(String, String)] = List.empty) { // This is using reflection in spark-2.0.0 ReplayListenerBus val busKlass = Class.forName("org.apache.spark.scheduler.ReplayListenerBus") val bus = busKlass.newInstance() val addListenerMethod = busKlass.getMethod("addListener", classOf[Object]) val conf = new SparkConf() .set("spark.sparklens.reporting.disabled", "false") .set("spark.sparklens.save.data", "false") extraConf.foreach(x => { conf.set(x._1, x._2) }) val listener = new QuboleJobListener(conf) addListenerMethod.invoke(bus, listener) try { val replayMethod = busKlass.getMethod("replay", classOf[InputStream], classOf[String], classOf[Boolean]) replayMethod.invoke(bus, getDecodedInputStream(file, conf), file, boolean2Boolean(false)) } catch { case _: NoSuchMethodException => // spark binaries are 2.1* and above val replayMethod = busKlass.getMethod("replay", classOf[InputStream], classOf[String], classOf[Boolean], classOf[String => Boolean]) replayMethod.invoke(bus, getDecodedInputStream(file, conf), file, boolean2Boolean(false), getFilter _) case x: Exception => { println(s"Failed replaying events from ${file} [${x.getMessage}]") } } // Borrowed from CompressionCodecs in spark private def getDecodedInputStream(file: String, conf: SparkConf): InputStream = { val fs = FileSystem.get(new URI(file), HDFSConfigHelper.getHadoopConf(Some(conf))) val path = new Path(file) val bufStream = new BufferedInputStream(fs.open(path)) val logName = path.getName.stripSuffix(".inprogress") val codecName: Option[String] = logName.split("\\.").tail.lastOption codecName.getOrElse("") match { case "lz4" => new LZ4BlockInputStream(bufStream) case "lzf" => new LZFInputStream(bufStream) case "snappy" => new SnappyInputStream(bufStream) case _ => bufStream } } private def getFilter(eventString: String): Boolean = { implicit val formats = DefaultFormats eventFilter.contains(Json4sWrapper.parse(eventString).extract[Map[String, Any]].get("Event") .get.asInstanceOf[String]) } private def eventFilter: Set[String] = { Set( "SparkListenerTaskEnd", "SparkListenerApplicationStart", "SparkListenerApplicationEnd", "SparkListenerExecutorAdded", "SparkListenerExecutorRemoved", "SparkListenerJobStart", "SparkListenerJobEnd", "SparkListenerStageSubmitted", "SparkListenerStageCompleted" ) } }
Example 132
Source File: ExecutorDelegationTokenUpdater.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(hadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 133
Source File: SimrSchedulerBackend.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 134
Source File: ExecutorSource.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import java.util.concurrent.ThreadPoolExecutor

import scala.collection.JavaConversions._

import com.codahale.metrics.{Gauge, MetricRegistry}
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.metrics.source.Source

private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source {

  private def fileStats(scheme: String): Option[FileSystem.Statistics] =
    FileSystem.getAllStatistics().find(s => s.getScheme.equals(scheme))

  private def registerFileSystemStat[T](
      scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = {
    metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] {
      override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue)
    })
  }

  override val metricRegistry = new MetricRegistry()

  override val sourceName = "executor"

  // Gauge for executor thread pool's actively executing task counts
  metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] {
    override def getValue: Int = threadPool.getActiveCount()
  })

  // Gauge for executor thread pool's approximate total number of tasks that have been completed
  metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] {
    override def getValue: Long = threadPool.getCompletedTaskCount()
  })

  // Gauge for executor thread pool's current number of threads
  metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getPoolSize()
  })

  // Gauge for executor thread pool's largest number of threads that have ever simultaneously
  // been in the pool
  metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] {
    override def getValue: Int = threadPool.getMaximumPoolSize()
  })

  // Gauge for file system stats of this executor
  for (scheme <- Array("hdfs", "file")) {
    registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L)
    registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L)
    registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0)
    registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0)
    registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0)
  }
}
Example 135
Source File: FeatureSelection.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training import java.io.BufferedWriter import java.io.OutputStreamWriter import java.util import com.airbnb.aerosolve.core.{ModelRecord, ModelHeader, FeatureVector, Example} import com.airbnb.aerosolve.core.models.LinearModel import com.airbnb.aerosolve.core.util.Util import com.typesafe.config.Config import org.slf4j.{LoggerFactory, Logger} import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Buffer import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random import scala.math.abs import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object FeatureSelection { private final val log: Logger = LoggerFactory.getLogger("FeatureSelection") val allKey : (String, String) = ("$ALL", "$POS") // Given a RDD compute the pointwise mutual information between // the positive label and the discrete features. def pointwiseMutualInformation(examples : RDD[Example], config : Config, key : String, rankKey : String, posThreshold : Double, minPosCount : Double, newCrosses : Boolean) : RDD[((String, String), Double)] = { val pointwise = LinearRankerUtils.makePointwise(examples, config, key, rankKey) val features = pointwise .mapPartitions(part => { // The tuple2 is var, var | positive val output = scala.collection.mutable.HashMap[(String, String), (Double, Double)]() part.foreach(example =>{ val featureVector = example.example.get(0) val isPos = if (featureVector.floatFeatures.get(rankKey).asScala.head._2 > posThreshold) 1.0 else 0.0 val all : (Double, Double) = output.getOrElse(allKey, (0.0, 0.0)) output.put(allKey, (all._1 + 1.0, all._2 + 1.0 * isPos)) val features : Array[(String, String)] = LinearRankerUtils.getFeatures(featureVector) if (newCrosses) { for (i <- features) { for (j <- features) { if (i._1 < j._1) { val key = ("%s<NEW>%s".format(i._1, j._1), "%s<NEW>%s".format(i._2, j._2)) val x = output.getOrElse(key, (0.0, 0.0)) output.put(key, (x._1 + 1.0, x._2 + 1.0 * isPos)) } } } } for (feature <- features) { val x = output.getOrElse(feature, (0.0, 0.0)) output.put(feature, (x._1 + 1.0, x._2 + 1.0 * isPos)) } }) output.iterator }) .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) .filter(x => x._2._2 >= minPosCount) val allCount = features.filter(x => x._1.equals(allKey)).take(1).head features.map(x => { val prob = x._2._1 / allCount._2._1 val probPos = x._2._2 / allCount._2._2 (x._1, math.log(probPos / prob) / math.log(2.0)) }) } // Returns the maximum entropy per family def maxEntropy(input : RDD[((String, String), Double)]) : RDD[((String, String), Double)] = { input .map(x => (x._1._1, (x._1._2, x._2))) .reduceByKey((a, b) => if (math.abs(a._2) > math.abs(b._2)) a else b) .map(x => ((x._1, x._2._1), x._2._2)) } }
Example 136
Source File: HDFSUtil.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.util import java.io.{BufferedReader, IOException, InputStreamReader} import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object HDFSUtil extends ScalaLogging { private lazy val hadoopConfiguration = new Configuration() def lastTaskSucceed(path: String): Boolean = { if (dirExists(path)) { if (dirExists(path + "/_temporary")) { logger.info(s"Deleting partial data for $path.") deleteDirWithoutThrow(path) false } else { logger.info(s"$path exists") true } } else { logger.info(s"$path does not exist") false } } def dirExists(dir: String): Boolean = { val path = new Path(dir) val hdfs = FileSystem.get( new java.net.URI(dir), hadoopConfiguration) hdfs.exists(path) } def deleteDirWithoutThrow(dir: String): Unit = { val path = new Path(dir) val hdfs = FileSystem.get( new java.net.URI(dir), hadoopConfiguration) if (hdfs.exists(path)) { logger.warn(s"$dir exists, DELETING") try { hdfs.delete(path, true) } catch { case e: IOException => logger.error(s" exception $e") } } } def createPath(path: String): Unit = { val remotePath = new Path(path) val remoteFS = remotePath.getFileSystem(hadoopConfiguration) remoteFS.mkdirs(new Path(path)) } def readStringFromFile(inputFile : String): String = { val fs = FileSystem.get(new URI(inputFile), hadoopConfiguration) val path = new Path(inputFile) val stream = fs.open(path) val reader = new BufferedReader(new InputStreamReader(stream)) val str = Stream.continually(reader.readLine()).takeWhile(_ != null).mkString("\n") str } }
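A small usage sketch for HDFSUtil, assuming the object above is available on the classpath; the model directory and the side-file name are hypothetical. It checks whether a previous run completed, creates the directory if needed, and reads a small text file as one string.

import com.airbnb.common.ml.util.HDFSUtil

object HDFSUtilUsage {
  def main(args: Array[String]): Unit = {
    val modelDir = "hdfs:///tmp/models/run_1"        // hypothetical path

    // lastTaskSucceed also deletes leftover partial output (_temporary) from a failed run
    if (!HDFSUtil.lastTaskSucceed(modelDir)) {
      HDFSUtil.createPath(modelDir)
    }

    // read a small side file into a single string
    println(HDFSUtil.readStringFromFile(modelDir + "/params.txt"))
  }
}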
Example 137
Source File: WriteTransformer.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations import java.io.{File, IOException} import scala.reflect.runtime.{universe => ru} import ai.deepsense.commons.utils.Version import ai.deepsense.commons.utils.FileOperations.deleteRecursivelyIfExists import ai.deepsense.deeplang.DOperation.Id import ai.deepsense.deeplang.documentation.OperationDocumentation import ai.deepsense.deeplang.doperables.Transformer import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import ai.deepsense.deeplang.params.{BooleanParam, Params, StringParam} import ai.deepsense.deeplang.{DOperation1To0, ExecutionContext} import java.net.URI import org.apache.hadoop.fs.{FileSystem, Path} case class WriteTransformer() extends DOperation1To0[Transformer] with Params with OperationDocumentation { override val id: Id = "58368deb-68d0-4657-ae3f-145160cb1e2b" override val name: String = "Write Transformer" override val description: String = "Writes a Transformer to a directory" override val since: Version = Version(1, 1, 0) val shouldOverwrite = BooleanParam( name = "overwrite", description = Some("Should an existing transformer with the same name be overwritten?") ) setDefault(shouldOverwrite, true) def getShouldOverwrite: Boolean = $(shouldOverwrite) def setShouldOverwrite(value: Boolean): this.type = set(shouldOverwrite, value) val outputPath = StringParam( name = "output path", description = Some("The output path for writing the Transformer.")) def getOutputPath: String = $(outputPath) def setOutputPath(value: String): this.type = set(outputPath, value) val specificParams: Array[ai.deepsense.deeplang.params.Param[_]] = Array(outputPath, shouldOverwrite) override protected def execute(transformer: Transformer)(context: ExecutionContext): Unit = { val outputDictPath = getOutputPath try { if (getShouldOverwrite) { removeDirectory(context, outputDictPath) } transformer.save(context, outputDictPath) } catch { case e: IOException => logger.error(s"WriteTransformer error. Could not write transformer to the directory", e) throw DeepSenseIOException(e) } } private def removeDirectory(context: ExecutionContext, path: String): Unit = { if (path.startsWith("hdfs://")) { val configuration = context.sparkContext.hadoopConfiguration val hdfs = FileSystem.get(new URI(extractHdfsAddress(path)), configuration) hdfs.delete(new Path(path), true) } else { deleteRecursivelyIfExists(new File(path)) } } private def extractHdfsAddress(path: String): String = { // first group: "hdfs://ip.addr.of.hdfs", second group: "/some/path/on/hdfs" val regex = "(hdfs:\\/\\/[^\\/]*)(.*)".r val regex(hdfsAddress, _) = path hdfsAddress } @transient override lazy val tTagTI_0: ru.TypeTag[Transformer] = ru.typeTag[Transformer] } object WriteTransformer { def apply(outputPath: String): WriteTransformer = { new WriteTransformer().setOutputPath(outputPath) } }
Example 138
Source File: FileDownloader.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage import java.io.{BufferedWriter, FileOutputStream, IOException, OutputStreamWriter} import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import ai.deepsense.deeplang.doperations.readwritedataframe.FilePath private[filestorage] object FileDownloader { def downloadFile(url: String)(implicit context: ExecutionContext): FilePath = { if (context.tempPath.startsWith("hdfs://")) { downloadFileToHdfs(url) } else { downloadFileToDriver(url) } } private def downloadFileToHdfs(url: String)(implicit context: ExecutionContext) = { val content = scala.io.Source.fromURL(url).getLines() val hdfsPath = s"${context.tempPath}/${UUID.randomUUID()}" val configuration = new Configuration() val hdfs = FileSystem.get(configuration) val file = new Path(hdfsPath) val hdfsStream = hdfs.create(file) val writer = new BufferedWriter(new OutputStreamWriter(hdfsStream)) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) hdfs.close() } FilePath(hdfsPath) } private def downloadFileToDriver(url: String) (implicit context: ExecutionContext) = { val outputDirPath = Paths.get(context.tempPath) // We're checking if the output is a directory following symlinks. // The default behaviour of createDirectories is NOT to follow symlinks if (!Files.isDirectory(outputDirPath)) { Files.createDirectories(outputDirPath) } val outFilePath = Files.createTempFile(outputDirPath, "download", ".csv") // content is a stream. Do not invoke stuff like .toList() on it. val content = scala.io.Source.fromURL(url).getLines() val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFilePath.toFile))) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) } FilePath(s"file:///$outFilePath") } private def safeClose(bufferedWriter: BufferedWriter): Unit = { try { bufferedWriter.flush() bufferedWriter.close() } catch { case e: IOException => throw new DeepSenseIOException(e) } } }
Example 139
Source File: DQMainClass.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils import java.util.Locale import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveContext trait DQMainClass { this: DQSparkContext with Logging => private def initLogger(): Unit = { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("io.netty").setLevel(Level.OFF) Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF) Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF) } private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = { if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration) else { if (settings.s3Bucket.isDefined) { sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get) sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") } FileSystem.get( sc.hadoopConfiguration) } } protected def body()(implicit fs: FileSystem, sparkContext: SparkContext, sqlContext: SQLContext, sqlWriter: HistoryDBManager, settings: DQSettings): Boolean def preMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Starting execution of task $task") log.warn("************************************************************************") } def postMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Finishing execution of task $task") log.warn("************************************************************************") } def main(args: Array[String]): Unit = { // set to avoid casting problems in metric result name generation Locale.setDefault(Locale.ENGLISH) initLogger() DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("","")) match { case Some(commandLineOptions) => // Load our own config values from the default location, application.conf val settings = new DQSettings(commandLineOptions) val sparkContext = makeSparkContext(settings) val fs = makeFileSystem(settings, sparkContext) settings.logThis()(log) val sqlContext: SQLContext = if (settings.hiveDir.isDefined) { val hc = new HiveContext(sparkContext) hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get) hc } else makeSqlContext(sparkContext) val historyDatabase = new HistoryDBManager(settings) // Starting application body preMessage(s"{${settings.appName}}") val startTime = System.currentTimeMillis() body()(fs, sparkContext, sqlContext, historyDatabase, settings) postMessage(s"{${settings.appName}}") log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)") log.info("Closing application...") historyDatabase.closeConnection() sparkContext.stop() log.info("Spark context were terminated. Exiting...") case None => log.error("Wrong parameters provided") throw new Exception("Wrong parameters provided") } } }
Example 140
Source File: TransposePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.JavaConversions._ final class TransposePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings: DQSettings) { private val vs = config.getString("source") private val keys = config.getStringList("keyColumns") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { import sqlContext.implicits._ def toLong(df: DataFrame, by: Seq[String]): DataFrame = { val (cols, types) = df.dtypes.filter { case (c, _) => !by.contains(c) }.unzip require(types.distinct.length == 1) val kvs = explode( array( cols.map(c => struct(lit(c).alias(settings.backComp.trKeyName), col(c).alias(settings.backComp.trValueName))): _* )) val byExprs = by.map(col) df.select(byExprs :+ kvs.alias("_kvs"): _*) .select(byExprs ++ Seq($"_kvs.${settings.backComp.trKeyName}", $"_kvs.${settings.backComp.trValueName}"): _*) } val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val transposed: DataFrame = toLong(df, keys) HdfsWriter.saveVirtualSource(transposed, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 141
Source File: EnrichPostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import java.util import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.functions.lit import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.JavaConversions._ import scala.util.Try final class EnrichPostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private val vs: Option[String] = Try(config.getString("source")).toOption private val metrics: util.List[String] = config.getStringList("metrics") private val checks: util.List[String] = config.getStringList("checks") private val extra = config.getObject("extra").toMap private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { import sqlContext.implicits._ val df: DataFrame = vs match { case Some(vsource) => val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vsource).head HdfsReader.load(reqVS, settings.ref_date).head case None => sqlContext.sparkContext.parallelize(Seq(1)).toDF("teapot") } val reqMet: Seq[(String, Double)] = metRes .filter(mr => metrics.contains(mr.metricId)) .map(mr => mr.metricId -> mr.result) val reqCheck: Seq[(String, String)] = chkRes .filter(cr => checks.contains(cr.checkId)) .map(cr => cr.checkId -> cr.status) if (reqMet.size != metrics.size()) throw IllegalParameterException("Some of stated metrics are missing!") if (reqCheck.size != checks.size()) throw IllegalParameterException("Some of stated checks are missing!") val dfWithMet: DataFrame = reqMet.foldLeft(df)((df, met) => df.withColumn(met._1, lit(met._2))) val dfWithChecks = reqCheck.foldLeft(dfWithMet)((df, met) => df.withColumn(met._1, lit(met._2))) val dfWithExtra = extra.foldLeft(dfWithChecks)((df, ex) => df.withColumn(ex._1, lit(ex._2.unwrapped()))) HdfsWriter.saveVirtualSource( dfWithExtra.drop("teapot"), target, settings.refDateString)(fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 142
Source File: ArrangePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType} import org.apache.spark.sql.{Column, DataFrame, SQLContext} import scala.collection.JavaConversions._ final class ArrangePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) { def toColumn()(implicit df: DataFrame): Column = { val dataType: Option[NumericType with Product with Serializable] = tipo.getOrElse("").toUpperCase match { case "DOUBLE" => Some(DoubleType) case "INT" => Some(IntegerType) case "LONG" => Some(LongType) case _ => None } import org.apache.spark.sql.functions.format_number import org.apache.spark.sql.functions.format_string (dataType, precision, format) match { case (Some(dt), None, None) => df(name).cast(dt) case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name) case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name) case (None, Some(p), None) => format_number(df(name), p).alias(name) case (None, None, Some(f)) => format_string(f, df(name)).alias(name) case _ => df(name) } } } private val vs = config.getString("source") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } private val columns: Seq[ColumnSelector] = config.getAnyRefList("columnOrder").map { case x: String => ColumnSelector(x) case x: java.util.HashMap[_, String] => { val (name, v) = x.head.asInstanceOf[String Tuple2 _] v match { case v: String => ColumnSelector(name, Option(v)) case v: java.util.HashMap[String, _] => { val k = v.head._1 val f = v.head._2 f match { case f: Integer => ColumnSelector(name, Option(k), None, Option(f)) case f: String => ColumnSelector(name, Option(k), Option(f)) } } } } } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val arrangeDF = df.select(columns.map(_.toColumn): _*) HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 143
Source File: PostprocessorType.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.utils.DQSettings import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SQLContext object PostprocessorType extends Enumeration { val enrich: PostprocessorVal = PostprocessorVal("enrich", classOf[EnrichPostprocessor]) val transpose: PostprocessorVal = PostprocessorVal("transpose_by_key", classOf[TransposePostprocessor]) val headless: PostprocessorVal = PostprocessorVal("transpose_by_column", classOf[TransposeByColumnPostprocessor]) val arrange: PostprocessorVal = PostprocessorVal("arrange", classOf[ArrangePostprocessor]) protected case class PostprocessorVal(name: String, service: Class[_ <: BasicPostprocessor]) extends super.Val() { override def toString(): String = this.name } implicit def convert(value: Value): PostprocessorVal = value.asInstanceOf[PostprocessorVal] } abstract class BasicPostprocessor(config: Config, settings: DQSettings){ def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])(implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile }
Example 144
Source File: SparkTestSpec.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
import com.typesafe.config._ import it.agilelab.bigdata.DataQuality.configs.ConfigReader import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetric, FileMetric, MetricProcessor} import it.agilelab.bigdata.DataQuality.sources.{HdfsFile, SourceConfig} import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} import org.joda.time import org.scalatest.{BeforeAndAfterAll, FunSuite} import scala.util.Random class SparkTestSpec extends FunSuite with BeforeAndAfterAll { @transient private var _sc: SparkContext = _ def sc: SparkContext = _sc @transient private var _fs: FileSystem = _ def fs: FileSystem = _fs val SAMPLE_SIZE = 100 val r = Random r.setSeed(123) val settings: DQSettings = new DQSettings( conf = ConfigFactory.parseURL(getClass.getResource("/application.conf")).getConfig("dataquality"), configFilePath = getClass.getResource("/conf/test.conf").getPath, repartition = false, local = true, ref_date = time.DateTime.now() ) val conf = new SparkConf().setAppName(settings.appName) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.kryoserializer.buffer.max", "128") .set("spark.sql.parquet.compression.codec", "snappy") .setMaster("local[*]") val localSqlWriter = new HistoryDBManager(settings) override def beforeAll() { _sc = new SparkContext(conf) _fs = FileSystem.getLocal(_sc.hadoopConfiguration) super.beforeAll() } test("parse basic conf") { val configuration = new ConfigReader(settings.configFilePath)(localSqlWriter, settings) val testSource: HdfsFile = HdfsFile("T1", "./t1.csv", "csv", true, "2018-03-26", None) val sources: Map[String, SourceConfig] = configuration.sourcesConfigMap assert(sources.keySet.size == 3, "Should be equal 3") assert(sources.keySet == Set("T1","T2","T3")) assert(sources("T1") === testSource) assert(true) } case class TestRow( str: String = r.nextString(5), int: Int = r.nextInt(), long: Long = r.nextLong(), double: Double = r.nextDouble(), boolean: Boolean = r.nextBoolean() ) test("run basic metrics") { val sqlContext = new SQLContext(sc) val input: DataFrame = sqlContext.createDataFrame(List.fill(SAMPLE_SIZE)(TestRow.apply())) val metric = FileMetric("123","ROW_COUNT","","input","2018-12-12",Map.empty) val res: (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], Map[FileMetric, (Double, Option[String])]) = MetricProcessor.processAllMetrics(input, Seq.empty, Seq(metric), Seq.empty)(settings,sc, sqlContext, fs) assert(res._2(metric)._1 == SAMPLE_SIZE) } override def afterAll() { sc.stop() System.clearProperty("spark.driver.port") _sc = null super.afterAll() } }
Example 145
Source File: AvroInOutTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.avro import java.io.{File, FileNotFoundException, FileWriter} import java.nio.file.Paths import com.salesforce.op.test.TestSparkContext import com.salesforce.op.utils.io.avro.AvroInOut._ import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class AvroInOutTest extends FlatSpec with TestSparkContext { val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc" val avroFilePath = s"$testDataDir/PassengerDataAll.avro" val avroFileRecordCount = 891 val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration) lazy val avroTemp: String = tempDir + "/avro-inout-test" Spec(AvroInOut.getClass) should "creates RDD from an avro file" in { val res = readPathSeq(avroFilePath, withCount = true, deepCopy = true, persist = false) res shouldBe a[RDD[_]] res.count shouldBe avroFileRecordCount } it should "creates RDD from a sequence of avro files" in { val res = readPathSeq(s"$avroFilePath,$avroFilePath") res.count shouldBe avroFileRecordCount*2 } it should "create RDD from a mixed sequence of valid and invalid avro files" in { val res = readPathSeq(s"badfile/path1,$avroFilePath,badfile/path2,$avroFilePath,badfile/path3") res.count shouldBe avroFileRecordCount*2 } it should "throw an error if passed in avro files are invalid" in { val error = intercept[IllegalArgumentException](readPathSeq("badfile/path1,badfile/path2")) error.getMessage shouldBe "No valid directory found in path 'badfile/path1,badfile/path2'" } it should "creates Some(RDD) from an avro file" in { val res = read(avroFilePath) res.size shouldBe 1 res.get shouldBe an[RDD[_]] res.get.count shouldBe avroFileRecordCount } it should "create None from an invalid avro file" in { val res = read("badfile/path") res shouldBe None } Spec[AvroWriter[_]] should "writeAvro to filesystem" in { val avroData = readPathSeq(avroFilePath).asInstanceOf[RDD[GenericRecord]] val avroSchema = loadFile(avroSchemaPath) val error = intercept[FileNotFoundException](hdfs.listStatus(new Path(avroTemp))) error.getMessage shouldBe s"File $avroTemp does not exist" AvroWriter(avroData).writeAvro(avroTemp, avroSchema) val hdfsFiles = hdfs.listStatus(new Path(avroTemp)) filter (x => x.getPath.getName.contains("part")) val res = readPathSeq((for { x <- hdfsFiles } yield avroTemp + "/" + x.getPath.getName).mkString(",")) res.count shouldBe avroFileRecordCount } it should "checkPathsExist" in { val tmpDir = Paths.get(File.separator, "tmp").toFile val f1 = new File(tmpDir, "avroinouttest") f1.delete() val w = new FileWriter(f1) w.write("just checking") w.close() val f2 = new File(tmpDir, "thisfilecannotexist") f2.delete() val f3 = new File(tmpDir, "this file cannot exist") f3.delete() assume(f1.exists && !f2.exists && !f3.exists) // check for one dir being invalid in the path amongst two selectExistingPaths(s"$f1,$f2") shouldBe f1.toString // check if all dirs in the path are invalid then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f2.toString) } // also, check if all dirs in the path are invalid ( in a different way ) then we get an exception intercept[IllegalArgumentException] { selectExistingPaths(f3.toString) } // check for one dir being invalid ( in a different way ) in the path amongst the two dirs in it selectExistingPaths(s"$f1,$f3") shouldBe f1.toString // check for paths order insensitivity 
selectExistingPaths(s"$f3,$f1") shouldBe f1.toString // check for an exception if the path is an empty string intercept[IllegalArgumentException] { selectExistingPaths("") } } }
Example 146
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.spotify.spark.bigquery import com.google.api.services.bigquery.model.TableReference import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, BigQueryStrings} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrame, SQLContext} import com.databricks.spark.avro._ import scala.util.Random object CreateDisposition extends Enumeration { val CREATE_IF_NEEDED, CREATE_NEVER = Value } object WriteDisposition extends Enumeration { val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value } class BigQueryDataFrame(df: DataFrame) { val sqlContext: SQLContext = df.sqlContext val conf: Configuration = sqlContext.sparkContext.hadoopConfiguration val bq: BigQueryClient = BigQueryClient.getInstance(conf) sqlContext.setConf("spark.sql.avro.compression.codec", "deflate") def saveAsBigQueryTable(tableSpec: String, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null, tmpWriteOptions: Map[String,String] = null): Unit = saveAsBigQueryTable( BigQueryStrings.parseTableReference(tableSpec), writeDisposition, createDisposition, tmpWriteOptions) private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, conf) fs.delete(path, true) } }
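The private delete helper above shows the usual way to resolve a FileSystem for an arbitrary URI scheme before deleting. A minimal standalone sketch of the same pattern (the path string is hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def deleteRecursively(pathStr: String, conf: Configuration = new Configuration()): Boolean = {
  val path = new Path(pathStr)
  // Resolve the FileSystem from the path's own URI so gs://, hdfs:// and file:// all work
  val fs = FileSystem.get(path.toUri, conf)
  fs.delete(path, true) // 'true' = recursive; returns false if the path did not exist
}

deleteRecursively("hdfs:///tmp/spark-bigquery-staging") // hypothetical staging directory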
Example 147
Source File: JsonFileReporter.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.metrics import java.io.{BufferedWriter, Closeable, IOException, OutputStreamWriter} import java.util.{Timer, TimerTask} import java.util.concurrent.TimeUnit import scala.util.Try import scala.util.control.NonFatal import com.codahale.metrics.MetricRegistry import com.codahale.metrics.json.MetricsModule import com.fasterxml.jackson.databind.ObjectMapper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kyuubi.Logging import org.apache.spark.{KyuubiSparkUtil, SparkConf} import org.apache.spark.KyuubiConf._ private[metrics] class JsonFileReporter(conf: SparkConf, registry: MetricRegistry) extends Closeable with Logging { private val jsonMapper = new ObjectMapper().registerModule( new MetricsModule(TimeUnit.MILLISECONDS, TimeUnit.MILLISECONDS, false)) private val timer = new Timer(true) private val interval = KyuubiSparkUtil.timeStringAsMs(conf.get(METRICS_REPORT_INTERVAL)) private val path = conf.get(METRICS_REPORT_LOCATION) private val hadoopConf = KyuubiSparkUtil.newConfiguration(conf) def start(): Unit = { timer.schedule(new TimerTask { var bw: BufferedWriter = _ override def run(): Unit = try { val json = jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(registry) val tmpPath = new Path(path + ".tmp") val tmpPathUri = tmpPath.toUri val fs = if (tmpPathUri.getScheme == null && tmpPathUri.getAuthority == null) { FileSystem.getLocal(hadoopConf) } else { FileSystem.get(tmpPathUri, hadoopConf) } fs.delete(tmpPath, true) bw = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath, true))) bw.write(json) bw.close() fs.setPermission(tmpPath, FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)) val finalPath = new Path(path) fs.rename(tmpPath, finalPath) fs.setPermission(finalPath, FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)) } catch { case NonFatal(e) => error("Error writing metrics to json file" + path, e) } finally { if (bw != null) { Try(bw.close()) } } }, 0, interval) } override def close(): Unit = { timer.cancel() } }
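The reporter above writes to a .tmp file and then renames it into place, choosing the local or remote FileSystem from the URI scheme. A hedged standalone sketch of that write-then-rename pattern, assuming a plain Hadoop Configuration rather than Kyuubi's helpers:

import java.io.{BufferedWriter, OutputStreamWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def writeThenRename(target: String, content: String, conf: Configuration): Unit = {
  val tmpPath = new Path(target + ".tmp")
  val uri = tmpPath.toUri
  val fs =
    if (uri.getScheme == null && uri.getAuthority == null) FileSystem.getLocal(conf)
    else FileSystem.get(uri, conf)
  fs.delete(tmpPath, true)
  val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath, true)))
  try writer.write(content) finally writer.close()
  fs.rename(tmpPath, new Path(target)) // rename is cheap on HDFS, but not atomic on object stores
}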
Example 148
Source File: HDFSTokenCollector.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.session.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.kyuubi.Logging import org.apache.spark.KyuubiSparkUtil._ import org.apache.spark.SparkConf import yaooqinn.kyuubi.service.ServiceException private[security] object HDFSTokenCollector extends TokenCollector with Logging { private def hadoopFStoAccess(conf: SparkConf, hadoopConf: Configuration): Set[FileSystem] = { val fileSystems = conf.getOption(ACCESS_FSS) .orElse(conf.getOption(ACCESS_NNS)) match { case Some(nns) => nns.split(",").map(new Path(_).getFileSystem(hadoopConf)).toSet case _ => Set.empty[FileSystem] } fileSystems + conf.getOption(STAGING_DIR).map(new Path(_).getFileSystem(hadoopConf)) .getOrElse(FileSystem.get(hadoopConf)) } private def renewer(hadoopConf: Configuration): String = { val tokenRenewer = Master.getMasterPrincipal(hadoopConf) debug("Delegation token renewer is: " + tokenRenewer) if (tokenRenewer == null || tokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer." error(errorMessage) throw new ServiceException(errorMessage) } tokenRenewer } override def obtainTokens(conf: SparkConf): Unit = try { val hadoopConf = newConfiguration(conf) val tokenRenewer = renewer(hadoopConf) val creds = new Credentials() hadoopFStoAccess(conf, hadoopConf).foreach { fs => fs.addDelegationTokens(tokenRenewer, creds) } UserGroupInformation.getCurrentUser.addCredentials(creds) } catch { case e: Exception => error("Failed to obtain HDFS tokens", e) } }
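Stripped of Kyuubi's configuration wrappers, the collector above boils down to asking each FileSystem for delegation tokens and attaching them to the current UGI. A minimal sketch under that assumption (the paths and renewer principal are hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

def collectTokens(paths: Seq[String], renewer: String, hadoopConf: Configuration): Unit = {
  val creds = new Credentials()
  val fileSystems: Set[FileSystem] =
    paths.map(new Path(_).getFileSystem(hadoopConf)).toSet + FileSystem.get(hadoopConf)
  fileSystems.foreach(_.addDelegationTokens(renewer, creds))
  UserGroupInformation.getCurrentUser.addCredentials(creds)
}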
Example 149
Source File: KyuubiHiveUtil.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.utils import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.security.UserGroupInformation import org.apache.kyuubi.Logging import org.apache.spark.{KyuubiSparkUtil, SparkConf} object KyuubiHiveUtil extends Logging { private val HIVE_PREFIX = "hive." private val METASTORE_PREFIX = "metastore." val URIS: String = HIVE_PREFIX + METASTORE_PREFIX + "uris" val METASTORE_PRINCIPAL: String = HIVE_PREFIX + METASTORE_PREFIX + "kerberos.principal" def hiveConf(conf: SparkConf): HiveConf = { val hadoopConf = KyuubiSparkUtil.newConfiguration(conf) new HiveConf(hadoopConf, classOf[HiveConf]) } def addDelegationTokensToHiveState(ugi: UserGroupInformation): Unit = { val state = SessionState.get if (state != null) { addDelegationTokensToHiveState(state, ugi) } } def addDelegationTokensToHiveState(state: SessionState, ugi: UserGroupInformation): Unit = { state.getHdfsEncryptionShim match { case shim: org.apache.hadoop.hive.shims.Hadoop23Shims#HdfsEncryptionShim => try { val hdfsAdmin = ReflectUtils.getFieldValue(shim, "hdfsAdmin") val dfs = ReflectUtils.getFieldValue(hdfsAdmin, "dfs") dfs.asInstanceOf[FileSystem].addDelegationTokens(ugi.getUserName, ugi.getCredentials) } catch { case e: Exception => error("Failed add delegation token to hive session state", e) } case _ => } } }
Example 150
Source File: KyuubiDistributedCacheManager.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType} def addResource( fs: FileSystem, conf: Configuration, destPath: Path, localResources: HashMap[String, LocalResource], resourceType: LocalResourceType, link: String, statCache: Map[URI, FileStatus]): Unit = { cacheManager.addResource(fs, conf, destPath, localResources, resourceType, link, statCache, appMasterOnly = true) } }
Example 151
Source File: KyuubiDistributedCacheManagerSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility} import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite} import org.mockito.Mockito.when import org.scalatest.mock.MockitoSugar import yaooqinn.kyuubi.utils.ReflectUtils class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar { class MockClientDistributedCacheManager extends ClientDistributedCacheManager { override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): LocalResourceVisibility = { LocalResourceVisibility.PRIVATE } } test("add resource") { val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val status = new FileStatus() when(fs.getFileStatus(destPath)).thenReturn(status) val fileLink = "link" ReflectUtils.setFieldValue( KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache) val res = localResources(fileLink) assert(res.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath) assert(res.getSize === 0) assert(res.getTimestamp === 0) assert(res.getType === LocalResourceType.FILE) val status2 = new FileStatus( 10, false, 1, 1024, 10, 10, null, KyuubiSparkUtil.getCurrentUserName, null, new Path("/tmp/testing2")) val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2") when(fs.getFileStatus(destPath2)).thenReturn(status2) val fileLink2 = "link2" KyuubiDistributedCacheManager.addResource( fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache) val res2 = localResources(fileLink2) assert(res2.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2) assert(res2.getSize === 10) assert(res2.getTimestamp === 10) assert(res2.getType === LocalResourceType.FILE) } test("add resource when link null") { val distMgr = new MockClientDistributedCacheManager() val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() when(fs.getFileStatus(destPath)).thenReturn(new FileStatus()) intercept[Exception] { KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache) } assert(localResources.get("link") === None) assert(localResources.size === 0) } test("test addResource archive") { val distMgr = new MockClientDistributedCacheManager() ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", null, new 
Path("/tmp/testing")) when(fs.getFileStatus(destPath)).thenReturn(realFileStatus) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache) val resource = localResources("link") assert(resource.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath) assert(resource.getTimestamp === 10) assert(resource.getSize === 10) assert(resource.getType === LocalResourceType.ARCHIVE) } }
Example 152
Source File: HDFSHelperTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package scalaDemo

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.mutable.ListBuffer

object HDFSHelperTest {

  def start(args: Array[String]): Unit = {
    val hdfs: FileSystem = FileSystem.get(new Configuration)
    args(0) match {
      case "list" => traverse(hdfs, args(1))
      case "createFile" => HDFSHelper.createFile(hdfs, args(1))
      case "createFolder" => HDFSHelper.createFolder(hdfs, args(1))
      case "copyfile" => HDFSHelper.copyFile(hdfs, args(1), args(2))
      case "copyfolder" => HDFSHelper.copyFolder(hdfs, args(1), args(2))
      case "delete" => HDFSHelper.deleteFile(hdfs, args(1))
      case "copyfilefrom" => HDFSHelper.copyFileFromLocal(hdfs, args(1), args(2))
      case "copyfileto" => HDFSHelper.copyFileToLocal(hdfs, args(1), args(2))
      case "copyfolderfrom" => HDFSHelper.copyFolderFromLocal(hdfs, args(1), args(2))
      case "copyfolderto" => HDFSHelper.copyFolderToLocal(hdfs, args(1), args(2))
    }
  }

  def traverse(hdfs: FileSystem, hdfsPath: String) = {
    val holder: ListBuffer[String] = new ListBuffer[String]
    val paths: List[String] = HDFSHelper.listChildren(hdfs, hdfsPath, holder).toList
    for (path <- paths) {
      // path.toString is the full path of the file
      System.out.println("--------- path = " + path)
      // path.getName is only the file name, without the directory part
      System.out.println("--------- Path.getname = " + new Path(path).getName)
    }
  }
}
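Assuming the project's HDFSHelper object is on the classpath, the dispatcher above would be driven with an args array such as the hypothetical ones below:

// List a directory, then copy a local file up to HDFS (paths are hypothetical)
HDFSHelperTest.start(Array("list", "/user/demo"))
HDFSHelperTest.start(Array("copyfilefrom", "/tmp/data.csv", "/user/demo/data.csv"))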
Example 153
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache(
    hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor(
    ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable = new Runnable {
    override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
  }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
          val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
          if (suffix > lastCredentialsFileSuffix) {
            logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
            val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
            lastCredentialsFileSuffix = suffix
            UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
            logInfo("Tokens updated from credentials file.")
          } else {
            // Check every hour to see if new credentials arrived.
            logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
              "tokens yet, will check again in an hour.")
            delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
            return
          }
        }
      val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal(
        sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        executorUpdaterRunnable.run()
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }
}
Example 154
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)
  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // Hostname or IP address where the driver runs
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
Example 155
Source File: Util.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def dropTempFilePath(conf: Configuration, path: String): Boolean = { val fileSystem = FileSystem.get(conf) val filePath = new Path(path) if (fileSystem.exists(filePath)) { fileSystem.delete(filePath, true) } else { false } } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
Example 156
Source File: Util.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
Example 157
Source File: YARNHadoopDelegationTokenManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.security.Credentials import org.apache.spark.SparkConf import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def obtainDelegationTokens(hadoopConf: Configuration, creds: Credentials): Long = { val superInterval = delegationTokenManager.obtainDelegationTokens(hadoopConf, creds) credentialProviders.values.flatMap { provider => if (provider.credentialsRequired(hadoopConf)) { provider.obtainCredentials(hadoopConf, sparkConf, creds) } else { logDebug(s"Service ${provider.serviceName} does not require a token." + s" Check your configuration to see if security is disabled or not.") None } }.foldLeft(superInterval)(math.min) } private def getCredentialProviders: Map[String, ServiceCredentialProvider] = { val providers = loadCredentialProviders providers. filter { p => delegationTokenManager.isServiceEnabled(p.serviceName) } .map { p => (p.serviceName, p) } .toMap } private def loadCredentialProviders: List[ServiceCredentialProvider] = { ServiceLoader.load(classOf[ServiceCredentialProvider], Utils.getContextOrSparkClassLoader) .asScala .toList } }
Example 158
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
Example 159
Source File: QueryPartitionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ test("SPARK-5068: query data when path doesn't exist") { withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") val tmpDir = Files.createTempDir() // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) sql("DROP TABLE IF EXISTS table_with_partition") sql("DROP TABLE IF EXISTS createAndInsertTest") } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 160
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = metadataFile.getFileSystem(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 161
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(sparkContext.hadoopConfiguration) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( sparkContext.hadoopConfiguration, fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[java.io.IOException] { testReadFooters(false) } assert(exception.getMessage().contains("Could not read footer for file")) } }
Example 162
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.TaskAttemptContext import org.mockito.Matchers.any import org.mockito.Mockito.{verify, when} import org.scalatest.{BeforeAndAfter, FlatSpec} import org.scalatest.mock.MockitoSugar import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter { var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _ var mockOutputStream : FSDataOutputStream = _ var byteArrayOutputStream: ByteArrayOutputStream = _ var mockTaskAttemptContext: TaskAttemptContext = _ var mockPath: Path = _ var mockFileSystem: FileSystem = _ before { byteArrayOutputStream = new ByteArrayOutputStream() mockOutputStream = mock[FSDataOutputStream] sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream) mockTaskAttemptContext = mock[TaskAttemptContext] mockPath = mock[Path] mockFileSystem = mock[FileSystem] } it should "write an empty array of bytes" in { val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes" in { val byteArray = Array[Byte](0, 0, 0, 0) byteArrayOutputStream.write(byteArray) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding as necessary" in { byteArrayOutputStream.write(5) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding only as much as necessary" in { byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0)) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "create a record writer from a FSDataOutputStream created by the filesystem" in { val mockTaskAttemptContext = mock[TaskAttemptContext] val mockPath = mock[Path] val mockFileSystem = mock[FileSystem] when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem) new RecordIOOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { mockPath } }.getRecordWriter(mockTaskAttemptContext) verify(mockFileSystem).create(mockPath, true) } }
Example 163
Source File: IOReader.scala From spark-benchmarks with Apache License 2.0 | 5 votes |
package com.bbva.spark.benchmarks.dfsio import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} class IOReader(hadoopConf: Configuration, dataDir: String) extends IOTestBase(hadoopConf, dataDir) { def doIO(fileName: String, fileSize: BytesSize)(implicit conf: Configuration, fs: FileSystem): BytesSize = { val bufferSize = conf.getInt("test.io.file.buffer.size", DefaultBufferSize) // TODO GET RID OF DEFAULT val buffer: Array[Byte] = new Array[Byte](bufferSize) val filePath = new Path(dataDir, fileName.toString) logger.info("Reading file {} with size {}", filePath.toString, fileSize.toString) val in = fs.open(filePath) var actualSize: Long = 0 // TODO improve this try { Stream.continually(in.read(buffer, 0, bufferSize)) .takeWhile(_ > 0 && actualSize < fileSize) .foreach { currentSize => actualSize += currentSize logger.debug(s"Reading chunk of size $currentSize. Currently: $actualSize / $fileSize") } } finally { in.close() } logger.info("File {} with size {} read successfully", fileName, actualSize.toString) actualSize } }
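The read loop above streams a fixed-size buffer until the expected number of bytes has been consumed. When the file is small enough to hold in memory, the same FileSystem calls can be wrapped much more simply; a hedged sketch, assuming commons-io is available on the classpath:

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

def readSmallFile(pathStr: String, conf: Configuration = new Configuration()): Array[Byte] = {
  val fs = FileSystem.get(conf)
  val in = fs.open(new Path(pathStr)) // FSDataInputStream is a plain java.io.InputStream
  try IOUtils.toByteArray(in) finally in.close()
}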
Example 164
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 165
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 166
Source File: ExecutorSource.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import java.util.concurrent.ThreadPoolExecutor import scala.collection.JavaConverters._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { metricRegistry.register(MetricRegistry.name("filesystem", scheme, name), new Gauge[T] { override def getValue: T = fileStats(scheme).map(f).getOrElse(defaultValue) }) } override val metricRegistry = new MetricRegistry() override val sourceName = "executor" // Gauge for executor thread pool's actively executing task counts metricRegistry.register(MetricRegistry.name("threadpool", "activeTasks"), new Gauge[Int] { override def getValue: Int = threadPool.getActiveCount() }) // Gauge for executor thread pool's approximate total number of tasks that have been completed metricRegistry.register(MetricRegistry.name("threadpool", "completeTasks"), new Gauge[Long] { override def getValue: Long = threadPool.getCompletedTaskCount() }) // Gauge for executor thread pool's current number of threads metricRegistry.register(MetricRegistry.name("threadpool", "currentPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getPoolSize() }) // Gauge got executor thread pool's largest number of threads that have ever simultaneously // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = threadPool.getMaximumPoolSize() }) // Gauge for file system stats of this executor for (scheme <- Array("hdfs", "file")) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) registerFileSystemStat(scheme, "largeRead_ops", _.getLargeReadOps(), 0) registerFileSystemStat(scheme, "write_ops", _.getWriteOps(), 0) } }
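The gauges above are thin wrappers over FileSystem.getAllStatistics. The same per-scheme counters can be inspected directly; a minimal sketch:

import scala.collection.JavaConverters._
import org.apache.hadoop.fs.FileSystem

// Bytes read through the "hdfs" scheme so far in this JVM (0 if nothing has been read yet)
val hdfsBytesRead: Long = FileSystem.getAllStatistics.asScala
  .find(_.getScheme == "hdfs")
  .map(_.getBytesRead)
  .getOrElse(0L)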
Example 167
Source File: LineCount.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.operations import java.net.URI import com.cloudera.spark.cloud.ObjectStoreExample import com.cloudera.spark.cloud.s3.SequentialIOPolicy import com.cloudera.spark.cloud.common.CloudTestKeys._ import com.cloudera.spark.cloud.s3.SequentialIOPolicy import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} destFsInfo = Some(s"\nFile System $destPath=\n$destFS\n") } } srcFsInfo = Some(s"\nSource File System = $sourceFs\n") } finally { logInfo("Stopping Spark Context") sc.stop() srcFsInfo.foreach(logInfo(_)) destFsInfo.foreach(logInfo(_)) } 0 } def defaultSource: Option[String] = { Some(S3A_CSV_PATH_DEFAULT) } def maybeEnableAnonymousAccess( sparkConf: SparkConf, dest: Option[String]): Unit = { if (dest.isEmpty) { hconf(sparkConf, AWS_CREDENTIALS_PROVIDER, ANONYMOUS_CREDENTIALS) } } }
Example 168
Source File: S3ADataFrames.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import com.cloudera.spark.cloud.common.CloudTestKeys import com.cloudera.spark.cloud.operations.CloudDataFrames import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession object S3ADataFrames extends CloudDataFrames with S3AExampleSetup { override def extraValidation( session: SparkSession, conf: Configuration, fs: FileSystem, results: Seq[(String, Path, Long, Long)]): Unit = { val operations = new S3AOperations(fs) if (conf.getBoolean(CloudTestKeys.S3A_COMMITTER_TEST_ENABLED, false)) { results.foreach((tuple: (String, Path, Long, Long)) => { operations.verifyS3Committer(tuple._2, None, None, "") }) } } }
Example 169
Source File: SeekReadTests.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.common import org.apache.hadoop.fs.FileSystem class SeekReadTests extends CloudSuiteWithCSVDatasource { override def enabled: Boolean = super.enabled && hasCSVTestFile ctest("SeekReadFully", """Assess cost of seek and read operations. | When moving the cursor in an input stream, an HTTP connection may be closed and | then re-opened. This can be very expensive; tactics like streaming forwards instead | of seeking, and/or postponing movement until the following read ('lazy seek') try | to address this. Logging these operation times helps track performance. | This test also tries to catch out a regression, where a `close()` operation | is implemented through reading through the entire input stream. This is exhibited | in the time to `close()` while at offset 0 being `O(len(file))`. | | Note also the cost of `readFully()`; this method call is common inside libraries | like Orc and Parquet.""".stripMargin) { val (source, fs) = getCSVSourceAndFileSystem() FileSystem.clearStatistics fs.getStorageStatistics.reset() val st = logDuration("stat") { fs.getFileStatus(source) } val in = logDuration("open") { fs.open(source) } def time[T](operation: String)(testFun: => T): T = { logInfo(s"") var r = logDuration(operation + s" [pos = ${in.getPos}]")(testFun) logInfo(s" ${in.getWrappedStream}") r } val eof = st.getLen time("read()") { assert(-1 !== in.read()) } time("seek(256)") { in.seek(256) } time("seek(256)") { in.seek(256) } time("seek(EOF-2)") { in.seek(eof - 2) } time("read()") { assert(-1 !== in.read()) } def readFully(offset: Long, len: Int): Unit = { time(s"readFully($offset, byte[$len])") { val bytes = new Array[Byte](len) assert(-1 !== in.readFully(offset, bytes)) } } readFully(1L, 1) readFully(1L, 256) readFully(eof - 350, 300) readFully(260L, 256) readFully(1024L, 256) readFully(1536L, 256) readFully(8192L, 1024) readFully(8192L + 1024 + 512, 1024) readFully(0L, 1024) readFully(eof - 1024, 1024) time("seek(getPos)") { in.seek(in.getPos()) } time("read()") { assert(-1 !== in.read()) } logDuration("close()") { in.close } dumpFileSystemStatistics(fs.getStorageStatistics) } }
Example 170
Source File: TableFuncs.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utils import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.SparkSession object TableFuncs{ def getTableMetadata(spark:SparkSession, tableName:String) = { val catalog = spark.sessionState.catalog val tId = spark.sessionState.sqlParser.parseTableIdentifier(tableName) catalog.getTableMetadata(tId) } def getTableDirectory(spark: SparkSession, tableName:String) ={ getTableMetadata(spark,tableName) .location .toString .split('/') .dropRight(1) .mkString("/") } def getExactSamplePath(spark: SparkSession, path:String) = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path)) statuses.head.getPath.toString } def getParentFolderPath(spark: SparkSession, path: String): String = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) (new org.apache.hadoop.fs.Path(path)).getParent.toString } def getAllSamples(spark: SparkSession, path:String) = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path)) //println(statuses.length) statuses .map(_.getPath.toString.split('/').takeRight(1).head.split('.').take(1).head) } }
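With a running SparkSession named spark, the glob helpers above can be exercised directly; the sample paths below are hypothetical:

// All sample names matching the glob, and the resolved path of the first match
val sampleNames = TableFuncs.getAllSamples(spark, "/data/bams/*.bam")
val firstSample = TableFuncs.getExactSamplePath(spark, "/data/bams/NA12878*.bam")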
Example 171
Source File: FileHandler.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.common.security import java.nio.file._ import java.security.AccessControlException import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{FileSystem, Path} import com.paypal.gimel.common.conf.GimelConstants import com.paypal.gimel.logger.Logger object FileHandler { val logger = Logger(this.getClass) def checkIfFileAccessibleByOthers(filePath: String, source: String, fail: Boolean): Unit = { source.toLowerCase() match { case GimelConstants.HADDOP_FILE_SYSTEM => val conf = new org.apache.hadoop.conf.Configuration() val fs = FileSystem.get(conf) val hdfsPath = new Path(filePath) if (fs.exists(hdfsPath)) { val permission = fs.getFileStatus(hdfsPath).getPermission.toString if (permission.substring(3, permission.length) != "------") { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } case GimelConstants.LOCAL_FILE_SYSTEM => val path = Paths.get(filePath) if (Files.exists(path)) { val p = Files.getPosixFilePermissions(path) if (p.asScala.exists(x => x.toString.startsWith("OTHER") || x.toString.startsWith("GROUP"))) { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } } } }
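A hypothetical call against the helper above, failing fast when a credential file is readable by group or others:

// Throws AccessControlException unless the file is restricted to its owner (700)
FileHandler.checkIfFileAccessibleByOthers(
  filePath = "/tmp/gimel.keytab", // hypothetical path
  source = GimelConstants.HADDOP_FILE_SYSTEM,
  fail = true)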
Example 172
Source File: ArtifactHdfsSaver.scala From marvin-engine-executor with Apache License 2.0 | 5 votes |
package org.marvin.artifact.manager import java.io.{File, FileInputStream} import akka.Done import akka.actor.{Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.marvin.artifact.manager.ArtifactSaver.{SaveToLocal, SaveToRemote} import org.marvin.model.EngineMetadata class ArtifactHdfsSaver(metadata: EngineMetadata) extends Actor with ActorLogging { var conf: Configuration = _ override def preStart() = { log.info(s"${this.getClass().getCanonicalName} actor initialized...") conf = new Configuration() if (sys.env.get("HADOOP_CONF_DIR") != None){ val confFiles:List[File] = getListOfFiles(sys.env.get("HADOOP_CONF_DIR").mkString) for(file <- confFiles){ log.info(s"Loading ${file.getAbsolutePath} file to hdfs client configuration ..") conf.addResource(new FileInputStream(file)) } } conf.set("fs.defaultFS", metadata.hdfsHost) } def generatePaths(artifactName: String, protocol: String): Map[String, Path] = { Map( "localPath" -> new Path(s"${metadata.artifactsLocalPath}/${metadata.name}/$artifactName"), "remotePath" -> new Path(s"${metadata.artifactsRemotePath}/${metadata.name}/${metadata.version}/$artifactName/$protocol") ) } def getListOfFiles(path: String): List[File] = { val dir = new File(path) val extensions = List("xml") dir.listFiles.filter(_.isFile).toList.filter { file => extensions.exists(file.getName.endsWith(_)) } } def validatePath(path: Path, isRemote: Boolean, fs: FileSystem): Boolean = { if (isRemote) { fs.exists(path) } else { new java.io.File(path.toString).exists } } override def receive: Receive = { case SaveToLocal(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("remotePath"), true, fs)) { log.info(s"Copying files from ${uris("remotePath")} to ${uris("localPath")}") fs.copyToLocalFile(false, uris("remotePath"), uris("localPath"), false) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case SaveToRemote(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("localPath"), false, fs)) { log.info(s"Copying files from ${uris("localPath")} to ${uris("remotePath")}") fs.copyFromLocalFile(uris("localPath"), uris("remotePath")) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case _ => log.warning("Received a bad format message...") } }
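Stripped of the Akka plumbing, the actor's round trip is just copyToLocalFile and copyFromLocalFile on a configured FileSystem. A sketch; the defaultFS host and both paths are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://namenode:8020")            // placeholder host

val fs     = FileSystem.get(conf)
val remote = new Path("/artifacts/engine/1.0/model/hdfs")   // hypothetical paths
val local  = new Path("/tmp/model")

if (fs.exists(remote)) {
  fs.copyToLocalFile(false, remote, local, false)           // delSrc = false, useRawLocalFileSystem = false
}
fs.copyFromLocalFile(local, remote)
fs.close()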
Example 173
Source File: IndexBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.datastax.spark.connector._ import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.SparkContext import spark.jobserver._ object IndexBuilder extends SparkJob { override def runJob(sc: SparkContext, conf: Config): Any = { val inputDir = conf.getString("input.dir") val sampleSizeB = sc.broadcast(SAMPLE_SIZE) val audioSongRDD = AudioLibrary.read(inputDir, sc, MIN_TIME, MAX_TIME) val songRDD = audioSongRDD.keys.sortBy(song => song).zipWithIndex().mapValues(l => l + 1) val songIdsB = sc.broadcast(songRDD.collectAsMap()) val audioRDD = audioSongRDD mapPartitions { audios => val songIds = songIdsB.value audios map { case (song, audio) => (songIds.get(song).get, audio) } } val sampleRDD = audioRDD flatMap { case (songId, audio) => audio.sampleByTime(sampleSizeB.value) map { sample => (songId, sample) } } val recordRDD = songRDD map { case (name, id) => Record(id, name) } val hashRDD = sampleRDD.map({case (songId, sample) => ((sample.hash, songId), Array(sample.id)) }).reduceByKey(_ ++ _).mapValues(a => a.mkString(",")).map({case ((hash, songId), sampleIds) => (hash, songId) }).groupByKey().mapValues(it => it.toList).map({case (id, songs) => Hash(id, songs) }) hashRDD.saveAsCassandraTable(KEYSPACE, TABLE_HASH) recordRDD.saveAsCassandraTable(KEYSPACE, TABLE_RECORD) } def containsWav(hdfs: FileSystem, path: Path) = { val it = hdfs.listFiles(path, false) var i = 0 while(it.hasNext){ if(it.next().getPath.getName.endsWith(".wav")){ i += 1 } } i > 0 } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { if(!config.hasPath("input.dir")) { SparkJobInvalid("Missing parameter [input.dir]") } else { val hdfs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(config.getString("input.dir")) val isDir = hdfs.isDirectory(path) val isValid = containsWav(hdfs, path) hdfs.close() if(isDir && isValid) { SparkJobValid } else { SparkJobInvalid("Input directory does not contain .wav files") } } } }
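containsWav walks a RemoteIterator by hand; the same check generalises to any file extension. A small reusable sketch:

import org.apache.hadoop.fs.{FileSystem, Path}

// counts files with a given suffix directly under `dir` (non-recursive); a sketch only
def countWithSuffix(fs: FileSystem, dir: Path, suffix: String): Int = {
  val it = fs.listFiles(dir, false)
  var count = 0
  while (it.hasNext) {
    if (it.next().getPath.getName.endsWith(suffix)) count += 1
  }
  count
}

// containsWav is then equivalent to countWithSuffix(hdfs, path, ".wav") > 0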
Example 174
Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0 | 5 votes |
package com.criteo.rsvd import java.nio.ByteBuffer import com.esotericsoftware.kryo.Kryo import com.typesafe.scalalogging.slf4j.StrictLogging import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.rdd.RDD import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.ClassTag object ReadingWritingData extends StrictLogging { def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = { val fs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(inputPathPattern) (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt } def loadMatrixEntries(inputPath: String, singlePartitionSizeMB: Int, sc: SparkContext): RDD[MatrixEntry] = { logger.info(s"Input matrix path: $inputPath") val inputDataSizeMB = getInputDataSizeMB(inputPath + " def makeRddFromKryoFile[T: ClassTag]( sc: SparkContext, path: String, minPartitionsOpt: Option[Int] = None): RDD[T] = { val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions) val serializer = new KryoSerializer(sc.getConf) sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) .mapPartitions { it => val instance = serializer.newInstance() it.flatMap { case (_, v) => instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes)) } } } object RandomizedSVDKryoRegistrator extends KryoRegistrator { def registerClasses(kryo: Kryo): Unit = { UnmodifiableCollectionsSerializer.registerSerializers(kryo) kryo.register(classOf[MatrixEntry]) kryo.register(classOf[Array[MatrixEntry]]) } } def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf = appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName) def appendRegistratorToSparkConf(sparkConf: SparkConf, registratorName: String): SparkConf = { val oldValue = sparkConf.get("spark.kryo.registrator", "") if (oldValue == "") { sparkConf.set("spark.kryo.registrator", registratorName) } else { sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName) } } }
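getInputDataSizeMB is a common FileSystem idiom: expand a glob and sum the file lengths. The same check works from any SparkContext; a sketch with a hypothetical pattern:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext

// total size in megabytes of all files matching the pattern; a sketch only
def globSizeMB(pattern: String, sc: SparkContext): Long = {
  val fs = FileSystem.get(sc.hadoopConfiguration)
  fs.globStatus(new Path(pattern)).map(_.getLen).sum / (1024L * 1024L)
}

// globSizeMB("/input/matrix/part-*", sc)   // hypothetical path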
Example 175
Source File: LasRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las import fr.ign.spark.iqmulus.{ BinarySectionRelation, BinarySection } import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.OutputWriterFactory import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.types._ import scala.util.{ Try, Success, Failure } class LasRelation( override val paths: Array[String], override val maybeDataSchema: Option[StructType], override val userDefinedPartitionColumns: Option[StructType], parameters: Map[String, String] )(@transient val sqlContext: SQLContext) extends BinarySectionRelation(parameters) { def format = parameters.get("lasformat").map(_.toByte) def minor = parameters.get("minor").map(_.toByte).getOrElse(Version.minorDefault) def major = parameters.get("major").map(_.toByte).getOrElse(Version.majorDefault) def version = parameters.get("version").map(Version.fromString) .getOrElse(Version(major, minor)) lazy val headers: Array[LasHeader] = paths flatMap { location => Try { val path = new Path(location) val fs = FileSystem.get(path.toUri, sqlContext.sparkContext.hadoopConfiguration) val dis = fs.open(path) try LasHeader.read(location, dis) finally { dis.close fs.close } } match { case Success(h) => Some(h) case Failure(e) => logWarning(s"Skipping $location : ${e.getMessage}"); None } } override def sections: Array[BinarySection] = headers.map(_.toBinarySection(paths)) override def prepareJobForWrite(job: Job): OutputWriterFactory = { new LasOutputWriterFactory(format, version) } }
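Note that the header loop resolves each location with FileSystem.get(path.toUri, conf), so paths on different schemes (file://, hdfs://, s3a://, ...) each get the right FileSystem instance. The open/read/close core of that pattern, as a hedged sketch with caller-supplied, purely illustrative locations:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.util.Try

// reads the first four bytes of each location on whatever filesystem its URI points to
def readMagic(locations: Seq[String], conf: Configuration): Seq[(String, Try[Array[Byte]])] =
  locations.map { location =>
    location -> Try {
      val path = new Path(location)
      val fs   = FileSystem.get(path.toUri, conf)
      val in   = fs.open(path)
      try {
        val magic = new Array[Byte](4)
        in.readFully(magic)          // e.g. "LASF" for a .las file
        magic
      } finally in.close()
    }
  }

Unlike the snippet above, this sketch does not close the FileSystem itself, since FileSystem.get returns a cached, shared instance by default.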
Example 176
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 177
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
Example 178
Source File: StrictPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.collection.JavaConverters._ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object StrictPartitionHandler extends PartitionHandler { override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { try { val part = client.getPartition(db.value, tableName.value, partition.entries.map(_._2).toList.asJava) Success(new Path(part.getSd.getLocation)) } catch { case NonFatal(e) => Failure(new RuntimeException(s"Partition '${partition.entries.map(_._2).toList.mkString(",")}' does not exist and strict policy requires upfront creation", e)) } } }
Example 179
Source File: CachedPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.{Success, Try} class CachedPartitionHandler(partitioner: PartitionHandler) extends PartitionHandler { val cache = scala.collection.mutable.Map.empty[Partition, Path] override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { cache.get(partition) match { case Some(path) => Success(path) case _ => val created = partitioner.path(partition, db, tableName)(client, fs) created.foreach(cache.put(partition, _)) created } } }
Example 180
Source File: DynamicPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.{StorageDescriptor, Table} import scala.collection.JavaConverters._ import scala.util.{Failure, Success, Try} class DynamicPartitionHandler(pathPolicy: PartitionPathPolicy = DefaultMetastorePartitionPathPolicy) extends PartitionHandler { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { def table: Table = client.getTable(db.value, tableName.value) def create(path: Path, table: Table): Unit = { logger.debug(s"New partition will be created at $path") val sd = new StorageDescriptor(table.getSd) sd.setLocation(path.toString) val params = new java.util.HashMap[String, String] val values = partition.entries.map(_._2).toList.asJava val ts = (System.currentTimeMillis / 1000).toInt val p = new org.apache.hadoop.hive.metastore.api.Partition(values, db.value, tableName.value, ts, 0, sd, params) logger.debug(s"Updating hive metastore with partition $p") client.add_partition(p) logger.info(s"Partition has been created in metastore [$partition]") } Try(client.getPartition(db.value, tableName.value, partition.entries.toList.map(_._2).asJava)) match { case Success(p) => Try { new Path(p.getSd.getLocation) } case Failure(_) => Try { val t = table val tableLocation = new Path(t.getSd.getLocation) val path = pathPolicy.path(tableLocation, partition) create(path, t) path } } } }
Example 181
Source File: StageManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.formats.HiveWriter import com.landoop.streamreactor.connect.hive.{TopicPartition, TopicPartitionOffset} import org.apache.hadoop.fs.{FileSystem, Path} class StageManager(filenamePolicy: FilenamePolicy) { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private def stageFilename(tp: TopicPartition) = s".${filenamePolicy.prefix}_${tp.topic.value}_${tp.partition}" private def finalFilename(tpo: TopicPartitionOffset) = s"${filenamePolicy.prefix}_${tpo.topic.value}_${tpo.partition}_${tpo.offset.value}" def stage(dir: Path, tp: TopicPartition)(implicit fs: FileSystem): Path = { val filename = stageFilename(tp) val stagePath = new Path(dir, filename) fs.delete(stagePath, false) stagePath } def commit(stagePath: Path, tpo: TopicPartitionOffset)(implicit fs: FileSystem): Path = { val finalPath = new Path(stagePath.getParent, finalFilename(tpo)) logger.info(s"Committing file $stagePath=>$finalPath") fs.rename(stagePath, finalPath) finalPath } }
Example 182
Source File: OffsetSeeker.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.control.NonFatal class OffsetSeeker(filenamePolicy: FilenamePolicy) { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) import HdfsUtils._ def seek(db: DatabaseName, tableName: TableName) (implicit fs: FileSystem, client: IMetaStoreClient): Set[TopicPartitionOffset] = { try { // the table may not have been created, in which case we have no offsets defined if (client.tableExists(db.value, tableName.value)) { val loc = com.landoop.streamreactor.connect.hive.tableLocation(db, tableName) val prefix = filenamePolicy.prefix fs.ls(new Path(loc), true) .map(_.getPath.getName) .collect { case CommittedFileName(`prefix`, topic, partition, _, end) => TopicPartitionOffset(topic, partition, end) } .toSeq .groupBy(_.toTopicPartition) .map { case (tp, tpo) => tp.withOffset(tpo.maxBy(_.offset.value).offset) }.toSet } else { Set.empty } } catch { case NonFatal(e) => logger.error(s"Error seeking table ${db.value}.${tableName.value}") throw e } } }
Example 183
Source File: CommitPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.TopicPartitionOffset import org.apache.hadoop.fs.{FileSystem, Path} import scala.concurrent.duration.FiniteDuration case class DefaultCommitPolicy(fileSize: Option[Long], interval: Option[FiniteDuration], fileCount: Option[Long]) extends CommitPolicy { require(fileSize.isDefined || interval.isDefined || fileCount.isDefined) override def shouldFlush(context: CommitContext) (implicit fs: FileSystem): Boolean = { val open_time = System.currentTimeMillis() - context.createdTimestamp fileSize.exists(_ <= context.fileSize) || interval.exists(_.toMillis <= open_time) || fileCount.exists(_ <= context.count) } }
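DefaultCommitPolicy only needs a CommitContext built from the live FileStatus, which is exactly how DefaultCommitPolicyTest further down drives it. A usage sketch against the local filesystem; it assumes the stream-reactor classes shown on this page (Topic, Offset, TopicPartitionOffset, CommitContext) are on the classpath, and the staged path and record count are illustrative:

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import com.landoop.streamreactor.connect.hive.sink.staging.{CommitContext, DefaultCommitPolicy} // assumed package, per the snippets above
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.concurrent.duration._

// flush once a staged file exceeds 10 MB or has been open for five minutes
implicit val fs = FileSystem.getLocal(new Configuration())
val policy = DefaultCommitPolicy(Some(10L * 1024 * 1024), Some(5.minutes), None)

val tpo    = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))
val staged = new Path("staging-file")       // hypothetical staged file
val status = fs.getFileStatus(staged)

// CommitContext is populated the same way DefaultCommitPolicyTest does further down
val flush = policy.shouldFlush(CommitContext(tpo, staged, 1000L, status.getLen, status.getModificationTime))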
Example 184
Source File: HiveWriterManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset} import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter} import com.landoop.streamreactor.connect.hive.sink.staging.StageManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema def flush(offsets: Map[TopicPartition, Offset]): Unit = { logger.info(s"Flushing offsets $offsets") // we may not have an offset for a given topic/partition if no data was written to that TP writers.foreach { case (key, writer) => writer.close() offsets.get(key.tp).foreach { offset => stageManager.commit(writer.file, key.tp.withOffset(offset)) } writers.remove(key) } } def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList }
Example 185
Source File: TableFileScanner.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive.HdfsUtils._ import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient // for a table that is not partitioned, the files will all reside directly in the table location directory // otherwise, the files will each live in the particular partition folder (which technically, could be anywhere) object TableFileScanner { def scan(db: DatabaseName, tableName: TableName) (implicit fs: FileSystem, client: IMetaStoreClient): Seq[(Path, Option[Partition])] = { // the partitions from the metastore which each contain a pointer to the partition location hive.partitionPlan(db, tableName) match { case Some(plan) => hive.partitions(db, tableName).flatMap { case partition@Partition(entries, Some(location)) => val files = fs.listFiles(location, false) files.map(_.getPath).toVector.map(_ -> Some(partition)) } case None => val table = client.getTable(db.value, tableName.value) val files = fs.listFiles(new Path(table.getSd.getLocation), false) files.map(_.getPath).toVector.map(_ -> None) } } }
Example 186
Source File: HiveSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveReader, Record} import com.landoop.streamreactor.connect.hive.source.config.HiveSourceConfig import com.landoop.streamreactor.connect.hive.source.mapper.{PartitionValueMapper, ProjectionMapper} import com.landoop.streamreactor.connect.hive.source.offset.HiveSourceOffsetStorageReader import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Struct import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ class HiveSource(db: DatabaseName, tableName: TableName, topic: Topic, offsetReader: HiveSourceOffsetStorageReader, config: HiveSourceConfig) (implicit client: IMetaStoreClient, fs: FileSystem) extends Iterator[SourceRecord] { val tableConfig = config.tableOptions.filter(_.tableName == tableName).find(_.topic == topic) .getOrElse(sys.error(s"Cannot find table configuration for ${db.value}.${tableName.value} => ${topic.value}")) private val table = client.getTable(db.value, tableName.value) private val format = HiveFormat(hive.serde(table)) private val metastoreSchema = HiveSchemas.toKafka(table) private val parts = TableFileScanner.scan(db, tableName) private val readers = parts.map { case (path, partition) => val fns: Seq[Struct => Struct] = Seq( partition.map(new PartitionValueMapper(_).map _), tableConfig.projection.map(new ProjectionMapper(_).map _) ).flatten val mapper: Struct => Struct = Function.chain(fns) val sourceOffset = offsetReader.offset(SourcePartition(db, tableName, topic, path)).getOrElse(SourceOffset(0)) new HiveReader { lazy val reader = format.reader(path, sourceOffset.rowNumber, metastoreSchema) override def iterator: Iterator[Record] = reader.iterator.map { record => Record(mapper(record.struct), record.path, record.offset) } override def close(): Unit = reader.close() } } private val iterator: Iterator[Record] = readers.map(_.iterator).reduce(_ ++ _).take(tableConfig.limit) override def hasNext: Boolean = iterator.hasNext override def next(): SourceRecord = { val record = iterator.next val sourcePartition = SourcePartition(db, tableName, topic, record.path) val offset = SourceOffset(record.offset) new SourceRecord( fromSourcePartition(sourcePartition).asJava, fromSourceOffset(offset).asJava, topic.value, record.struct.schema, record.struct ) } def close(): Unit = { readers.foreach(_.close()) } }
Example 187
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
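The listFiles helper is handy on its own for expanding a table directory before handing the files to parquetReader. A short sketch against the local filesystem; the directory name is hypothetical:

import com.landoop.streamreactor.connect.hive.parquet.listFiles
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

implicit val fs = FileSystem.getLocal(new Configuration())

// a directory yields its constituent files, a plain file yields itself
val files = listFiles(new Path("/warehouse/mydb.db/mytable"))
files.foreach(println)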
Example 188
Source File: StageManagerTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartition, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StageManagerTest extends AnyWordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) val dir = new Path("stageman") fs.mkdirs(dir) val manager = new StageManager(DefaultFilenamePolicy) "StageManager" should { "stage file as hidden" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) stagePath.getName.startsWith(".") shouldBe true } "delete existing file" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.create(stagePath) manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.exists(stagePath) shouldBe false } "commit file using offset" in { val stagePath = manager.stage(dir, TopicPartition(Topic("mytopic"), 1)) fs.create(stagePath) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) val finalPath = manager.commit(stagePath, tpo) finalPath.getName shouldBe "streamreactor_mytopic_1_100" } } }
Example 189
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 190
Source File: ParquetWriterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive.StructUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class ParquetWriterTest extends AnyWordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "ParquetWriter" should { "write parquet files" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } "support writing nulls" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } } }
Example 191
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTimestamp: Long = System.currentTimeMillis() var lastKnownFileSize:Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def createdTime: Long = createdTimestamp override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 192
Source File: OrcHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde} import com.landoop.streamreactor.connect.hive.orc.OrcSink import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import scala.util.Try object OrcHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating orc writer at $path") val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val cretedTimestamp: Long = System.currentTimeMillis() var lastKnownFileSize:Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { sink.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing orc writer at path $path") sink.close() } override def file: Path = path override def currentCount: Long = count override def createdTime: Long = cretedTimestamp override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating orc reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig()) var offset = startAt override def iterator: Iterator[Record] = reader.iterator.map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 193
Source File: OrcSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc import com.landoop.streamreactor.connect.hive.OrcSourceConfig import com.landoop.streamreactor.connect.hive.orc.vectors.OrcVectorReader.fromSchema import com.landoop.streamreactor.connect.hive.orc.vectors.StructVectorReader import com.typesafe.scalalogging.StrictLogging import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.exec.vector.{StructColumnVector, VectorizedRowBatch} import org.apache.kafka.connect.data.Struct import org.apache.orc.OrcFile.ReaderOptions import org.apache.orc.{OrcFile, Reader} import scala.collection.JavaConverters._ class OrcSource(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) extends StrictLogging { private val reader = OrcFile.createReader(path, new ReaderOptions(fs.getConf)) private val typeDescription = reader.getSchema private val schema = OrcSchemas.toKafka(typeDescription) private val readers = typeDescription.getChildren.asScala.map(fromSchema) private val vectorReader = new StructVectorReader(readers.toIndexedSeq, typeDescription) private val batch = typeDescription.createRowBatch() private val recordReader = reader.rows(new Reader.Options()) def close(): Unit = { recordReader.close() } def iterator: Iterator[Struct] = new Iterator[Struct] { var iter = new BatchIterator(batch) override def hasNext: Boolean = iter.hasNext || { batch.reset() recordReader.nextBatch(batch) iter = new BatchIterator(batch) !batch.endOfFile && batch.size > 0 && iter.hasNext } override def next(): Struct = iter.next() } // iterates over a batch, be careful not to mutate the batch while it is being iterated class BatchIterator(batch: VectorizedRowBatch) extends Iterator[Struct] { var offset = 0 val vector = new StructColumnVector(batch.numCols, batch.cols: _*) override def hasNext: Boolean = offset < batch.size override def next(): Struct = { val struct = vectorReader.read(offset, vector) offset = offset + 1 struct.orNull } } }
Example 194
Source File: OrcSink.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter} import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils} import com.typesafe.scalalogging.StrictLogging import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector import org.apache.kafka.connect.data.{Schema, Struct} import scala.collection.JavaConverters._ class OrcSink(path: Path, schema: Schema, config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging { private val typeDescription = OrcSchemas.toOrc(schema) private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema)) private val batch = typeDescription.createRowBatch(config.batchSize) private val vector = new StructColumnVector(batch.numCols, batch.cols: _*) private val orcWriter = createOrcWriter(path, typeDescription, config) private var n = 0 def flush(): Unit = { logger.debug(s"Writing orc batch [size=$n, path=$path]") batch.size = n orcWriter.addRowBatch(batch) orcWriter.writeIntermediateFooter batch.reset() n = 0 } def write(struct: Struct): Unit = { structWriter.write(vector, n, Some(StructUtils.extractValues(struct))) n = n + 1 if (n == config.batchSize) flush() } def close(): Unit = { if (n > 0) flush() orcWriter.close() } }
Example 195
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema import org.apache.orc.OrcFile.EncodingStrategy import org.apache.orc._ package object orc { def createOrcWriter(path: Path, schema: TypeDescription, config: OrcSinkConfig) (implicit fs: FileSystem): Writer = { val options = OrcFile.writerOptions(null, fs.getConf).setSchema(schema) options.compress(config.compressionKind) options.encodingStrategy(config.encodingStrategy) options.blockPadding(config.blockPadding) options.version(OrcFile.Version.V_0_12) config.bloomFilterColumns.map(_.mkString(",")).foreach(options.bloomFilterColumns) config.rowIndexStride.foreach(options.rowIndexStride) config.blockSize.foreach(options.blockSize) config.stripeSize.foreach(options.stripeSize) if (config.overwrite && fs.exists(path)) fs.delete(path, false) OrcFile.createWriter(path, options) } def source(path: Path, config: OrcSourceConfig) (implicit fs: FileSystem) = new OrcSource(path, config) def sink(path: Path, schema: Schema, config: OrcSinkConfig) (implicit fs: FileSystem) = new OrcSink(path, schema, config) } case class OrcSourceConfig() case class OrcSinkConfig(overwrite: Boolean = false, batchSize: Int = 1024, // orc default is 1024 encodingStrategy: EncodingStrategy = EncodingStrategy.COMPRESSION, compressionKind: CompressionKind = CompressionKind.SNAPPY, blockPadding: Boolean = true, blockSize: Option[Long] = None, stripeSize: Option[Long] = None, bloomFilterColumns: Seq[String] = Nil, rowIndexStride: Option[Int] = None)
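Put together, the sink and source factories above give a small write-then-read round trip. A sketch against the local filesystem; the schema, record and output path are illustrative:

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig}
import com.landoop.streamreactor.connect.hive.orc.{sink, source}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

implicit val fs = FileSystem.getLocal(new Configuration())

val schema = SchemaBuilder.struct()
  .field("name", SchemaBuilder.string().required().build())
  .build()

val path    = new Path("people.orc")                         // hypothetical output file
val orcSink = sink(path, schema, OrcSinkConfig(overwrite = true))
orcSink.write(new Struct(schema).put("name", "sam"))
orcSink.close()

val orcSource = source(path, OrcSourceConfig())
orcSource.iterator.foreach(struct => println(struct.get("name")))
orcSource.close()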