org.apache.hadoop.fs.permission.FsPermission Scala Examples

The following examples show how to use org.apache.hadoop.fs.permission.FsPermission. Each example is taken from an open-source project; the source file, project, and license are listed above each snippet.
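Before the examples, here is a brief illustrative summary (not taken from any of the projects below) of the FsPermission constructors the examples rely on:

import org.apache.hadoop.fs.permission.{FsAction, FsPermission}

// from an octal mode string, as in HiveSpeedTest (Example 6)
val owner700 = new FsPermission("700")

// from explicit user/group/other actions, as in DFSJarStore (Example 8)
val world777 = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)

// from an ls-style symbolic string, as in the stream-reactor formats (Examples 12-14)
val symbolic = FsPermission.valueOf("-rwxrwxrwx")

// an immutable instance from a raw mode short, as in the kyuubi examples (Examples 9-10)
val file644 = FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)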
Example 1
Source File: AvroSink.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.avro

import java.io.File

import io.eels.schema.StructType
import io.eels.{Row, Sink, SinkWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}

case class AvroSink(path: Path,
                    overwrite: Boolean = false,
                    permission: Option[FsPermission] = None,
                    inheritPermissions: Option[Boolean] = None)
                   (implicit conf: Configuration, fs: FileSystem) extends Sink {

  def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite)
  def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission))
  def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions))

  override def open(schema: StructType): SinkWriter = new SinkWriter {

    private val writer = new AvroWriter(schema, fs.create(path, overwrite))

    override def write(row: Row): Unit = writer.write(row)

    override def close(): Unit = {
      writer.close()
      permission match {
        case Some(perm) => fs.setPermission(path, perm)
        case None =>
          if (inheritPermissions.getOrElse(false)) {
            val permission = fs.getFileStatus(path.getParent).getPermission
            fs.setPermission(path, permission)
          }
      }
    }
  }
}

object AvroSink {
  def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString))
  def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile)
} 
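A minimal usage sketch for the sink above, assuming the same imports, an implicit Configuration and FileSystem, and an eel DataStream named ds; the names and path are illustrative, not part of eel-sdk's sources:

implicit val conf: Configuration = new Configuration()
implicit val fs: FileSystem = FileSystem.get(conf)

// write an Avro file and fix its permission to rw-r----- once the writer closes
val sink = AvroSink(new Path("/tmp/users.avro"))
  .withOverwrite(true)
  .withPermission(new FsPermission("640"))
// ds.to(sink)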
Example 2
Source File: ParquetSink.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import io.eels.schema.StructType
import io.eels.{Row, Sink, SinkWriter}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.math.BigDecimal.RoundingMode
import scala.math.BigDecimal.RoundingMode.RoundingMode

case class ParquetWriteOptions(overwrite: Boolean = false,
                               permission: Option[FsPermission] = None,
                               dictionary: Boolean = true,
                               inheritPermissions: Option[Boolean] = None,
                               roundingMode: RoundingMode = RoundingMode.UNNECESSARY,
                               metadata: Map[String, String] = Map.empty) {

  def withOverwrite(overwrite: Boolean): ParquetWriteOptions = copy(overwrite = overwrite)
  def withDictionary(dictionary: Boolean): ParquetWriteOptions = copy(dictionary = dictionary)
  def withMetaData(map: Map[String, String]): ParquetWriteOptions = copy(metadata = map)
  def withPermission(permission: FsPermission): ParquetWriteOptions = copy(permission = permission.some)
  def withInheritPermission(inheritPermissions: Boolean): ParquetWriteOptions = copy(inheritPermissions = inheritPermissions.some)
  def withRoundingMode(mode: RoundingMode): ParquetWriteOptions = copy(roundingMode = mode)
}

case class ParquetSink(path: Path, options: ParquetWriteOptions = ParquetWriteOptions())
                      (implicit fs: FileSystem) extends Sink with Logging {

  // -- convenience methods --
  def withOverwrite(overwrite: Boolean): ParquetSink = copy(options = options.withOverwrite(overwrite))
  def withDictionary(dictionary: Boolean): ParquetSink = copy(options = options.copy(dictionary = dictionary))
  def withMetaData(map: Map[String, String]): ParquetSink = copy(options = options.copy(metadata = map))
  def withPermission(permission: FsPermission): ParquetSink = copy(options = options.copy(permission = permission.some))
  def withInheritPermission(inheritPermissions: Boolean): ParquetSink = copy(options = options.copy(inheritPermissions = inheritPermissions.some))
  def withRoundingMode(mode: RoundingMode): ParquetSink = copy(options = options.copy(roundingMode = mode))

  private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter {

    if (options.overwrite && fs.exists(path))
      fs.delete(path, false)

    val writer = RowParquetWriterFn(path, schema, options.metadata, options.dictionary, options.roundingMode, fs.getConf)

    override def write(row: Row): Unit = {
      writer.write(row)
    }

    override def close(): Unit = {
      writer.close()
      options.permission match {
        case Some(perm) => fs.setPermission(path, perm)
        case None =>
          if (options.inheritPermissions.getOrElse(false)) {
            val permission = fs.getFileStatus(path.getParent).getPermission
            fs.setPermission(path, permission)
          }
      }
    }
  }

  override def open(schema: StructType, n: Int): Seq[SinkWriter] = {
    if (n == 1) Seq(create(schema, path))
    else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) }
  }

  override def open(schema: StructType): SinkWriter = create(schema, path)
}

object ParquetSink {
  def apply(path: String)(implicit fs: FileSystem): ParquetSink = ParquetSink(new Path(path))
} 
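As a hedged usage sketch (the path is illustrative), the permission can be supplied either through ParquetWriteOptions or through the convenience methods on the sink; the two forms below configure the same thing:

implicit val fs: FileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration())

val viaOptions = ParquetSink(new Path("/tmp/out.parquet"),
  ParquetWriteOptions(overwrite = true, permission = Some(new FsPermission("644"))))

val viaMethods = ParquetSink("/tmp/out.parquet")
  .withOverwrite(true)
  .withPermission(new FsPermission("644"))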
Example 3
Source File: ParquetHiveDialect.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive.dialect

import java.util.concurrent.atomic.AtomicInteger

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream}
import io.eels.component.parquet._
import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute}
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat}

import scala.math.BigDecimal.RoundingMode.RoundingMode

case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using {

  override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName
  override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName
  override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName

  override def input(path: Path,
                     ignore: StructType,
                     projectionSchema: StructType,
                     predicate: Option[Predicate])
                    (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] {

    val client = new HiveMetaStoreClient(new HiveConf)
    val ops = new HiveOps(client)

    override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
      // convert the eel projection schema into a parquet schema which will be used by the native parquet reader
      try {
        val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema)
        using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader =>
          val subscription = new Subscription {
            override def cancel(): Unit = reader.close()
          }
          subscriber.subscribed(subscription)
          ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next)
          subscriber.completed()
        }
      } catch {
        case t: Throwable => subscriber.error(t)
      }
    }
  }

  override def output(schema: StructType,
                      path: Path,
                      permission: Option[FsPermission],
                      roundingMode: RoundingMode,
                      metadata: Map[String, String])
                     (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = {
    val path_x = path
    new HiveOutputStream {
      ParquetLogMute()

      private val _records = new AtomicInteger(0)
      logger.debug(s"Creating parquet writer at $path")
      private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf)

      override def write(row: Row): Unit = {
        require(row.values.nonEmpty, "Attempting to write an empty row")
        writer.write(row)
        _records.incrementAndGet()
      }

      override def close(): Unit = {
        logger.debug(s"Closing hive parquet writer $path")
        writer.close()
        // after the file is closed, set the permission if one was requested; this
        // keeps all the files we create consistent
        permission.foreach(fs.setPermission(path, _))
      }

      override def records: Int = _records.get()
      override def path: Path = path_x
    }
  }
} 
Example 4
Source File: OrcHiveDialect.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive.dialect

import com.sksamuel.exts.Logging
import io.eels.component.hive.{HiveDialect, HiveOutputStream}
import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter}
import io.eels.datastream.{Publisher, Subscriber}
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde}

import scala.math.BigDecimal.RoundingMode.RoundingMode

case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging {

  override val serde: String = classOf[OrcSerde].getCanonicalName
  override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName
  override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName

  override def input(path: Path,
                     metastoreSchema: StructType,
                     projectionSchema: StructType,
                     predicate: Option[Predicate])
                    (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] {
    override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
      new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber)
    }
  }

  override def output(schema: StructType,
                      path: Path,
                      permission: Option[FsPermission],
                      roundingMode: RoundingMode,
                      metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = {

    val path_x = path
    val writer = new OrcWriter(path, schema, options)

    new HiveOutputStream {

      override def write(row: Row): Unit = {
        require(row.values.nonEmpty, "Attempting to write an empty row")
        writer.write(row)
      }

      override def close(): Unit = {
        writer.close()
        permission.foreach(fs.setPermission(path, _))
      }

      override def records: Int = writer.records
      override def path: Path = path_x
    }
  }
} 
Example 5
Source File: HiveDialect.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.Logging
import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect}
import io.eels.datastream.Publisher
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.api.Table

import scala.math.BigDecimal.RoundingMode.RoundingMode

trait HiveDialect extends Logging {

  def serde: String
  def inputFormat: String
  def outputFormat: String

  
  // Restored for completeness (assumed from the overriding dialects above, which
  // implement this member): reads rows from a file written in this dialect's format.
  def input(path: Path,
            metastoreSchema: StructType,
            projectionSchema: StructType,
            predicate: Option[Predicate])
           (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]]

  def output(schema: StructType, // schema without partition information
             path: Path,
             permission: Option[FsPermission],
             roundingMode: RoundingMode,
             metadata: Map[String, String])
            (implicit fs: FileSystem, conf: Configuration): HiveOutputStream

  def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException
}

object HiveDialect extends Logging {

  def apply(format: String): HiveDialect = format match {
    case input if input.contains("ParquetInputFormat") => ParquetHiveDialect()
    case input if input.contains("OrcInputFormat") => OrcHiveDialect()
    //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect
    //      "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect
    case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format")
  }

  def apply(table: Table): HiveDialect = {
    val format = table.getSd.getInputFormat
    logger.debug(s"Table format is $format")
    val dialect = HiveDialect(format)
    logger.debug(s"HiveDialect is $dialect")
    dialect
  }
} 
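For illustration only (the format strings are the canonical class names already referenced by the dialects above), the factory resolves a dialect by substring match on the table's input format:

val parquet = HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat") // ParquetHiveDialect()
val orc     = HiveDialect("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")               // OrcHiveDialect()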
Example 6
Source File: HiveSpeedTest.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.metrics.Timed
import io.eels.Row
import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.fs.permission.FsPermission

import scala.util.Random


object HiveSpeedTest extends App with Timed {

  import HiveConfig._

  val Database = "sam"
  val Table = "speedtest"

  val schema = StructType("artist", "album", "year")
  val data = Array(
    Vector("elton", "yellow brick road ", "1972"),
    Vector("elton", "tumbleweed connection", "1974"),
    Vector("elton", "empty sky", "1969"),
    Vector("beatles", "white album", "1969"),
    Vector("beatles", "tumbleweed connection", "1966"),
    Vector("pinkfloyd", "the wall", "1979"),
    Vector("pinkfloyd", "dark side of the moon", "1974"),
    Vector("pinkfloyd", "emily", "1966")
  )

  def createRow = Row(schema, data(Random.nextInt(data.length)))
  val size = 10000000

  while (true) {

    val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size))
      .addField("bibble", "myvalue")
      .addField("timestamp", System.currentTimeMillis.toString)
    println(ds.schema.show())

    HiveTable(Database, Table).drop(true)

    ops.createTable(
      Database,
      Table,
      ds.schema,
      List("artist"),
      overwrite = true
    )

    timed("writing data") {
      val sink = HiveSink(Database, Table).withPermission(new FsPermission("700"))
      ds.to(sink, 4)
      logger.info("Write complete")
    }

    timed("reading data") {
      val source = HiveSource(Database, Table)
      source.toDataStream().size
      logger.info("Read complete")
    }

    Thread.sleep(5000)
  }
} 
Example 7
Source File: OrcSink.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.orc

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.config.ConfigSupport
import com.typesafe.config.ConfigFactory
import io.eels.schema.StructType
import io.eels.{Row, Sink, SinkWriter}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.orc.OrcFile.{CompressionStrategy, EncodingStrategy}
import org.apache.orc.OrcProto.CompressionKind

case class OrcWriteOptions(overwrite: Boolean = false,
                           compressionKind: CompressionKind,
                           compressionStrategy: CompressionStrategy,
                           compressionBufferSize: Option[Int],
                           encodingStrategy: Option[EncodingStrategy],
                           bloomFilterColumns: Seq[String] = Nil,
                           permission: Option[FsPermission] = None,
                           inheritPermissions: Option[Boolean] = None,
                           rowIndexStride: Option[Int] = None) {
  def withCompressionKind(kind: CompressionKind): OrcWriteOptions = copy(compressionKind = kind)
  def withCompressionStrategy(strategy: CompressionStrategy): OrcWriteOptions = copy(compressionStrategy = strategy)
  def withCompressionBufferSize(size: Int): OrcWriteOptions = copy(compressionBufferSize = size.some)
  def withEncodingStrategy(strategy: EncodingStrategy): OrcWriteOptions = copy(encodingStrategy = strategy.some)
  def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcWriteOptions = copy(bloomFilterColumns = bloomFilterColumns)
  def withRowIndexStride(stride: Int): OrcWriteOptions = copy(rowIndexStride = stride.some)
  def withOverwrite(overwrite: Boolean): OrcWriteOptions = copy(overwrite = overwrite)
  def withPermission(permission: FsPermission): OrcWriteOptions = copy(permission = permission.some)
  def withInheritPermission(inheritPermissions: Boolean): OrcWriteOptions = copy(inheritPermissions = inheritPermissions.some)
}

object OrcWriteOptions extends ConfigSupport {

  // builds the default options from the Typesafe config (reference.conf)
  def apply(): OrcWriteOptions = {
    val config = ConfigFactory.load()
    OrcWriteOptions(
      false,
      CompressionKind valueOf config.getString("eel.orc.writer.compression-kind"),
      CompressionStrategy valueOf config.getString("eel.orc.writer.compression-strategy"),
      config.getIntOpt("eel.orc.writer.compression-buffer-size"),
      config.getStringOpt("eel.orc.writer.encoding-strategy").map(EncodingStrategy.valueOf)
    )
  }
}

case class OrcSink(path: Path, options: OrcWriteOptions = OrcWriteOptions())
                  (implicit fs: FileSystem, conf: Configuration) extends Sink with Logging {

  // -- convenience options --
  def withCompressionKind(kind: CompressionKind): OrcSink = copy(options = options.copy(compressionKind = kind))
  def withCompressionStrategy(strategy: CompressionStrategy): OrcSink = copy(options = options.copy(compressionStrategy = strategy))
  def withCompressionBufferSize(size: Int): OrcSink = copy(options = options.copy(compressionBufferSize = size.some))
  def withEncodingStrategy(strategy: EncodingStrategy): OrcSink = copy(options = options.copy(encodingStrategy = strategy.some))
  def withBloomFilterColumns(bloomFilterColumns: Seq[String]): OrcSink = copy(options = options.copy(bloomFilterColumns = bloomFilterColumns))
  def withRowIndexStride(stride: Int): OrcSink = copy(options = options.copy(rowIndexStride = stride.some))
  def withOverwrite(overwrite: Boolean): OrcSink = copy(options = options.copy(overwrite = overwrite))
  def withPermission(permission: FsPermission): OrcSink = copy(options = options.copy(permission = permission.some))
  def withInheritPermission(inheritPermissions: Boolean): OrcSink = copy(options = options.copy(inheritPermissions = inheritPermissions.some))

  override def open(schema: StructType, n: Int): Seq[SinkWriter] = {
    if (n == 1) Seq(create(schema, path))
    else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) }
  }

  override def open(schema: StructType): SinkWriter = create(schema, path)

  private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter {

    if (options.overwrite && fs.exists(path))
      fs.delete(path, false)

    val writer = new OrcWriter(path, schema, options)

    override def write(row: Row): Unit = writer.write(row)
    
    override def close(): Unit = {
      writer.close()
      options.permission match {
        case Some(perm) => fs.setPermission(path, perm)
        case None =>
          if (options.inheritPermissions.getOrElse(false)) {
            val permission = fs.getFileStatus(path.getParent).getPermission
            fs.setPermission(path, permission)
          }
      }
    }
  }
} 
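A brief, illustrative sketch of the inherit-permissions path (the output path is hypothetical): when no explicit permission is given, close() copies the parent directory's permission onto the newly written file.

implicit val conf: Configuration = new Configuration()
implicit val fs: FileSystem = FileSystem.get(conf)

val sink = OrcSink(new Path("/data/out/part-0.orc"))
  .withOverwrite(true)
  .withInheritPermission(true) // on close: fs.setPermission(file, permission of /data/out)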
Example 8
Source File: DFSJarStore.scala    From incubator-retired-gearpump    with Apache License 2.0
package org.apache.gearpump.jarstore.dfs

import java.io.{InputStream, OutputStream}
import org.apache.gearpump.util.Constants
import org.apache.gearpump.jarstore.JarStore
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import com.typesafe.config.Config
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}


// Excerpt: the enclosing class is assumed from the imports above; its other
// members (e.g. init(config) and createFile(fileName)) are not shown here.
class DFSJarStore extends JarStore {

  // root directory for stored jars; set from configuration in init(), omitted here
  private var rootPath: Path = _

  override def getFile(fileName: String): InputStream = {
    val filePath = new Path(rootPath, fileName)
    val fs = filePath.getFileSystem(new Configuration())
    fs.open(filePath)
  }

  private def createDirIfNotExists(path: Path): Unit = {
    val fs = path.getFileSystem(new Configuration())
    if (!fs.exists(path)) {
      fs.mkdirs(path, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL))
    }
  }
} 
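As a side note (illustrative, not gearpump code), the FsAction-based constructor used in createDirIfNotExists denotes the same permission as the octal form:

val viaActions = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)
val viaOctal   = new FsPermission("777")
assert(viaActions == viaOctal) // both are rwxrwxrwx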
Example 9
Source File: JsonFileReporter.scala    From kyuubi    with Apache License 2.0
package yaooqinn.kyuubi.metrics

import java.io.{BufferedWriter, Closeable, IOException, OutputStreamWriter}
import java.util.{Timer, TimerTask}
import java.util.concurrent.TimeUnit

import scala.util.Try
import scala.util.control.NonFatal

import com.codahale.metrics.MetricRegistry
import com.codahale.metrics.json.MetricsModule
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kyuubi.Logging
import org.apache.spark.{KyuubiSparkUtil, SparkConf}
import org.apache.spark.KyuubiConf._

private[metrics] class JsonFileReporter(conf: SparkConf, registry: MetricRegistry)
  extends Closeable with Logging {

  private val jsonMapper = new ObjectMapper().registerModule(
    new MetricsModule(TimeUnit.MILLISECONDS, TimeUnit.MILLISECONDS, false))
  private val timer = new Timer(true)
  private val interval = KyuubiSparkUtil.timeStringAsMs(conf.get(METRICS_REPORT_INTERVAL))
  private val path = conf.get(METRICS_REPORT_LOCATION)
  private val hadoopConf = KyuubiSparkUtil.newConfiguration(conf)

  def start(): Unit = {
    timer.schedule(new TimerTask {
      var bw: BufferedWriter = _
      override def run(): Unit = try {
        val json = jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(registry)
        val tmpPath = new Path(path + ".tmp")
        val tmpPathUri = tmpPath.toUri
        val fs = if (tmpPathUri.getScheme == null && tmpPathUri.getAuthority == null) {
          FileSystem.getLocal(hadoopConf)
        } else {
          FileSystem.get(tmpPathUri, hadoopConf)
        }
        fs.delete(tmpPath, true)
        bw = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath, true)))
        bw.write(json)
        bw.close()
        fs.setPermission(tmpPath, FsPermission.createImmutable(Integer.parseInt("644", 8).toShort))
        val finalPath = new Path(path)
        fs.rename(tmpPath, finalPath)
        fs.setPermission(finalPath,
          FsPermission.createImmutable(Integer.parseInt("644", 8).toShort))
      } catch {
        case NonFatal(e) => error("Error writing metrics to json file " + path, e)
      } finally {
        if (bw != null) {
          Try(bw.close())
        }
      }
    }, 0, interval)
  }

  override def close(): Unit = {
    timer.cancel()
  }
} 
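A worked example of the permission construction used above (illustrative): Integer.parseInt("644", 8) parses the octal string into the raw mode 420, and createImmutable wraps that short as rw-r--r--.

val mode: Short = Integer.parseInt("644", 8).toShort // 420 decimal == 0644 octal
val perm = FsPermission.createImmutable(mode)
// perm.toString == "rw-r--r--"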
Example 10
Source File: package.scala    From kyuubi    with Apache License 2.0
package yaooqinn.kyuubi

import scala.collection.mutable.HashMap

import org.apache.hadoop.fs.permission.FsPermission

package object yarn {

  type EnvMap = HashMap[String, String]

  val KYUUBI_YARN_APP_NAME = "KYUUBI SERVER"
  val KYUUBI_YARN_APP_TYPE = "KYUUBI"
  // Staging directory for any temporary jars or files
  val KYUUBI_STAGING: String = ".kyuubiStaging"
  // Staging directory is private! -> rwx--------
  val STAGING_DIR_PERMISSION: FsPermission =
    FsPermission.createImmutable(Integer.parseInt("700", 8).toShort)
  // App files are world-wide readable and owner writable -> rw-r--r--
  val APP_FILE_PERMISSION: FsPermission =
    FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)

  val SPARK_CONF_DIR = "__spark_conf__"
  val SPARK_CONF_FILE = "__spark_conf__.properties"
  // Subdirectory in the conf directory containing Hadoop config files.
  val HADOOP_CONF_DIR = "__hadoop_conf__"
  // File containing the conf archive in the AM. See prepareLocalResources().
  val SPARK_CONF_ARCHIVE: String = SPARK_CONF_DIR + ".zip"
  val SPARK_LIB_DIR = "__spark_libs__"
  val LOCAL_SCHEME = "local"
} 
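A hypothetical sketch of how these constants could be applied when preparing a staging directory; the paths and the FileSystem handle are illustrative, not kyuubi code:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
val stagingDir = new Path(s"/user/kyuubi/$KYUUBI_STAGING")

fs.mkdirs(stagingDir, STAGING_DIR_PERMISSION)        // rwx------ for the staging dir
// fs.setPermission(uploadedJar, APP_FILE_PERMISSION) // rw-r--r-- for uploaded app files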
Example 11
Source File: SparkHadoopUtilSuite.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.deploy

import java.security.PrivilegedExceptionAction

import scala.util.Random

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}
import org.apache.hadoop.security.UserGroupInformation
import org.scalatest.Matchers

import org.apache.spark.SparkFunSuite

class SparkHadoopUtilSuite extends SparkFunSuite with Matchers {
  test("check file permission") {
    import FsAction._
    val testUser = s"user-${Random.nextInt(100)}"
    val testGroups = Array(s"group-${Random.nextInt(100)}")
    val testUgi = UserGroupInformation.createUserForTesting(testUser, testGroups)

    testUgi.doAs(new PrivilegedExceptionAction[Void] {
      override def run(): Void = {
        val sparkHadoopUtil = new SparkHadoopUtil

        // If file is owned by user and user has access permission
        var status = fileStatus(testUser, testGroups.head, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user but user has no access permission
        status = fileStatus(testUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        val otherUser = s"test-${Random.nextInt(100)}"
        val otherGroup = s"test-${Random.nextInt(100)}"

        // If file is owned by user's group and user's group has access permission
        status = fileStatus(otherUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user's group but user's group has no access permission
        status = fileStatus(otherUser, testGroups.head, READ_WRITE, NONE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        // If file is owned by other user and this user has access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, READ_WRITE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by other user but this user has no access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        null
      }
    })
  }

  private def fileStatus(
      owner: String,
      group: String,
      userAction: FsAction,
      groupAction: FsAction,
      otherAction: FsAction): FileStatus = {
    new FileStatus(0L,
      false,
      0,
      0L,
      0L,
      0L,
      new FsPermission(userAction, groupAction, otherAction),
      owner,
      group,
      null)
  }
} 
Example 12
Source File: ParquetHiveFormat.scala    From stream-reactor    with Apache License 2.0
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTime: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }

      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
} 
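One note on the permission string above (illustrative, not project code): FsPermission.valueOf expects an ls-style string including the leading file-type character, so "-rwxrwxrwx" corresponds to octal 777; the call is wrapped in Try presumably because setPermission can fail on file systems that do not support it.

val open = FsPermission.valueOf("-rwxrwxrwx")
// open.toShort == Integer.parseInt("777", 8).toShort // 511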
Example 13
Source File: ParquetHiveFormat.scala    From stream-reactor    with Apache License 2.0
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize:Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    override def createdTime: Long = createdTimestamp
    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }

      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
} 
Example 14
Source File: OrcHiveFormat.scala    From stream-reactor    with Apache License 2.0
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde}
import com.landoop.streamreactor.connect.hive.orc.OrcSink
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

object OrcHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {
    logger.debug(s"Creating orc writer at $path")

    val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      sink.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing orc writer at path $path")
      sink.close()
    }
    override def file: Path = path
    override def currentCount: Long = count
    override def createdTime: Long = createdTimestamp
    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }

      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating orc reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig())
    var offset = startAt

    override def iterator: Iterator[Record] = reader.iterator.map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}