org.apache.hadoop.fs.FileStatus Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileStatus.
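Before the examples, here is a minimal sketch of the most common way a FileStatus is obtained and inspected — by listing a path through a Hadoop FileSystem. The directory path is illustrative and not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object FileStatusBasics {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val dir = new Path("/tmp")                         // illustrative path
    val fs: FileSystem = dir.getFileSystem(conf)

    // listStatus returns one FileStatus per child of the directory
    val children: Array[FileStatus] = fs.listStatus(dir)
    children.foreach { status =>
      // the accessors used throughout the examples below
      println(s"${status.getPath} len=${status.getLen} " +
        s"dir=${status.isDirectory} mtime=${status.getModificationTime}")
    }
  }
}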
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
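The fragment above only computes the codec extension. As a hedged sketch (not part of the original file), the returned value is typically appended when building a task's output file name; the naming pattern below is an assumption for illustration only.

// Hypothetical helper: combine the codec extension with a part-file name.
def workFileName(context: TaskAttemptContext, split: Int): String = {
  val extension = getCompressionExtension(context) // e.g. ".gz" when GzipCodec is configured, "" otherwise
  f"part-$split%05d.txt$extension"
}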
Example 2
Source File: SerializableFileStatus.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util

import java.util.Objects

import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}

case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    modificationTime: Long) {

  // Important note! This is very expensive to compute, but we don't want to cache it
  // as a `val` because Paths internally contain URIs and therefore consume lots of memory.
  def getPath: Path = new Path(path)
  def getLen: Long = length
  def getModificationTime: Long = modificationTime
  def isDirectory: Boolean = isDir

  def toFileStatus: FileStatus = {
    new LocatedFileStatus(
      new FileStatus(length, isDir, 0, 0, modificationTime, new Path(path)),
      null)
  }

  override def equals(obj: Any): Boolean = obj match {
    case other: SerializableFileStatus =>
      // We only compare the paths to stay consistent with FileStatus.equals.
      Objects.equals(path, other.path)
    case _ => false
  }

  override def hashCode(): Int = {
    // We only use the path to stay consistent with FileStatus.hashCode.
    Objects.hashCode(path)
  }
}

object SerializableFileStatus {
  def fromStatus(status: FileStatus): SerializableFileStatus = {
    SerializableFileStatus(
      Option(status.getPath).map(_.toString).orNull,
      status.getLen,
      status.isDirectory,
      status.getModificationTime)
  }

  val EMPTY: SerializableFileStatus = fromStatus(new FileStatus())
}
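A brief usage sketch (not from the Delta sources): the intended round trip between Hadoop's FileStatus and this serializable wrapper, for example when shipping a driver-side listing to executors. The directory path is illustrative.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Hypothetical listing converted for serialization and back.
val dir = new Path("/data/table")
val fs = dir.getFileSystem(new Configuration())
val serializable = fs.listStatus(dir).map(SerializableFileStatus.fromStatus)
val restored = serializable.map(_.toFileStatus)    // FileStatus instances again, e.g. on an executor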
Example 3
Source File: BgenSchemaInferrer.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.bgen

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

import io.projectglow.common.{BgenOptions, CommonOptions, VariantSchemas, WithUtils}
import io.projectglow.sql.util.SerializableConfiguration

object BgenSchemaInferrer {
  def inferSchema(
      spark: SparkSession,
      files: Seq[FileStatus],
      options: Map[String, String]): StructType = {
    val shouldIncludeSampleIds = options.get(CommonOptions.INCLUDE_SAMPLE_IDS).forall(_.toBoolean)
    if (!shouldIncludeSampleIds) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = false)
    }

    val sampleIdsFromSampleFile =
      BgenFileFormat.getSampleIds(options, spark.sparkContext.hadoopConfiguration)
    if (sampleIdsFromSampleFile.isDefined) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = true)
    }

    val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
    val ignoreExtension = options.get(BgenOptions.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
    val bgenPaths = files.filter { fs =>
      fs.getLen > 0 && (fs
        .getPath
        .toString
        .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
    }.map(_.getPath.toString)

    val hasSampleIds = spark
      .sparkContext
      .parallelize(bgenPaths)
      .map { path =>
        val hPath = new Path(path)
        val hadoopFs = hPath.getFileSystem(serializableConf.value)
        WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
          val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
          new BgenHeaderReader(littleEndianDataInputStream)
            .readHeader(None)
            .sampleIds
            .exists(_.nonEmpty)
        }
      }
      .collect()
      .exists(identity)

    VariantSchemas.bgenDefaultSchema(hasSampleIds)
  }
}
Example 4
Source File: XyzRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.xyz

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.sources.HadoopFsRelation
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.FileStatus

object XyzRelation {
  val xyzrgbSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false),
    StructField("r", ByteType, false),
    StructField("g", ByteType, false),
    StructField("b", ByteType, false)
  ))

  val xyzSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false)
  ))
}

class XyzRelation(
  override val paths: Array[String],
  maybeDataSchema: Option[StructType],
  override val userDefinedPartitionColumns: Option[StructType],
  parameters: Map[String, String]
)(@transient val sqlContext: SQLContext)
    extends HadoopFsRelation {

  override lazy val dataSchema = maybeDataSchema.getOrElse(XyzRelation.xyzrgbSchema)

  override def prepareJobForWrite(job: org.apache.hadoop.mapreduce.Job): org.apache.spark.sql.sources.OutputWriterFactory = ???

  override def buildScan(inputs: Array[FileStatus]): RDD[Row] = {
    val lines = sqlContext.sparkContext.textFile(inputs.map(_.getPath).mkString("", ",", ""))
    val dataTypes = dataSchema.fields.map(_.dataType)
    lines map (line =>
      Row.fromSeq((line.split("\t") zip dataTypes).map {
        case (x, StringType) => x
        case (x, ByteType) => x.toByte
        case (x, ShortType) => x.toShort
        case (x, IntegerType) => x.toInt
        case (x, LongType) => x.toLong
        case (x, FloatType) => x.toFloat
        case (x, DoubleType) => x.toDouble
        case _ => null
      }.padTo(dataTypes.size, null)))
  }
}
Example 5
Source File: S3ALineCountWritebackSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3

import scala.concurrent.duration._
import scala.language.postfixOps

import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource
import org.apache.hadoop.fs.{FileStatus, Path}

class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override def enabled: Boolean = super.enabled && hasCSVTestFile

  override def cleanFSInTeardownEnabled: Boolean = true

  after {
    cleanFilesystemInTeardown()
  }

  ctest("LineCountWriteback",
    "Execute the LineCount example with the results written back to the test filesystem.") {
    val sourceFile = getTestCSVPath()
    val sourceFS = sourceFile.getFileSystem(getConf)
    val sourceInfo = sourceFS.getFileStatus(sourceFile)
    val sparkConf = newSparkConf()
    sparkConf.setAppName("LineCount")
    val destDir = testPath(filesystem, "LineCountWriteback")
    assert(0 === S3ALineCount.action(sparkConf, Array(sourceFile.toString, destDir.toString)))

    val status = filesystem.getFileStatus(destDir)
    assert(status.isDirectory, s"Not a directory: $status")

    // only a small fraction of the source data is needed
    val expectedLen = sourceInfo.getLen / 1024

    def validateChildSize(qualifier: String, files: Seq[FileStatus]) = {
      val (filenames, size) = enumFileSize(destDir, files)
      logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames")
      assert(size >= expectedLen,
        s"$qualifier size $size in files $filenames" +
          s" smaller than expected length $expectedLen")
    }

    val stdInterval = interval(100 milliseconds)
    val appId = eventually(timeout(20 seconds), stdInterval) {
      validateChildSize("descendants",
        listFiles(filesystem, destDir, true)
          .filter(f => f.getPath.getName != "_SUCCESS"))

      validateChildSize("children",
        filesystem.listStatus(destDir, pathFilter(p => p.getName != "_SUCCESS")).toSeq)
    }
  }

  private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long) = {
    assert(files.nonEmpty, s"No files in destination directory $destDir")
    var size = 0L
    val filenames = new StringBuffer()
    files.foreach { f =>
      size += f.getLen
      filenames.append(" ").append(f.getPath)
    }
    (filenames.toString, size)
  }
}
Example 6
Source File: SerializableFileStatus.scala From parquet-index with Apache License 2.0 | 5 votes |
package com.github.lightcopy.util

import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}

object SerializableFileStatus {
  def fromFileStatus(status: FileStatus): SerializableFileStatus = {
    val blockLocations = status match {
      case f: LocatedFileStatus =>
        f.getBlockLocations.map { loc =>
          SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
        }
      case _ =>
        Array.empty[SerializableBlockLocation]
    }

    SerializableFileStatus(
      status.getPath.toString,
      status.getLen,
      status.isDirectory,
      status.getReplication,
      status.getBlockSize,
      status.getModificationTime,
      status.getAccessTime,
      blockLocations)
  }

  def toFileStatus(status: SerializableFileStatus): FileStatus = {
    val blockLocations = status.blockLocations.map { loc =>
      new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
    }
    new LocatedFileStatus(
      new FileStatus(
        status.length,
        status.isDir,
        status.blockReplication,
        status.blockSize,
        status.modificationTime,
        new Path(status.path)),
      blockLocations)
  }
}
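A short usage sketch (not part of the parquet-index sources): unlike the Delta variant above, this wrapper also carries block locations, so listing with listLocatedStatus preserves them across the round trip. The path is illustrative.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Hypothetical round trip; listLocatedStatus yields LocatedFileStatus, so block locations survive.
val dir = new Path("/data/table")
val fs = dir.getFileSystem(new Configuration())
val it = fs.listLocatedStatus(dir)
while (it.hasNext) {
  val serializable = SerializableFileStatus.fromFileStatus(it.next())
  val restored = SerializableFileStatus.toFileStatus(serializable)
}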
Example 7
Source File: SageMakerProtobufFileFormat.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]): Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
}
Example 8
Source File: SparkHadoopUtilSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy

import java.security.PrivilegedExceptionAction

import scala.util.Random

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}
import org.apache.hadoop.security.UserGroupInformation
import org.scalatest.Matchers

import org.apache.spark.SparkFunSuite

class SparkHadoopUtilSuite extends SparkFunSuite with Matchers {
  test("check file permission") {
    import FsAction._
    val testUser = s"user-${Random.nextInt(100)}"
    val testGroups = Array(s"group-${Random.nextInt(100)}")
    val testUgi = UserGroupInformation.createUserForTesting(testUser, testGroups)

    testUgi.doAs(new PrivilegedExceptionAction[Void] {
      override def run(): Void = {
        val sparkHadoopUtil = new SparkHadoopUtil

        // If file is owned by user and user has access permission
        var status = fileStatus(testUser, testGroups.head, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user but user has no access permission
        status = fileStatus(testUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        val otherUser = s"test-${Random.nextInt(100)}"
        val otherGroup = s"test-${Random.nextInt(100)}"

        // If file is owned by user's group and user's group has access permission
        status = fileStatus(otherUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user's group but user's group has no access permission
        status = fileStatus(otherUser, testGroups.head, READ_WRITE, NONE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        // If file is owned by other user and this user has access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, READ_WRITE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by other user but this user has no access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        null
      }
    })
  }

  private def fileStatus(
      owner: String,
      group: String,
      userAction: FsAction,
      groupAction: FsAction,
      otherAction: FsAction): FileStatus = {
    new FileStatus(0L, false, 0, 0L, 0L, 0L,
      new FsPermission(userAction, groupAction, otherAction),
      owner, group, null)
  }
}
Example 9
Source File: FileStreamSinkLog.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 10
Source File: MetadataLogFileIndex.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType

class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userPartitionSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userPartitionSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.toArray.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
}
Example 11
Source File: KyuubiDistributedCacheManagerSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility}
import org.apache.hadoop.yarn.util.ConverterUtils
import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite}
import org.mockito.Mockito.when
import org.scalatest.mock.MockitoSugar

import yaooqinn.kyuubi.utils.ReflectUtils

class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {

  class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
    override def getVisibility(conf: Configuration, uri: URI,
        statCache: Map[URI, FileStatus]): LocalResourceVisibility = {
      LocalResourceVisibility.PRIVATE
    }
  }

  test("add resource") {
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val status = new FileStatus()
    when(fs.getFileStatus(destPath)).thenReturn(status)
    val fileLink = "link"
    ReflectUtils.setFieldValue(
      KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager)
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache)
    val res = localResources(fileLink)
    assert(res.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath)
    assert(res.getSize === 0)
    assert(res.getTimestamp === 0)
    assert(res.getType === LocalResourceType.FILE)

    val status2 = new FileStatus(
      10, false, 1, 1024, 10, 10, null, KyuubiSparkUtil.getCurrentUserName, null,
      new Path("/tmp/testing2"))
    val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2")
    when(fs.getFileStatus(destPath2)).thenReturn(status2)
    val fileLink2 = "link2"
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache)
    val res2 = localResources(fileLink2)
    assert(res2.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2)
    assert(res2.getSize === 10)
    assert(res2.getTimestamp === 10)
    assert(res2.getType === LocalResourceType.FILE)
  }

  test("add resource when link null") {
    val distMgr = new MockClientDistributedCacheManager()
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
    intercept[Exception] {
      KyuubiDistributedCacheManager.addResource(
        fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache)
    }
    assert(localResources.get("link") === None)
    assert(localResources.size === 0)
  }

  test("test addResource archive") {
    val distMgr = new MockClientDistributedCacheManager()
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", null,
      new Path("/tmp/testing"))
    when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache)
    val resource = localResources("link")
    assert(resource.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath)
    assert(resource.getTimestamp === 10)
    assert(resource.getSize === 10)
    assert(resource.getType === LocalResourceType.ARCHIVE)
  }
}
Example 12
Source File: KyuubiDistributedCacheManager.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType}

  def addResource(
      fs: FileSystem,
      conf: Configuration,
      destPath: Path,
      localResources: HashMap[String, LocalResource],
      resourceType: LocalResourceType,
      link: String,
      statCache: Map[URI, FileStatus]): Unit = {
    cacheManager.addResource(fs, conf, destPath, localResources,
      resourceType, link, statCache, appMasterOnly = true)
  }
}
Example 13
Source File: FileStreamSourceSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 14
Source File: FileStreamSinkLog.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 15
Source File: BinaryFileReader.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark

import com.microsoft.ml.spark.core.env.StreamUtilities
import com.microsoft.ml.spark.core.schema.BinaryFileSchema
import com.microsoft.ml.spark.core.utils.AsyncUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.spark.binary.BinaryFileFormat
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.binary.ConfUtils
import org.apache.spark.sql.types.BinaryType

import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.Duration

object BinaryFileReader {

  private def recursePath(fileSystem: FileSystem,
                          path: Path,
                          pathFilter: FileStatus => Boolean,
                          visitedSymlinks: Set[Path]): Array[Path] = {
    val filteredPaths = fileSystem.listStatus(path).filter(pathFilter)
    val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath))
    val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath)
    filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath)
      .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound))
  }

  def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] = {
    recursePath(fileSystem, path, pathFilter, Set())
  }

  def readFromPaths(df: DataFrame,
                    pathCol: String,
                    bytesCol: String,
                    concurrency: Int,
                    timeout: Int
                   ): DataFrame = {
    val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true)
    val encoder = RowEncoder(outputSchema)
    val hconf = ConfUtils.getHConf(df)

    df.mapPartitions { rows =>
      val futures = rows.map { row: Row =>
        Future {
          val path = new Path(row.getAs[String](pathCol))
          val fs = path.getFileSystem(hconf.value)
          val bytes = StreamUtilities.using(fs.open(path)) { is => IOUtils.toByteArray(is) }.get
          val ret = Row.merge(Seq(row, Row(bytes)): _*)
          ret
        }(ExecutionContext.global)
      }
      AsyncUtils.bufferedAwait(
        futures, concurrency, Duration.fromNanos(timeout*(20^6).toLong))(ExecutionContext.global)
    }(encoder)
  }
}
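A hedged usage sketch for readFromPaths (the column names and timeout value are illustrative, not from the original file): given a DataFrame with a string column of file paths, it appends a binary column containing each file's bytes.

// Hypothetical call: pathsDf has a string column "path" listing the files to read.
val withBytes = BinaryFileReader.readFromPaths(
  pathsDf,
  pathCol = "path",
  bytesCol = "bytes",
  concurrency = 8,
  timeout = 60000)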
Example 16
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage

import java.io.{BufferedReader, FileNotFoundException, InputStreamReader}
import java.nio.charset.StandardCharsets.UTF_8
import java.nio.file.FileAlreadyExistsException
import java.util.UUID

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

  protected def writeWithRename(
      path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = {
    val fs = path.getFileSystem(getHadoopConfiguration)

    if (!fs.exists(path.getParent)) {
      throw new FileNotFoundException(s"No such file or directory: ${path.getParent}")
    }
    if (overwrite) {
      val stream = fs.create(path, true)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
      } finally {
        stream.close()
      }
    } else {
      if (fs.exists(path)) {
        throw new FileAlreadyExistsException(path.toString)
      }
      val tempPath = createTempPath(path)
      var streamClosed = false // This flag is to avoid double close
      var renameDone = false // This flag is to save the delete operation in most of cases.
      val stream = fs.create(tempPath)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
        stream.close()
        streamClosed = true
        try {
          if (fs.rename(tempPath, path)) {
            renameDone = true
          } else {
            if (fs.exists(path)) {
              throw new FileAlreadyExistsException(path.toString)
            } else {
              throw new IllegalStateException(s"Cannot rename $tempPath to $path")
            }
          }
        } catch {
          case _: org.apache.hadoop.fs.FileAlreadyExistsException =>
            throw new FileAlreadyExistsException(path.toString)
        }
      } finally {
        if (!streamClosed) {
          stream.close()
        }
        if (!renameDone) {
          fs.delete(tempPath, false)
        }
      }
    }
  }

  protected def createTempPath(path: Path): Path = {
    new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp")
  }

  override def invalidateCache(): Unit = {}
}
Example 17
Source File: FileNames.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util

import org.apache.spark.sql.delta.DeltaErrors
import org.apache.hadoop.fs.{FileStatus, Path}

  def getFileVersion(path: Path): Long = {
    if (isCheckpointFile(path)) {
      checkpointVersion(path)
    } else if (isDeltaFile(path)) {
      deltaVersion(path)
    } else if (isChecksumFile(path)) {
      checksumVersion(path)
    } else {
      // scalastyle:off throwerror
      throw new AssertionError(
        s"Unexpected file type found in transaction log: $path")
      // scalastyle:on throwerror
    }
  }
}
Example 18
Source File: HdfsUtils.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.utils

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import scala.collection.mutable

object HdfsUtils {

  def renameFiles(fromBase: Path, toBase: Path, fs: FileSystem) = {
    if (fs.exists(fromBase)) {
      val filesToMove = listFiles(fromBase, fs)
      println("files to move:")
      filesToMove foreach (p => println("+++" + p.toString))
      filesToMove foreach { file =>
        val relPath = relativize(fromBase, file)
        val toPath = new Path(toBase, relPath)
        fs.mkdirs(toPath.getParent)
        fs.rename(file, toPath)
        println(" file renamed to: " + toPath.toString)
      }
    }
  }

  def relativize(base: Path, files: List[Path]) = {
    files map (file => new Path(base.toUri.relativize(file.toUri).getPath))
  }

  def relativize(base: Path, file: Path): Path = {
    new Path(base.toUri.relativize(file.toUri).getPath)
  }

  def listFiles(path: Path, fs: FileSystem): List[Path] = {
    val statusList = mutable.MutableList[FileStatus]()
    traverse(path, statusList, fs)
    statusList.map(status => new Path(status.getPath.toUri.getPath)).toList
  }

  private def traverse(path: Path, list: mutable.MutableList[FileStatus], fs: FileSystem): Unit = {
    fs.listStatus(path) foreach { status =>
      if (!status.isDirectory) {
        list += status
      } else {
        traverse(status.getPath, list, fs)
      }
    }
  }
}
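A short usage sketch (paths are illustrative): moving everything under a staging directory into a target directory while preserving the relative layout.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical staging-to-final move on the default FileSystem.
val fs = FileSystem.get(new Configuration())
HdfsUtils.renameFiles(new Path("/staging/run-42"), new Path("/final"), fs)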
Example 19
Source File: FileStreamSourceSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 20
Source File: FileStreamSinkLog.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 21
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {
      val sqlConf = sparkSession.sessionState.conf;
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish();

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)

      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
}
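Because the format registers the short name "arrow" through DataSourceRegister, a hedged usage sketch would look like the following (the path is illustrative, and the OAP data source must be on the classpath).

// Hypothetical read through the registered short name.
val df = spark.read
  .format("arrow")
  .load("/data/events_arrow")
df.printSchema()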
Example 22
Source File: ArrowTable.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import org.apache.hadoop.fs.FileStatus

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

case class ArrowTable(
    name: String,
    sparkSession: SparkSession,
    options: CaseInsensitiveStringMap,
    paths: Seq[String],
    userSpecifiedSchema: Option[StructType],
    fallbackFileFormat: Class[_ <: FileFormat])
  extends FileTable(sparkSession, options, paths, userSpecifiedSchema) {

  override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
    ArrowUtils.readSchema(files, options)
  }

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
    ArrowScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
  }

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
    throw new UnsupportedOperationException // fixme implement later
  }

  override def formatName: String = "ARROW"
}
Example 23
Source File: OapIndexFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory}
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.types.StructType

private[index] class OapIndexFileFormat extends FileFormat with Logging with Serializable {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = None

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    val configuration = ContextUtil.getConfiguration(job)

    configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json)
    configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType"))
    configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName"))
    configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime"))
    configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend"))

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String =
        OapFileFormat.OAP_INDEX_EXTENSION

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) =
        new OapIndexOutputWriter(path, context)
    }
  }
}

private[index] object OapIndexFileFormat {
  val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes"
  val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type"
  val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name"
  val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time"
  val IS_APPEND: String = "org.apache.spark.sql.oap.index.append"
}

case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String)
Example 24
Source File: Checksum.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations

import java.security.MessageDigest

import org.apache.hadoop.fs.{FileStatus, Path}
import org.schedoscope.Schedoscope
import org.schedoscope.scheduler.driver.FilesystemDriver._

import scala.Array.canBuildFrom
import scala.collection.mutable.HashMap

object Checksum {

  private def md5 = MessageDigest.getInstance("MD5")

  private def listFiles(path: String): Array[FileStatus] = {
    val files = fileSystem(path, Schedoscope.settings.hadoopConf).globStatus(new Path(path))
    if (files != null)
      files
    else
      Array()
  }

  private def fileChecksum(path: String) =
    if (path == null)
      "null-checksum"
    else if (path.endsWith(".jar"))
      path
    else
      try {
        val cs = fileSystem(path, Schedoscope.settings.hadoopConf).getFileChecksum(new Path(path))
        if (cs == null)
          path
        else
          cs.toString()
      } catch {
        case _: Throwable => path
      }

  def fileChecksums(paths: List[String], recursive: Boolean): List[String] =
    paths.flatMap(path => {
      if (fileSystem(path, Schedoscope.settings.hadoopConf).isFile(new Path(path)))
        List(fileChecksum(path))
      else if (recursive)
        fileChecksums(listFiles(path + "/*").map(f => f.getPath.toString()).toList, recursive)
      else
        List()
    }).sorted

  val resourceHashCache = new HashMap[List[String], List[String]]()

  def resourceHashes(resources: List[String]): List[String] = synchronized {
    resourceHashCache.getOrElseUpdate(resources, fileChecksums(resources, true))
  }

  val defaultDigest = "0"

  def digest(stringsToDigest: String*): String =
    if (stringsToDigest.isEmpty)
      defaultDigest
    else
      md5.digest(stringsToDigest.sorted.mkString.toCharArray().map(_.toByte)).map("%02X" format _).mkString

  object SchemaChecksum {
    val checksumProperty = "schema.checksum"
  }

  object TransformationChecksum {
    val checksumProperty = "transformation.checksum"
    val timestampProperty = "transformation.timestamp"
  }
}
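A hedged usage sketch (paths are illustrative): hashing a transformation's external resources and folding them into a single digest, one plausible way the checksum properties above get populated.

// Hypothetical: compute per-file hashes, then combine them into one digest string.
val resources = List("/apps/schedoscope/udfs.jar", "/apps/schedoscope/scripts/clean.hql")
val hashes = Checksum.resourceHashes(resources)
val checksum = Checksum.digest(hashes: _*)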
Example 25
Source File: UnsplittableSequenceFileInputFormat.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._

  override def listStatus(job: JobContext): util.List[FileStatus] =
    super
      .listStatus(job)
      .asScala
      .sortBy {
        _.getPath.getName match {
          case PartFileBasename(idx) ⇒
            idx
          case basename ⇒
            throw new IllegalArgumentException(s"Bad partition file: $basename")
        }
      }
      .asJava
}
Example 26
Source File: FileStreamSinkLog.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 27
Source File: MetadataLogFileIndex.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType

class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userSpecifiedSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userSpecifiedSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
}
Example 28
Source File: FileStreamSourceSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, LongOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 29
Source File: FileStreamSinkLog.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val compactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(compactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $compactInterval) " +
      "to a positive value.")

  protected override def serializeData(data: SinkFileStatus): String = {
    write(data)
  }

  protected override def deserializeData(encodedString: String): SinkFileStatus = {
    read[SinkFileStatus](encodedString)
  }

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}