org.apache.hadoop.fs.FileStatus Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileStatus.
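Before the examples, here is a minimal sketch of the most common way a FileStatus is obtained and inspected — by listing a path through a Hadoop FileSystem. The directory path is illustrative and not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object FileStatusBasics {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val dir = new Path("/tmp")                         // illustrative path
    val fs: FileSystem = dir.getFileSystem(conf)

    // listStatus returns one FileStatus per child of the directory
    val children: Array[FileStatus] = fs.listStatus(dir)
    children.foreach { status =>
      // the accessors used throughout the examples below
      println(s"${status.getPath} len=${status.getLen} " +
        s"dir=${status.isDirectory} mtime=${status.getModificationTime}")
    }
  }
}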
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
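The fragment above only computes the codec extension. As a hedged sketch (not part of the original file), the returned value is typically appended when building a task's output file name; the naming pattern below is an assumption for illustration only.

// Hypothetical helper: combine the codec extension with a part-file name.
def workFileName(context: TaskAttemptContext, split: Int): String = {
  val extension = getCompressionExtension(context) // e.g. ".gz" when GzipCodec is configured, "" otherwise
  f"part-$split%05d.txt$extension"
}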
Example 2
Source File: SerializableFileStatus.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util

import java.util.Objects

import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}

case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    modificationTime: Long) {

  // Important note! This is very expensive to compute, but we don't want to cache it
  // as a `val` because Paths internally contain URIs and therefore consume lots of memory.
  def getPath: Path = new Path(path)
  def getLen: Long = length
  def getModificationTime: Long = modificationTime
  def isDirectory: Boolean = isDir

  def toFileStatus: FileStatus = {
    new LocatedFileStatus(
      new FileStatus(length, isDir, 0, 0, modificationTime, new Path(path)),
      null)
  }

  override def equals(obj: Any): Boolean = obj match {
    case other: SerializableFileStatus =>
      // We only compare the paths to stay consistent with FileStatus.equals.
      Objects.equals(path, other.path)
    case _ => false
  }

  override def hashCode(): Int = {
    // We only use the path to stay consistent with FileStatus.hashCode.
    Objects.hashCode(path)
  }
}

object SerializableFileStatus {
  def fromStatus(status: FileStatus): SerializableFileStatus = {
    SerializableFileStatus(
      Option(status.getPath).map(_.toString).orNull,
      status.getLen,
      status.isDirectory,
      status.getModificationTime)
  }

  val EMPTY: SerializableFileStatus = fromStatus(new FileStatus())
}
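A brief usage sketch (not from the Delta sources): the intended round trip between Hadoop's FileStatus and this serializable wrapper, for example when shipping a driver-side listing to executors. The directory path is illustrative.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Hypothetical listing converted for serialization and back.
val dir = new Path("/data/table")
val fs = dir.getFileSystem(new Configuration())
val serializable = fs.listStatus(dir).map(SerializableFileStatus.fromStatus)
val restored = serializable.map(_.toFileStatus)    // FileStatus instances again, e.g. on an executor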
Example 3
Source File: BgenSchemaInferrer.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.bgen

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

import io.projectglow.common.{BgenOptions, CommonOptions, VariantSchemas, WithUtils}
import io.projectglow.sql.util.SerializableConfiguration

object BgenSchemaInferrer {
  def inferSchema(
      spark: SparkSession,
      files: Seq[FileStatus],
      options: Map[String, String]): StructType = {
    val shouldIncludeSampleIds = options.get(CommonOptions.INCLUDE_SAMPLE_IDS).forall(_.toBoolean)
    if (!shouldIncludeSampleIds) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = false)
    }

    val sampleIdsFromSampleFile =
      BgenFileFormat.getSampleIds(options, spark.sparkContext.hadoopConfiguration)
    if (sampleIdsFromSampleFile.isDefined) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = true)
    }

    val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
    val ignoreExtension = options.get(BgenOptions.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
    val bgenPaths = files.filter { fs =>
      fs.getLen > 0 && (fs
        .getPath
        .toString
        .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
    }.map(_.getPath.toString)

    val hasSampleIds = spark
      .sparkContext
      .parallelize(bgenPaths)
      .map { path =>
        val hPath = new Path(path)
        val hadoopFs = hPath.getFileSystem(serializableConf.value)
        WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
          val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
          new BgenHeaderReader(littleEndianDataInputStream)
            .readHeader(None)
            .sampleIds
            .exists(_.nonEmpty)
        }
      }
      .collect()
      .exists(identity)

    VariantSchemas.bgenDefaultSchema(hasSampleIds)
  }
}
Example 4
Source File: XyzRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.xyz

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.sources.HadoopFsRelation
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.FileStatus

object XyzRelation {
  val xyzrgbSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false),
    StructField("r", ByteType, false),
    StructField("g", ByteType, false),
    StructField("b", ByteType, false)
  ))

  val xyzSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false)
  ))
}

class XyzRelation(
  override val paths: Array[String],
  maybeDataSchema: Option[StructType],
  override val userDefinedPartitionColumns: Option[StructType],
  parameters: Map[String, String]
)(@transient val sqlContext: SQLContext)
    extends HadoopFsRelation {

  override lazy val dataSchema = maybeDataSchema.getOrElse(XyzRelation.xyzrgbSchema)

  override def prepareJobForWrite(job: org.apache.hadoop.mapreduce.Job): org.apache.spark.sql.sources.OutputWriterFactory = ???

  override def buildScan(inputs: Array[FileStatus]): RDD[Row] = {
    val lines = sqlContext.sparkContext.textFile(inputs.map(_.getPath).mkString("", ",", ""))
    val dataTypes = dataSchema.fields.map(_.dataType)
    lines map (line =>
      Row.fromSeq((line.split("\t") zip dataTypes).map {
        case (x, StringType) => x
        case (x, ByteType) => x.toByte
        case (x, ShortType) => x.toShort
        case (x, IntegerType) => x.toInt
        case (x, LongType) => x.toLong
        case (x, FloatType) => x.toFloat
        case (x, DoubleType) => x.toDouble
        case _ => null
      }.padTo(dataTypes.size, null)))
  }
}
Example 5
Source File: S3ALineCountWritebackSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3

import scala.concurrent.duration._
import scala.language.postfixOps

import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource
import org.apache.hadoop.fs.{FileStatus, Path}

class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override def enabled: Boolean = super.enabled && hasCSVTestFile

  override def cleanFSInTeardownEnabled: Boolean = true

  after {
    cleanFilesystemInTeardown()
  }

  ctest("LineCountWriteback",
    "Execute the LineCount example with the results written back to the test filesystem.") {
    val sourceFile = getTestCSVPath()
    val sourceFS = sourceFile.getFileSystem(getConf)
    val sourceInfo = sourceFS.getFileStatus(sourceFile)
    val sparkConf = newSparkConf()
    sparkConf.setAppName("LineCount")
    val destDir = testPath(filesystem, "LineCountWriteback")
    assert(0 === S3ALineCount.action(sparkConf, Array(sourceFile.toString, destDir.toString)))

    val status = filesystem.getFileStatus(destDir)
    assert(status.isDirectory, s"Not a directory: $status")

    // only a small fraction of the source data is needed
    val expectedLen = sourceInfo.getLen / 1024

    def validateChildSize(qualifier: String, files: Seq[FileStatus]) = {
      val (filenames, size) = enumFileSize(destDir, files)
      logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames")
      assert(size >= expectedLen,
        s"$qualifier size $size in files $filenames" +
          s" smaller than expected length $expectedLen")
    }

    val stdInterval = interval(100 milliseconds)
    val appId = eventually(timeout(20 seconds), stdInterval) {
      validateChildSize("descendants",
        listFiles(filesystem, destDir, true)
          .filter(f => f.getPath.getName != "_SUCCESS"))

      validateChildSize("children",
        filesystem.listStatus(destDir, pathFilter(p => p.getName != "_SUCCESS")).toSeq)
    }
  }

  private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long) = {
    assert(files.nonEmpty, s"No files in destination directory $destDir")
    var size = 0L
    val filenames = new StringBuffer()
    files.foreach { f =>
      size += f.getLen
      filenames.append(" ").append(f.getPath)
    }
    (filenames.toString, size)
  }
}
Example 6
Source File: SerializableFileStatus.scala From parquet-index with Apache License 2.0 | 5 votes |
package com.github.lightcopy.util

import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}

object SerializableFileStatus {
  def fromFileStatus(status: FileStatus): SerializableFileStatus = {
    val blockLocations = status match {
      case f: LocatedFileStatus =>
        f.getBlockLocations.map { loc =>
          SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
        }
      case _ =>
        Array.empty[SerializableBlockLocation]
    }

    SerializableFileStatus(
      status.getPath.toString,
      status.getLen,
      status.isDirectory,
      status.getReplication,
      status.getBlockSize,
      status.getModificationTime,
      status.getAccessTime,
      blockLocations)
  }

  def toFileStatus(status: SerializableFileStatus): FileStatus = {
    val blockLocations = status.blockLocations.map { loc =>
      new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
    }
    new LocatedFileStatus(
      new FileStatus(
        status.length,
        status.isDir,
        status.blockReplication,
        status.blockSize,
        status.modificationTime,
        new Path(status.path)),
      blockLocations)
  }
}
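A short usage sketch (not part of the parquet-index sources): unlike the Delta variant above, this wrapper also carries block locations, so listing with listLocatedStatus preserves them across the round trip. The path is illustrative.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Hypothetical round trip; listLocatedStatus yields LocatedFileStatus, so block locations survive.
val dir = new Path("/data/table")
val fs = dir.getFileSystem(new Configuration())
val it = fs.listLocatedStatus(dir)
while (it.hasNext) {
  val serializable = SerializableFileStatus.fromFileStatus(it.next())
  val restored = SerializableFileStatus.toFileStatus(serializable)
}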
Example 7
Source File: SageMakerProtobufFileFormat.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]): Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
}
Example 8
Source File: SparkHadoopUtilSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy

import java.security.PrivilegedExceptionAction

import scala.util.Random

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}
import org.apache.hadoop.security.UserGroupInformation
import org.scalatest.Matchers

import org.apache.spark.SparkFunSuite

class SparkHadoopUtilSuite extends SparkFunSuite with Matchers {
  test("check file permission") {
    import FsAction._
    val testUser = s"user-${Random.nextInt(100)}"
    val testGroups = Array(s"group-${Random.nextInt(100)}")
    val testUgi = UserGroupInformation.createUserForTesting(testUser, testGroups)

    testUgi.doAs(new PrivilegedExceptionAction[Void] {
      override def run(): Void = {
        val sparkHadoopUtil = new SparkHadoopUtil

        // If file is owned by user and user has access permission
        var status = fileStatus(testUser, testGroups.head, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user but user has no access permission
        status = fileStatus(testUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        val otherUser = s"test-${Random.nextInt(100)}"
        val otherGroup = s"test-${Random.nextInt(100)}"

        // If file is owned by user's group and user's group has access permission
        status = fileStatus(otherUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user's group but user's group has no access permission
        status = fileStatus(otherUser, testGroups.head, READ_WRITE, NONE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        // If file is owned by other user and this user has access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, READ_WRITE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by other user but this user has no access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        null
      }
    })
  }

  private def fileStatus(
      owner: String,
      group: String,
      userAction: FsAction,
      groupAction: FsAction,
      otherAction: FsAction): FileStatus = {
    new FileStatus(0L, false, 0, 0L, 0L, 0L,
      new FsPermission(userAction, groupAction, otherAction),
      owner, group, null)
  }
}
Example 9
Source File: FileStreamSinkLog.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 10
Source File: MetadataLogFileIndex.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType

class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userPartitionSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userPartitionSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.toArray.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
}
Example 11
Source File: KyuubiDistributedCacheManagerSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility}
import org.apache.hadoop.yarn.util.ConverterUtils
import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite}
import org.mockito.Mockito.when
import org.scalatest.mock.MockitoSugar

import yaooqinn.kyuubi.utils.ReflectUtils

class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {

  class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
    override def getVisibility(conf: Configuration, uri: URI,
        statCache: Map[URI, FileStatus]): LocalResourceVisibility = {
      LocalResourceVisibility.PRIVATE
    }
  }

  test("add resource") {
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val status = new FileStatus()
    when(fs.getFileStatus(destPath)).thenReturn(status)
    val fileLink = "link"
    ReflectUtils.setFieldValue(
      KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager)
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache)
    val res = localResources(fileLink)
    assert(res.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath)
    assert(res.getSize === 0)
    assert(res.getTimestamp === 0)
    assert(res.getType === LocalResourceType.FILE)

    val status2 = new FileStatus(
      10, false, 1, 1024, 10, 10, null, KyuubiSparkUtil.getCurrentUserName, null,
      new Path("/tmp/testing2"))
    val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2")
    when(fs.getFileStatus(destPath2)).thenReturn(status2)
    val fileLink2 = "link2"
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache)
    val res2 = localResources(fileLink2)
    assert(res2.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2)
    assert(res2.getSize === 10)
    assert(res2.getTimestamp === 10)
    assert(res2.getType === LocalResourceType.FILE)
  }

  test("add resource when link null") {
    val distMgr = new MockClientDistributedCacheManager()
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
    intercept[Exception] {
      KyuubiDistributedCacheManager.addResource(
        fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache)
    }
    assert(localResources.get("link") === None)
    assert(localResources.size === 0)
  }

  test("test addResource archive") {
    val distMgr = new MockClientDistributedCacheManager()
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", null,
      new Path("/tmp/testing"))
    when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache)
    val resource = localResources("link")
    assert(resource.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath)
    assert(resource.getTimestamp === 10)
    assert(resource.getSize === 10)
    assert(resource.getType === LocalResourceType.ARCHIVE)
  }
}
Example 12
Source File: KyuubiDistributedCacheManager.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType}

  def addResource(
      fs: FileSystem,
      conf: Configuration,
      destPath: Path,
      localResources: HashMap[String, LocalResource],
      resourceType: LocalResourceType,
      link: String,
      statCache: Map[URI, FileStatus]): Unit = {
    cacheManager.addResource(fs, conf, destPath, localResources,
      resourceType, link, statCache, appMasterOnly = true)
  }
}
Example 13
Source File: FileStreamSourceSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 14
Source File: FileStreamSinkLog.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 15
Source File: BinaryFileReader.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark

import com.microsoft.ml.spark.core.env.StreamUtilities
import com.microsoft.ml.spark.core.schema.BinaryFileSchema
import com.microsoft.ml.spark.core.utils.AsyncUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.spark.binary.BinaryFileFormat
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.binary.ConfUtils
import org.apache.spark.sql.types.BinaryType

import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.Duration

object BinaryFileReader {

  private def recursePath(fileSystem: FileSystem,
                          path: Path,
                          pathFilter: FileStatus => Boolean,
                          visitedSymlinks: Set[Path]): Array[Path] = {
    val filteredPaths = fileSystem.listStatus(path).filter(pathFilter)
    val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath))
    val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath)
    filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath)
      .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound))
  }

  def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] = {
    recursePath(fileSystem, path, pathFilter, Set())
  }

  def readFromPaths(df: DataFrame,
                    pathCol: String,
                    bytesCol: String,
                    concurrency: Int,
                    timeout: Int
                   ): DataFrame = {
    val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true)
    val encoder = RowEncoder(outputSchema)
    val hconf = ConfUtils.getHConf(df)

    df.mapPartitions { rows =>
      val futures = rows.map { row: Row =>
        Future {
          val path = new Path(row.getAs[String](pathCol))
          val fs = path.getFileSystem(hconf.value)
          val bytes = StreamUtilities.using(fs.open(path)) { is => IOUtils.toByteArray(is) }.get
          val ret = Row.merge(Seq(row, Row(bytes)): _*)
          ret
        }(ExecutionContext.global)
      }
      AsyncUtils.bufferedAwait(
        futures, concurrency, Duration.fromNanos(timeout*(20^6).toLong))(ExecutionContext.global)
    }(encoder)
  }
}
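A hedged usage sketch for readFromPaths (the column names and timeout value are illustrative, not from the original file): given a DataFrame with a string column of file paths, it appends a binary column containing each file's bytes.

// Hypothetical call: pathsDf has a string column "path" listing the files to read.
val withBytes = BinaryFileReader.readFromPaths(
  pathsDf,
  pathCol = "path",
  bytesCol = "bytes",
  concurrency = 8,
  timeout = 60000)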
Example 16
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage

import java.io.{BufferedReader, FileNotFoundException, InputStreamReader}
import java.nio.charset.StandardCharsets.UTF_8
import java.nio.file.FileAlreadyExistsException
import java.util.UUID

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

  protected def writeWithRename(
      path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = {
    val fs = path.getFileSystem(getHadoopConfiguration)

    if (!fs.exists(path.getParent)) {
      throw new FileNotFoundException(s"No such file or directory: ${path.getParent}")
    }
    if (overwrite) {
      val stream = fs.create(path, true)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
      } finally {
        stream.close()
      }
    } else {
      if (fs.exists(path)) {
        throw new FileAlreadyExistsException(path.toString)
      }
      val tempPath = createTempPath(path)
      var streamClosed = false // This flag is to avoid double close
      var renameDone = false // This flag is to save the delete operation in most of cases.
      val stream = fs.create(tempPath)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
        stream.close()
        streamClosed = true
        try {
          if (fs.rename(tempPath, path)) {
            renameDone = true
          } else {
            if (fs.exists(path)) {
              throw new FileAlreadyExistsException(path.toString)
            } else {
              throw new IllegalStateException(s"Cannot rename $tempPath to $path")
            }
          }
        } catch {
          case _: org.apache.hadoop.fs.FileAlreadyExistsException =>
            throw new FileAlreadyExistsException(path.toString)
        }
      } finally {
        if (!streamClosed) {
          stream.close()
        }
        if (!renameDone) {
          fs.delete(tempPath, false)
        }
      }
    }
  }

  protected def createTempPath(path: Path): Path = {
    new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp")
  }

  override def invalidateCache(): Unit = {}
}
Example 17
Source File: FileNames.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util

import org.apache.spark.sql.delta.DeltaErrors
import org.apache.hadoop.fs.{FileStatus, Path}

  def getFileVersion(path: Path): Long = {
    if (isCheckpointFile(path)) {
      checkpointVersion(path)
    } else if (isDeltaFile(path)) {
      deltaVersion(path)
    } else if (isChecksumFile(path)) {
      checksumVersion(path)
    } else {
      // scalastyle:off throwerror
      throw new AssertionError(
        s"Unexpected file type found in transaction log: $path")
      // scalastyle:on throwerror
    }
  }
}
Example 18
Source File: HdfsUtils.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.utils

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import scala.collection.mutable

object HdfsUtils {

  def renameFiles(fromBase: Path, toBase: Path, fs: FileSystem) = {
    if (fs.exists(fromBase)) {
      val filesToMove = listFiles(fromBase, fs)
      println("files to move:")
      filesToMove foreach (p => println("+++" + p.toString))
      filesToMove foreach { file =>
        val relPath = relativize(fromBase, file)
        val toPath = new Path(toBase, relPath)
        fs.mkdirs(toPath.getParent)
        fs.rename(file, toPath)
        println(" file renamed to: " + toPath.toString)
      }
    }
  }

  def relativize(base: Path, files: List[Path]) = {
    files map (file => new Path(base.toUri.relativize(file.toUri).getPath))
  }

  def relativize(base: Path, file: Path): Path = {
    new Path(base.toUri.relativize(file.toUri).getPath)
  }

  def listFiles(path: Path, fs: FileSystem): List[Path] = {
    val statusList = mutable.MutableList[FileStatus]()
    traverse(path, statusList, fs)
    statusList.map(status => new Path(status.getPath.toUri.getPath)).toList
  }

  private def traverse(path: Path, list: mutable.MutableList[FileStatus], fs: FileSystem): Unit = {
    fs.listStatus(path) foreach { status =>
      if (!status.isDirectory) {
        list += status
      } else {
        traverse(status.getPath, list, fs)
      }
    }
  }
}
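A short usage sketch (paths are illustrative): moving everything under a staging directory into a target directory while preserving the relative layout.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical staging-to-final move on the default FileSystem.
val fs = FileSystem.get(new Configuration())
HdfsUtils.renameFiles(new Path("/staging/run-42"), new Path("/final"), fs)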
Example 19
Source File: FileStreamSourceSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 20
Source File: FileStreamSinkLog.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 21
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {
      val sqlConf = sparkSession.sessionState.conf;
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish();

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)

      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
}
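Because the format registers the short name "arrow" through DataSourceRegister, a hedged usage sketch would look like the following (the path is illustrative, and the OAP data source must be on the classpath).

// Hypothetical read through the registered short name.
val df = spark.read
  .format("arrow")
  .load("/data/events_arrow")
df.printSchema()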
Example 22
Source File: ArrowTable.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import org.apache.hadoop.fs.FileStatus

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

case class ArrowTable(
    name: String,
    sparkSession: SparkSession,
    options: CaseInsensitiveStringMap,
    paths: Seq[String],
    userSpecifiedSchema: Option[StructType],
    fallbackFileFormat: Class[_ <: FileFormat])
  extends FileTable(sparkSession, options, paths, userSpecifiedSchema) {

  override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
    ArrowUtils.readSchema(files, options)
  }

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
    ArrowScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
  }

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
    throw new UnsupportedOperationException // fixme implement later
  }

  override def formatName: String = "ARROW"
}
Example 23
Source File: OapIndexFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory}
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.types.StructType

private[index] class OapIndexFileFormat extends FileFormat with Logging with Serializable {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = None

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    val configuration = ContextUtil.getConfiguration(job)

    configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json)
    configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType"))
    configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName"))
    configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime"))
    configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend"))

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String =
        OapFileFormat.OAP_INDEX_EXTENSION

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) =
        new OapIndexOutputWriter(path, context)
    }
  }
}

private[index] object OapIndexFileFormat {
  val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes"
  val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type"
  val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name"
  val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time"
  val IS_APPEND: String = "org.apache.spark.sql.oap.index.append"
}

case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String)
Example 24
Source File: Checksum.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations

import java.security.MessageDigest

import org.apache.hadoop.fs.{FileStatus, Path}
import org.schedoscope.Schedoscope
import org.schedoscope.scheduler.driver.FilesystemDriver._

import scala.Array.canBuildFrom
import scala.collection.mutable.HashMap

object Checksum {

  private def md5 = MessageDigest.getInstance("MD5")

  private def listFiles(path: String): Array[FileStatus] = {
    val files = fileSystem(path, Schedoscope.settings.hadoopConf).globStatus(new Path(path))
    if (files != null)
      files
    else
      Array()
  }

  private def fileChecksum(path: String) =
    if (path == null)
      "null-checksum"
    else if (path.endsWith(".jar"))
      path
    else
      try {
        val cs = fileSystem(path, Schedoscope.settings.hadoopConf).getFileChecksum(new Path(path))
        if (cs == null)
          path
        else
          cs.toString()
      } catch {
        case _: Throwable => path
      }

  def fileChecksums(paths: List[String], recursive: Boolean): List[String] =
    paths.flatMap(path => {
      if (fileSystem(path, Schedoscope.settings.hadoopConf).isFile(new Path(path)))
        List(fileChecksum(path))
      else if (recursive)
        fileChecksums(listFiles(path + "/*").map(f => f.getPath.toString()).toList, recursive)
      else
        List()
    }).sorted

  val resourceHashCache = new HashMap[List[String], List[String]]()

  def resourceHashes(resources: List[String]): List[String] = synchronized {
    resourceHashCache.getOrElseUpdate(resources, fileChecksums(resources, true))
  }

  val defaultDigest = "0"

  def digest(stringsToDigest: String*): String =
    if (stringsToDigest.isEmpty)
      defaultDigest
    else
      md5.digest(stringsToDigest.sorted.mkString.toCharArray().map(_.toByte)).map("%02X" format _).mkString

  object SchemaChecksum {
    val checksumProperty = "schema.checksum"
  }

  object TransformationChecksum {
    val checksumProperty = "transformation.checksum"
    val timestampProperty = "transformation.timestamp"
  }
}
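A hedged usage sketch (paths are illustrative): hashing a transformation's external resources and folding them into a single digest, one plausible way the checksum properties above get populated.

// Hypothetical: compute per-file hashes, then combine them into one digest string.
val resources = List("/apps/schedoscope/udfs.jar", "/apps/schedoscope/scripts/clean.hql")
val hashes = Checksum.resourceHashes(resources)
val checksum = Checksum.digest(hashes: _*)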
Example 25
Source File: UnsplittableSequenceFileInputFormat.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._

  override def listStatus(job: JobContext): util.List[FileStatus] =
    super
      .listStatus(job)
      .asScala
      .sortBy {
        _.getPath.getName match {
          case PartFileBasename(idx) ⇒
            idx
          case basename ⇒
            throw new IllegalArgumentException(s"Bad partition file: $basename")
        }
      }
      .asJava
}
Example 26
Source File: FileStreamSinkLog.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 27
Source File: MetadataLogFileIndex.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType

class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userSpecifiedSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userSpecifiedSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
}
Example 28
Source File: FileStreamSourceSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Overriding an existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, purge time should be 10 and then b would be an old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log makes the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, LongOffset(1))
    }
  }
}

  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 29
Source File: FileStreamSinkLog.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val compactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(compactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $compactInterval) " +
      "to a positive value.")

  protected override def serializeData(data: SinkFileStatus): String = {
    write(data)
  }

  protected override def deserializeData(encodedString: String): SinkFileStatus = {
    read[SinkFileStatus](encodedString)
  }

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}