org.apache.hadoop.fs.LocatedFileStatus Scala Examples
The following examples show how to use org.apache.hadoop.fs.LocatedFileStatus.
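A LocatedFileStatus is most commonly obtained from FileSystem.listFiles, which returns a RemoteIterator of statuses that already carry block locations. A minimal sketch (the directory below is hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ListFilesExample extends App {
  val fs = FileSystem.get(new Configuration())
  // listFiles returns RemoteIterator[LocatedFileStatus]; the directory is hypothetical
  val it = fs.listFiles(new Path("/data"), true)
  while (it.hasNext) {
    val status = it.next()
    println(s"${status.getPath} ${status.getLen} bytes, blocks=${status.getBlockLocations.length}")
  }
}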
Example 1
Source File: HdfsOps.scala, from eel-sdk (Apache License 2.0)
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.{HdfsIterator, PathIterator}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

object HdfsOps extends Logging {

  def makePathVisible(path: Path)(implicit fs: FileSystem): Unit = {
    if (path.getName.startsWith(".")) {
      logger.info(s"Making $path visible by stripping leading .")
      val dest = new Path(path.getParent, path.getName.drop(1))
      fs.rename(path, dest)
    }
  }

  def findFiles(path: Path, recursive: Boolean, fs: FileSystem): Iterator[LocatedFileStatus] = {
    HdfsIterator.remote(fs.listFiles(path, recursive))
  }

  def mkdirsp(path: Path, fs: FileSystem): Boolean = PathIterator(path).forall(fs.mkdirs)
}
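A minimal usage sketch for the helpers above, assuming a FileSystem built from the default Hadoop configuration; the input directory is hypothetical:

import io.eels.HdfsOps
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsOpsExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  val dir = new Path("/data/landing") // hypothetical directory
  if (HdfsOps.mkdirsp(dir, fs)) {
    // findFiles wraps fs.listFiles in a plain Scala Iterator[LocatedFileStatus]
    HdfsOps.findFiles(dir, recursive = true, fs).foreach { status =>
      println(s"${status.getPath.getName}: ${status.getLen} bytes")
    }
  }
}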
Example 2
Source File: HivePartitionScanner.scala, from eel-sdk (Apache License 2.0)
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.schema.PartitionConstraint
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus}

// scans partitions for files, returning the files and the meta data object for each partition
class HivePartitionScanner(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val missingPartitionAction: String = config.getString("eel.hive.source.missingPartitionAction")

  def scan(partitions: Seq[PartitionMetaData],
           constraints: Seq[PartitionConstraint] = Nil): Map[PartitionMetaData, Seq[LocatedFileStatus]] = {
    logger.debug(s"Scanning ${partitions.size} partitions for applicable files ${partitions.map(_.location).mkString(", ").take(100)}")

    // first we filter out any partitions not matching the constraints
    val filteredPartitions = partitions.filter { meta =>
      constraints.forall(_.eval(meta.partition))
    }
    logger.debug(s"Filtered partitions: ${filteredPartitions.map(_.location).mkString(", ")})")

    // next, we check that the directories that the partitions point to actually exist
    // this will avoid a situation where a location exists in the metastore but not on disk
    val exantPartitions = filteredPartitions.filter { partition =>
      if (fs.exists(partition.location)) {
        true
      } else {
        if (missingPartitionAction == "error") {
          throw new IllegalStateException(
            s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these exceptions set eel.hive.source.missingPartitionAction=warn or eel.hive.source.missingPartitionAction=none")
        } else if (missingPartitionAction == "warn") {
          logger.warn(
            s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these warnings set eel.hive.source.missingPartitionAction=none")
          false
        } else {
          false
        }
      }
    }

    // next we grab all the data files from each of these partitions
    exantPartitions.map { meta =>
      meta -> HiveFileScanner(meta.location, false)
    }.toMap
  }
}
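A sketch of how the scanner might be wired up, assuming a metastore client and the HiveOps helper used in the other examples; the database and table names are hypothetical, and eel.hive.source.missingPartitionAction is expected in application.conf:

import io.eels.component.hive.{HiveOps, HivePartitionScanner}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient

object PartitionScanExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  val client = new HiveMetaStoreClient(new HiveConf())

  // partition metadata comes from the metastore, as in HiveTableFilesFn below; names are hypothetical
  val partitions = new HiveOps(client).partitionsMetaData("mydb", "mytable")

  val filesByPartition = new HivePartitionScanner().scan(partitions)
  filesByPartition.foreach { case (meta, files) =>
    println(s"${meta.location}: ${files.size} data files")
  }
}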
Example 3
Source File: HiveFileScanner.scala, from eel-sdk (Apache License 2.0)
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.typesafe.config.ConfigFactory
import io.eels.util.HdfsIterator
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

// given a hadoop path, will look for files inside that path that match the
// configured settings for hidden files
// does not return directories
object HiveFileScanner extends Logging {

  private val config = ConfigFactory.load()
  private val ignoreHiddenFiles = config.getBoolean("eel.hive.source.ignoreHiddenFiles")
  private val hiddenFilePattern = config.getString("eel.hive.source.hiddenFilePattern")

  // returns true if the given file should be skipped based on the config settings
  private def skip(file: LocatedFileStatus): Boolean = {
    file.getLen == 0L || ignoreHiddenFiles && file.getPath.getName.matches(hiddenFilePattern)
  }

  def apply(path: Path, recursive: Boolean)(implicit fs: FileSystem): Seq[LocatedFileStatus] = {
    logger.debug(s"Scanning $path, filtering=$ignoreHiddenFiles, pattern=$hiddenFilePattern")

    val files: List[LocatedFileStatus] = if (fs.exists(path)) {
      val files = fs.listFiles(path, recursive)
      HdfsIterator.remote(files)
        .filter(_.isFile)
        .filterNot(skip)
        .toList
    } else {
      Nil
    }

    logger.debug(s"Scanner found ${files.size} files")
    files
  }
}
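A usage sketch, assuming the eel.hive.source.* keys are present in application.conf; the warehouse directory is hypothetical:

import io.eels.component.hive.HiveFileScanner
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HiveFileScannerExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  // hypothetical table location; hidden and zero-length files are filtered out per the config
  val files = HiveFileScanner(new Path("/user/hive/warehouse/mydb.db/mytable"), recursive = false)
  files.foreach(f => println(s"${f.getPath.getName} -> ${f.getLen} bytes"))
}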
Example 4
Source File: HiveTableFilesFn.scala, from eel-sdk (Apache License 2.0)
package io.eels.component.hive

import com.sksamuel.exts.Logging
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.schema.{Partition, PartitionConstraint}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient

object HiveTableFilesFn extends Logging {

  def apply(dbName: String,
            tableName: String,
            tableLocation: Path,
            partitionConstraints: Seq[PartitionConstraint])
           (implicit fs: FileSystem, client: IMetaStoreClient): Map[Partition, Seq[LocatedFileStatus]] = {

    val ops = new HiveOps(client)

    // when we have no partitions, this will scan just the table folder directly for files
    def rootScan(): Map[Partition, Seq[LocatedFileStatus]] = {
      Map(Partition.empty -> HiveFileScanner(tableLocation, false))
    }

    def partitionsScan(partitions: Seq[PartitionMetaData]): Map[Partition, Seq[LocatedFileStatus]] = {
      new HivePartitionScanner().scan(partitions, partitionConstraints)
        .map { case (key, value) => key.partition -> value }
    }

    // the table may or may not have partitions.
    //
    // 1. If we do have partitions then we need to scan the path of each partition
    // (and each partition may be located anywhere outside of the table root)
    //
    // 2. If we do not have partitions then we can simply scan the table root.

    // we go to the metastore as we need the locations of the partitions not the values
    val partitions = ops.partitionsMetaData(dbName, tableName)

    if (partitions.isEmpty && partitionConstraints.nonEmpty) {
      sys.error("Constraints were used on a table that was not partitioned")
    } else if (partitions.isEmpty) {
      logger.debug(s"No partitions for $tableName; performing root table scan")
      rootScan
    } else partitionsScan(partitions)
  }
}
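A sketch of resolving the data files for a table, with or without partition constraints; the database, table and location are hypothetical, and the metastore client is assumed to be reachable through HiveConf:

import io.eels.component.hive.HiveTableFilesFn
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object HiveTableFilesExample extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())

  // hypothetical table; pass constraints only when the table is actually partitioned
  val tableLocation = new Path("/user/hive/warehouse/mydb.db/mytable")
  val filesByPartition = HiveTableFilesFn("mydb", "mytable", tableLocation, Nil)

  filesByPartition.foreach { case (partition, files) =>
    println(s"$partition -> ${files.map(_.getLen).sum} bytes in ${files.size} files")
  }
}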
Example 5
Source File: HiveFilePublisher.scala, from eel-sdk (Apache License 2.0)
package io.eels.component.hive

import com.sksamuel.exts.io.Using
import io.eels.datastream.{Subscription, Publisher, Subscriber}
import io.eels.schema.{Partition, StructType}
import io.eels.{Predicate, _}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus}

class HiveFilePublisher(dialect: HiveDialect,
                        file: LocatedFileStatus,
                        metastoreSchema: StructType,
                        projectionSchema: StructType,
                        predicate: Option[Predicate],
                        partition: Partition)
                       (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using {

  require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, s"Use only lower case field names with hive")

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {

    val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap

    // the schema we send to the dialect must have any partition fields removed, because those
    // fields won't exist in the data files. This is because partitions are not always written
    // and instead inferred from the partition itself.
    val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name))
    val projectionWithoutPartitions = StructType(projectionFields)

    // since we removed the partition fields from the target schema, we must repopulate them after the read
    // we also need to throw away the dummy field if we had an empty schema
    val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate)
    publisher.subscribe(new Subscriber[Seq[Row]] {
      override def subscribed(s: Subscription): Unit = subscriber.subscribed(s)
      override def next(chunk: Seq[Row]): Unit = {
        val aligned = chunk.map { row =>
          if (projectionFields.isEmpty) {
            val values = projectionSchema.fieldNames().map(partitionMap.apply)
            Row(projectionSchema, values.toVector)
          } else {
            RowUtils.rowAlign(row, projectionSchema, partitionMap)
          }
        }
        subscriber.next(aligned)
      }
      override def completed(): Unit = subscriber.completed()
      override def error(t: Throwable): Unit = subscriber.error(t)
    })
  }
}
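The publisher pushes chunks of rows to a Subscriber. A minimal collecting subscriber might look like the sketch below; constructing the HiveFilePublisher itself still requires a HiveDialect, the schemas, an optional predicate and a partition from the surrounding Hive source, so the publisher is taken as a parameter here:

import io.eels.Row
import io.eels.datastream.{Publisher, Subscriber, Subscription}
import scala.collection.mutable.ListBuffer

object CollectRows {
  // drains any Publisher[Seq[Row]], for example a fully constructed HiveFilePublisher
  def apply(publisher: Publisher[Seq[Row]]): Seq[Row] = {
    val collected = ListBuffer.empty[Row]
    publisher.subscribe(new Subscriber[Seq[Row]] {
      override def subscribed(s: Subscription): Unit = () // no cancellation handling in this sketch
      override def next(chunk: Seq[Row]): Unit = collected ++= chunk
      override def completed(): Unit = ()
      override def error(t: Throwable): Unit = throw t
    })
    collected.toList
  }
}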
Example 6
Source File: ShapeInputFormat.scala, from magellan (Apache License 2.0)
package magellan.mapreduce

import com.google.common.base.Stopwatch
import magellan.io.{ShapeKey, ShapeWritable}
import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.{LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.lib.input._
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

private[magellan] class ShapeInputFormat extends FileInputFormat[ShapeKey, ShapeWritable] {

  private val log = LogFactory.getLog(classOf[ShapeInputFormat])

  override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = {
    new ShapefileReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = true

  override def getSplits(job: JobContext): java.util.List[InputSplit] = {
    val splitInfos = SplitInfos.SPLIT_INFO_MAP.get()
    computeSplits(job, splitInfos)
  }

  private def computeSplits(
      job: JobContext,
      splitInfos: scala.collection.Map[String, Array[Long]]) = {

    val sw = new Stopwatch().start
    val splits = ListBuffer[InputSplit]()
    val files = listStatus(job)
    for (file <- files) {
      val path = file.getPath
      val length = file.getLen
      val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) {
        file.asInstanceOf[LocatedFileStatus].getBlockLocations
      } else {
        val fs = path.getFileSystem(job.getConfiguration)
        fs.getFileBlockLocations(file, 0, length)
      }
      val key = path.getName.split("\\.shp$")(0)
      if (splitInfos == null || !splitInfos.containsKey(key)) {
        val blkIndex = getBlockIndex(blkLocations, 0)
        splits += makeSplit(path, 0, length, blkLocations(blkIndex).getHosts,
          blkLocations(blkIndex).getCachedHosts)
      } else {
        val s = splitInfos(key).toSeq
        val start = s
        val end = s.drop(1) ++ Seq(length)
        start.zip(end).foreach { case (startOffset: Long, endOffset: Long) =>
          val blkIndex = getBlockIndex(blkLocations, startOffset)
          splits += makeSplit(path, startOffset, endOffset - startOffset,
            blkLocations(blkIndex).getHosts, blkLocations(blkIndex).getCachedHosts)
        }
      }
    }
    sw.stop
    if (log.isDebugEnabled) {
      log.debug("Total # of splits generated by getSplits: " + splits.size +
        ", TimeTaken: " + sw.elapsedMillis)
    }
    splits
  }
}

object SplitInfos {
  // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader?
  val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]]
}
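A sketch of wiring the input format into a MapReduce job; it lives in the magellan.mapreduce package because ShapeInputFormat is package-private. The input directory and per-file offsets are hypothetical, and the split map is handed to getSplits through the SplitInfos thread-local:

package magellan.mapreduce

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object ShapeJobExample extends App {
  val job = Job.getInstance(new Configuration(), "shapefile-scan")
  job.setInputFormatClass(classOf[ShapeInputFormat])
  FileInputFormat.addInputPath(job, new Path("/data/shapefiles")) // hypothetical directory

  // optional: pre-computed record offsets per .shp file, keyed by file name without extension
  SplitInfos.SPLIT_INFO_MAP.set(Map("countries" -> Array(0L, 1048576L))) // hypothetical offsets
}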
Example 7
Source File: SerializableFileStatus.scala, from delta (Apache License 2.0)
package org.apache.spark.sql.delta.util

import java.util.Objects

import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}

case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    modificationTime: Long) {

  // Important note! This is very expensive to compute, but we don't want to cache it
  // as a `val` because Paths internally contain URIs and therefore consume lots of memory.
  def getPath: Path = new Path(path)
  def getLen: Long = length
  def getModificationTime: Long = modificationTime
  def isDirectory: Boolean = isDir

  def toFileStatus: FileStatus = {
    new LocatedFileStatus(
      new FileStatus(length, isDir, 0, 0, modificationTime, new Path(path)),
      null)
  }

  override def equals(obj: Any): Boolean = obj match {
    case other: SerializableFileStatus =>
      // We only compare the paths to stay consistent with FileStatus.equals.
      Objects.equals(path, other.path)
    case _ => false
  }

  override def hashCode(): Int = {
    // We only use the path to stay consistent with FileStatus.hashCode.
    Objects.hashCode(path)
  }
}

object SerializableFileStatus {
  def fromStatus(status: FileStatus): SerializableFileStatus = {
    SerializableFileStatus(
      Option(status.getPath).map(_.toString).orNull,
      status.getLen,
      status.isDirectory,
      status.getModificationTime)
  }

  val EMPTY: SerializableFileStatus = fromStatus(new FileStatus())
}
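A round-trip sketch; the file path is hypothetical. Note that toFileStatus rebuilds a LocatedFileStatus with null block locations, so only the path, length, directory flag and modification time survive the trip:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.delta.util.SerializableFileStatus

object SerializableStatusExample extends App {
  val fs = FileSystem.get(new Configuration())
  val original = fs.getFileStatus(new Path("/tmp/example.parquet")) // hypothetical file

  val lightweight = SerializableFileStatus.fromStatus(original) // cheap to ship between JVMs
  val restored = lightweight.toFileStatus
  println(restored.getLen == original.getLen && restored.getPath == original.getPath)
}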
Example 8
Source File: SerializableFileStatus.scala, from parquet-index (Apache License 2.0)
package com.github.lightcopy.util

import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}

object SerializableFileStatus {

  def fromFileStatus(status: FileStatus): SerializableFileStatus = {
    val blockLocations = status match {
      case f: LocatedFileStatus =>
        f.getBlockLocations.map { loc =>
          SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
        }
      case _ =>
        Array.empty[SerializableBlockLocation]
    }

    SerializableFileStatus(
      status.getPath.toString,
      status.getLen,
      status.isDirectory,
      status.getReplication,
      status.getBlockSize,
      status.getModificationTime,
      status.getAccessTime,
      blockLocations)
  }

  def toFileStatus(status: SerializableFileStatus): FileStatus = {
    val blockLocations = status.blockLocations.map { loc =>
      new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
    }
    new LocatedFileStatus(
      new FileStatus(
        status.length,
        status.isDir,
        status.blockReplication,
        status.blockSize,
        status.modificationTime,
        new Path(status.path)),
      blockLocations)
  }
}
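The companion case classes themselves are not included in this excerpt. Inferred purely from how fromFileStatus and toFileStatus use them, they would look roughly like the sketch below; the real definitions may differ in detail:

package com.github.lightcopy.util

// field names and types inferred from the conversions above; treat as a sketch
case class SerializableBlockLocation(
    names: Array[String],
    hosts: Array[String],
    offset: Long,
    length: Long)

case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    blockReplication: Int,
    blockSize: Long,
    modificationTime: Long,
    accessTime: Long,
    blockLocations: Array[SerializableBlockLocation])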