org.apache.hadoop.fs.FileUtil Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileUtil.
Each snippet is taken from an open-source project; the project name and license are noted above each example.
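Before the project-specific examples, here is a minimal, self-contained sketch of the two FileUtil calls that recur throughout this page: fullyDelete for recursive deletion of a local path, and copy for moving data between Hadoop FileSystems. The FileUtilBasics object name and the /tmp paths are placeholders, not taken from any project below.

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object FileUtilBasics {
  def main(args: Array[String]): Unit = {
    // Recursively delete a local file or directory; returns true on success.
    FileUtil.fullyDelete(new File("/tmp/fileutil-demo"))

    // Copy between (possibly different) FileSystems:
    // deleteSource = false, overwrite = true.
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    FileUtil.copy(fs, new Path("/tmp/source"), fs, new Path("/tmp/destination"), false, true, conf)
  }
}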
Example 1
Source File: WithHDFSSupport.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileUtil
import org.apache.hadoop.hdfs.MiniDFSCluster
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, Suite}

trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite =>

  protected var sparkSession: SparkSession = _

  private var hdfsCluster: MiniDFSCluster = _
  protected var hdfsURI: String = _

  private def cleanupAnyExistingSession(): Unit = {
    val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession)
    if (session.isDefined) {
      session.get.sessionState.catalog.reset()
      session.get.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    }
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    cleanupAnyExistingSession()

    val baseDir = new File("./target/hdfs/").getAbsoluteFile()
    FileUtil.fullyDelete(baseDir)

    val conf = new Configuration()
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath())
    val builder = new MiniDFSCluster.Builder(conf)

    hdfsCluster = builder.build()
    hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/"

    sparkSession = SparkSession.builder()
      .master("local")
      .appName(this.getClass.getCanonicalName)
      .enableHiveSupport()
      .config("spark.hadoop.fs.defaultFS", hdfsURI)
      .config("spark.ui.enabled", "false")
      .getOrCreate()
  }

  override protected def afterAll(): Unit = {
    try {
      sparkSession.sessionState.catalog.reset()
      sparkSession.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    } finally {
      sparkSession = null
    }

    System.clearProperty("spark.driver.port")
    hdfsCluster.shutdown(true)

    super.afterAll()
  }
}
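A hypothetical usage sketch (the suite name and test body are not from the project): mixing the trait into a ScalaTest suite gives each suite a SparkSession backed by a throwaway MiniDFSCluster, torn down in afterAll. Depending on the ScalaTest version, FunSuite may live at org.scalatest.funsuite.AnyFunSuite instead.

package com.hortonworks.spark.atlas

import org.scalatest.FunSuite

// Hypothetical suite; any org.scalatest.Suite works because of the trait's self-type.
class ExampleHdfsSuite extends FunSuite with WithHDFSSupport {
  test("round-trips a small DataFrame through the mini HDFS cluster") {
    val path = s"${hdfsURI}tmp/ids"
    sparkSession.range(10).toDF("id").write.mode("overwrite").parquet(path)
    assert(sparkSession.read.parquet(path).count() == 10)
  }
}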
Example 2
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.storage

import java.io.File

import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.sql.SparkSession

object StorageHelper {

  def resolveStorageName(database: String, storageRef: String): String =
    new Path(database + "_" + storageRef).toString

  def load(
      storageSourcePath: String,
      spark: SparkSession,
      database: String,
      storageRef: String,
      withinStorage: Boolean
  ): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef)
    val src = StorageLocator.getStorageSerializedPath(
      storageSourcePath.replaceAllLiterally("\\", "/"),
      dbFolder,
      withinStorage
    )

    val locator = StorageLocator(database, storageRef, spark)

    sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

  def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = {
    val indexUri = "file://" + (new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path + { if (withinStorage) "/storage/" else "" })

    save(fs, index, dst)
  }

  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst))
      fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

  def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String,
                    sparkContext: SparkContext): Unit = {
    if (destinationScheme == "file") {
      copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext)
    } else {
      copyIndexToCluster(source, clusterFilePath, sparkContext)
    }
  }

  private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration)
      val dstFS = dst.getFileSystem(spark.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration)
      }

      spark.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

  private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = {
    val fs = source.getFileSystem(context.hadoopConfiguration)
    if (!fs.exists(destination))
      fs.copyFromLocalFile(false, true, source, destination)
  }
}
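The cluster copy in copyIndexToCluster boils down to one decision: sources on the local filesystem go through copyFromLocalFile, everything else goes through FileUtil.copy. A standalone sketch of that pattern (the object and method names are illustrative, not part of spark-nlp):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileUtil, Path}

object CopyBetweenFileSystems {
  // Copy `source` to `destination`, resolving each path's FileSystem from its scheme.
  def copy(source: Path, destination: Path, conf: Configuration): Unit = {
    val srcFs = source.getFileSystem(conf)
    val dstFs = destination.getFileSystem(conf)
    if (srcFs.getScheme == "file") {
      // Local source: let the destination FileSystem pull it in directly.
      dstFs.copyFromLocalFile(false, true, source, destination)
    } else {
      // Generic FS-to-FS copy: deleteSource = false, overwrite = true.
      FileUtil.copy(srcFs, source, dstFs, destination, false, true, conf)
    }
  }
}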
Example 3
Source File: LocalFileSystemClient.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import ai.deepsense.commons.datetime.DateTimeConverter
import ai.deepsense.commons.resources.ManagedResource
import ai.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
      localFilePath: String,
      remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
}
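A hypothetical usage sketch, assuming the deepsense FileSystemClient, FileInfo, ManagedResource and Serialization types above are on the classpath; the Greeting case class and the demo object are made up for illustration.

package ai.deepsense.deeplang

// Case classes mix in scala.Serializable, so they satisfy the T <: Serializable bounds above.
case class Greeting(text: String)

object LocalFileSystemClientDemo {
  def main(args: Array[String]): Unit = {
    val client = LocalFileSystemClient()
    // Serialize an object to a local file and read it back.
    client.saveObjectToFile("/tmp/greeting.ser", Greeting("hello"))
    val restored = client.readFileAsObject[Greeting]("/tmp/greeting.ser")
    println(restored.text)
    // delete() delegates to FileUtil.fullyDelete, so directories are removed recursively too.
    client.delete("/tmp/greeting.ser")
  }
}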
Example 4
Source File: LocalFileSystemClient.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import io.deepsense.commons.datetime.DateTimeConverter
import io.deepsense.commons.resources.ManagedResource
import io.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
      localFilePath: String,
      remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
}
Example 5
Source File: Neo4jPersistence.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.{File, PrintWriter}

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.Graph

object Neo4jPersistence extends GraphPersistence {
  private val vertices_suffix = "_nodes"
  private val edges_suffix = "_relationships"

  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath + "-header"))
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath + "-header"))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    val nodeHeader = s"name:ID($graphName),:LABEL\n"

    val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header"))
    nodeHeaderWriter.write(nodeHeader)
    nodeHeaderWriter.close()

    graph.vertices.map { case (id, _) => s"$id,$graphName" }.saveAsTextFile(verticesTmpPath)

    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n"

    val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header"))
    relHeaderWriter.write(relationshipHeader)
    relHeaderWriter.close()

    graph.edges.map(edge =>
      edge.attr match {
        case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}"
        case _ => s"${edge.srcId},${edge.dstId},EDGE"
      }
    ).saveAsTextFile(edgesTmpPath)

    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
}
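A hypothetical driver for this export (the graph, object name and Spark setup are illustrative and not part of csb; edge attributes are left at their default so the attribute-less CSV branch above is taken, and VertexData() is assumed to have a no-argument constructor, as in the Util example further down):

package edu.msstate.dasi.csb.persistence

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.sql.SparkSession

object Neo4jExportDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("neo4j-export").getOrCreate()
    val sc = spark.sparkContext

    // Two edges with default (null) EdgeData attributes, default VertexData for every vertex.
    val edges = sc.parallelize(Seq(Edge[EdgeData](1L, 2L), Edge[EdgeData](2L, 3L)))
    val graph: Graph[VertexData, EdgeData] = Graph.fromEdges(edges, VertexData())

    // Writes "social_nodes(-header)" and "social_relationships(-header)" in the working directory.
    Neo4jPersistence.saveAsText(graph, "social", overwrite = true)

    spark.stop()
  }
}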
Example 6
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
}
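Both persistence objects clear previous output with FileUtil.fullyDelete(new File(...)), which only ever touches the local filesystem. If the output lived on HDFS, the scheme-aware FileSystem.delete would be needed instead. A small sketch of that alternative (illustrative, not part of csb):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object DeleteOnAnyFileSystem {
  def deleteRecursively(path: String, conf: Configuration): Boolean = {
    val p = new Path(path)
    // Resolves the FileSystem from the path's scheme (file://, hdfs://, ...),
    // then deletes recursively; returns false if nothing was deleted.
    p.getFileSystem(conf).delete(p, true)
  }
}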
Example 7
Source File: Util.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.util

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

import scala.collection.mutable
import scala.reflect.ClassTag

object Util {

  def time[R](taskName: String, task: => R): R = {
    println(s"[TIME] $taskName started...")
    val start = System.nanoTime
    val ret = task // call-by-name
    val end = System.nanoTime
    println(s"[TIME] $taskName completed in ${(end - start) / 1e9} s")
    ret
  }

  def convertLabelsToStandardForm[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VertexData, EdgeData] = {
    val nodeList = G.vertices
    val edgeList = G.edges
    val hash = new mutable.HashMap[Long, Long]
    val nodes = nodeList.map(record => record._1).collect()
    var counter = 0
    for (entry <- nodes) {
      hash.put(entry, counter)
      counter += 1
    }
    val newNodes = nodeList.map(record => hash.get(record._1).head).sortBy(record => record, ascending = true)
    val newEdges = edgeList.map(record => (hash.get(record.srcId).head, hash.get(record.dstId).head))
    val newEdgesRDD: RDD[Edge[EdgeData]] = newEdges.map(record => Edge(record._1, record._2))
    // val newEdges = edgeList.flatMap(record => Array((hash.get(record._1).head, hash.get(record._2).head), (hash.get(record._2).head, hash.get(record._1).head)))
    return Graph.fromEdges(newEdgesRDD, VertexData())
  }

  def stripMultiEdges[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VD, ED] = {
    G.groupEdges(mergeEdges[ED])
    // val stripedEdges = G.edges.groupBy(record => (record.srcId, record.dstId)).map(record => record._2.head)
    // return Graph.fromEdges(EdgeRDD.fromEdges(stripedEdges), VertexData())
  }

  def mergeEdges[ED: ClassTag](e1: ED, e2: ED): ED = {
    null.asInstanceOf[ED]
  }
}
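Util.merge, called by both persistence objects above, is not included in this excerpt. A plausible stand-in (an assumption, not the project's actual code) concatenates Spark's part-files into a single output with FileUtil.copyMerge, which exists in Hadoop 2.x but was removed in Hadoop 3:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object MergeSketch {
  // Assumption: merge every part-file under srcDir into the single file dstFile.
  def merge(srcDir: String, dstFile: String): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    // deleteSource = false: the callers above delete the temporary directory themselves.
    FileUtil.copyMerge(fs, new Path(srcDir), fs, new Path(dstFile), false, conf, null)
  }
}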