org.apache.hadoop.fs.FileUtil Scala Examples

The following examples show how to use org.apache.hadoop.fs.FileUtil. Each example is taken from an open-source project; the source file, project, and license are noted in the header above each listing.
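Before diving into the project examples, here is a minimal, self-contained sketch of the two FileUtil calls that recur throughout this page: fullyDelete, which recursively removes a local directory, and copy, which copies a path between Hadoop FileSystem instances. The paths and the FileUtilSketch object name are illustrative placeholders, not taken from any of the examples below.

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object FileUtilSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Recursively delete a local directory; returns true on success.
    FileUtil.fullyDelete(new File("/tmp/some-local-dir"))

    // Copy a path from the local filesystem to whatever fs.defaultFS resolves to,
    // keeping the source (deleteSource = false) and overwriting the destination.
    val srcFs = FileSystem.getLocal(conf)
    val dstFs = FileSystem.get(conf)
    FileUtil.copy(srcFs, new Path("/tmp/input"), dstFs, new Path("/user/example/input"),
      false, true, conf)
  }
}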
Example 1
Source File: WithHDFSSupport.scala    From spark-atlas-connector   with Apache License 2.0
package com.hortonworks.spark.atlas

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileUtil
import org.apache.hadoop.hdfs.MiniDFSCluster
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, Suite}

trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite =>

  protected var sparkSession: SparkSession = _

  private var hdfsCluster: MiniDFSCluster = _
  protected var hdfsURI: String = _

  private def cleanupAnyExistingSession(): Unit = {
    val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession)
    if (session.isDefined) {
      session.get.sessionState.catalog.reset()
      session.get.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    }
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    cleanupAnyExistingSession()

    // Remove any data left over from a previous run before starting the mini cluster.
    val baseDir = new File("./target/hdfs/").getAbsoluteFile()
    FileUtil.fullyDelete(baseDir)

    val conf = new Configuration()
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath())
    val builder = new MiniDFSCluster.Builder(conf)

    hdfsCluster = builder.build()
    hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/"

    sparkSession = SparkSession.builder()
      .master("local")
      .appName(this.getClass.getCanonicalName)
      .enableHiveSupport()
      .config("spark.hadoop.fs.defaultFS", hdfsURI)
      .config("spark.ui.enabled", "false")
      .getOrCreate()
  }

  override protected def afterAll(): Unit = {
    try {
      sparkSession.sessionState.catalog.reset()
      sparkSession.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    } finally {
      sparkSession = null
    }
    System.clearProperty("spark.driver.port")

    // Shut down the mini cluster and delete its on-disk data.
    hdfsCluster.shutdown(true)

    super.afterAll()
  }
} 
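A hypothetical usage of the trait above: any ScalaTest suite (FunSuite here, assuming ScalaTest 3.0.x and the same package as the trait) can mix in WithHDFSSupport to get a fresh MiniDFSCluster and a SparkSession whose default filesystem points at it. The suite and test names are illustrative only.

import org.scalatest.FunSuite

// Hypothetical suite, for illustration only: the trait's beforeAll/afterAll
// provide sparkSession and hdfsURI around the tests.
class ExampleHdfsSuite extends FunSuite with WithHDFSSupport {
  test("writes land on the mini HDFS cluster") {
    val df = sparkSession.range(10).toDF("id")
    df.write.mode("overwrite").parquet(s"${hdfsURI}tmp/example")
    assert(sparkSession.read.parquet(s"${hdfsURI}tmp/example").count() == 10)
  }
}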
Example 2
Source File: StorageHelper.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.storage

import java.io.File

import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.sql.SparkSession


object StorageHelper {

  def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString

  def load(
            storageSourcePath: String,
            spark: SparkSession,
            database: String,
            storageRef: String,
            withinStorage: Boolean
          ): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database, storageRef)
    val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage)

    val locator = StorageLocator(database, storageRef, spark)

    sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

  def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = {
    val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path+{if (withinStorage) "/storage/" else ""})

    save(fs, index, dst)
  }

  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst))
      fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

  def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = {
    // A "file" destination scheme means the index only needs to exist on the local
    // filesystem; otherwise it is copied to the cluster and registered via addFile.
    if (destinationScheme == "file") {
      copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext)
    } else {
      copyIndexToCluster(source, clusterFilePath, sparkContext)
    }
  }

  private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration)
      val dstFS = dst.getFileSystem(spark.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration)
      }

      spark.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

  private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = {
    // Copy the index only if it is not already present at the destination.
    val fs = source.getFileSystem(context.hadoopConfiguration)
    if (!fs.exists(destination))
      fs.copyFromLocalFile(false, true, source, destination)
  }

} 
Example 3
Source File: LocalFileSystemClient.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import ai.deepsense.commons.datetime.DateTimeConverter
import ai.deepsense.commons.resources.ManagedResource
import ai.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
    localFilePath: String,
    remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
} 
Example 4
Source File: LocalFileSystemClient.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import io.deepsense.commons.datetime.DateTimeConverter
import io.deepsense.commons.resources.ManagedResource
import io.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
    localFilePath: String,
    remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
} 
Example 5
Source File: Neo4jPersistence.scala    From csb   with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.{File, PrintWriter}

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.Graph

object Neo4jPersistence extends GraphPersistence {
  private val vertices_suffix = "_nodes"
  private val edges_suffix = "_relationships"

  
  /** Saves the graph as Neo4j-importable CSV data files plus matching header files for nodes and relationships. */
  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath + "-header"))
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath + "-header"))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    val nodeHeader = s"name:ID($graphName),:LABEL\n"

    val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header"))
    nodeHeaderWriter.write(nodeHeader)
    nodeHeaderWriter.close()

    graph.vertices.map {
      case (id, _) => s"$id,$graphName"
    }.saveAsTextFile(verticesTmpPath)

    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n"

    val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header"))
    relHeaderWriter.write(relationshipHeader)
    relHeaderWriter.close()

    graph.edges.map(edge =>
      edge.attr match {
        case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}"
        case _ => s"${edge.srcId},${edge.dstId},EDGE"
      }
    ).saveAsTextFile(edgesTmpPath)

    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
} 
Example 6
Source File: SparkPersistence.scala    From csb   with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  
  /** Saves the graph's vertices and edges as text files, merging Spark's part files into a single file each. */
  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
} 
Example 7
Source File: Util.scala    From csb   with GNU General Public License v3.0
package edu.msstate.dasi.csb.util

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

import scala.collection.mutable
import scala.reflect.ClassTag

object Util {
  
  /** Runs the by-name task, printing when it starts and how long it took. */
  def time[R](taskName: String, task: => R): R = {
    println(s"[TIME] $taskName started...")
    val start = System.nanoTime
    val ret = task // call-by-name
    val end = System.nanoTime
    println(s"[TIME] $taskName completed in ${(end - start) / 1e9} s")
    ret
  }

  /** Relabels every vertex ID to a consecutive index starting at 0 and rebuilds the graph from the remapped edges. */
  def convertLabelsToStandardForm[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VertexData, EdgeData] = {
    val nodeList = G.vertices
    val edgeList = G.edges
    // Map each original vertex ID to a consecutive index.
    val hash = new mutable.HashMap[Long, Long]
    val nodes = nodeList.map(record => record._1).collect()
    var counter = 0L
    for (entry <- nodes) {
      hash.put(entry, counter)
      counter += 1
    }
    val newNodes = nodeList.map(record => hash(record._1)).sortBy(record => record, ascending = true)
    val newEdges = edgeList.map(record => (hash(record.srcId), hash(record.dstId)))
    val newEdgesRDD: RDD[Edge[EdgeData]] = newEdges.map(record => Edge(record._1, record._2))
    Graph.fromEdges(newEdgesRDD, VertexData())
  }

  /** Collapses parallel edges between the same pair of vertices into a single edge. */
  def stripMultiEdges[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VD, ED] = {
    // Note: groupEdges only merges edges that are co-located in the same partition,
    // so the graph should normally be repartitioned with partitionBy beforehand.
    G.groupEdges(mergeEdges[ED])
//    val stripedEdges = G.edges.groupBy(record => (record.srcId, record.dstId)).map(record => record._2.head)
//    return Graph.fromEdges(EdgeRDD.fromEdges(stripedEdges), VertexData())
  }

  /** Merge function for groupEdges: the merged edge attribute is discarded (null), since only one edge per vertex pair is kept. */
  def mergeEdges[ED: ClassTag](e1: ED, e2: ED): ED = {
    null.asInstanceOf[ED]
  }
}
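Examples 5 and 6 also call Util.merge(tmpPath, path) to collapse Spark's part-* output files into a single file, but that method is not part of the Util excerpt above. A minimal sketch of such a helper, assuming it simply wraps FileUtil.copyMerge (present in Hadoop 2.x, removed in 3.x), might look like the following; the actual implementation in the csb project may differ.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object MergeSketch {
  // Hypothetical helper: concatenates the part files under srcDir into a single
  // dstFile on the same filesystem and deletes the source directory afterwards.
  def merge(srcDir: String, dstFile: String): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    FileUtil.copyMerge(fs, new Path(srcDir), fs, new Path(dstFile), true, conf, null)
  }
}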