org.apache.hadoop.fs.FileUtil Scala Examples
The following examples show how to use org.apache.hadoop.fs.FileUtil.
Each snippet is taken from an open-source project; the project name and license are noted above each example.
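Before the project-specific examples, here is a minimal, self-contained sketch of the two FileUtil calls that recur throughout this page: fullyDelete for recursive deletion of a local path, and copy for moving data between Hadoop FileSystems. The FileUtilBasics object name and the /tmp paths are placeholders, not taken from any project below.

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object FileUtilBasics {
  def main(args: Array[String]): Unit = {
    // Recursively delete a local file or directory; returns true on success.
    FileUtil.fullyDelete(new File("/tmp/fileutil-demo"))

    // Copy between (possibly different) FileSystems:
    // deleteSource = false, overwrite = true.
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    FileUtil.copy(fs, new Path("/tmp/source"), fs, new Path("/tmp/destination"), false, true, conf)
  }
}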
Example 1
Source File: WithHDFSSupport.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileUtil
import org.apache.hadoop.hdfs.MiniDFSCluster
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, Suite}

trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite =>

  protected var sparkSession: SparkSession = _

  private var hdfsCluster: MiniDFSCluster = _
  protected var hdfsURI: String = _

  private def cleanupAnyExistingSession(): Unit = {
    val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession)
    if (session.isDefined) {
      session.get.sessionState.catalog.reset()
      session.get.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    }
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    cleanupAnyExistingSession()

    val baseDir = new File("./target/hdfs/").getAbsoluteFile()
    FileUtil.fullyDelete(baseDir)

    val conf = new Configuration()
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath())
    val builder = new MiniDFSCluster.Builder(conf)

    hdfsCluster = builder.build()
    hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/"

    sparkSession = SparkSession.builder()
      .master("local")
      .appName(this.getClass.getCanonicalName)
      .enableHiveSupport()
      .config("spark.hadoop.fs.defaultFS", hdfsURI)
      .config("spark.ui.enabled", "false")
      .getOrCreate()
  }

  override protected def afterAll(): Unit = {
    try {
      sparkSession.sessionState.catalog.reset()
      sparkSession.stop()
      SparkSession.clearActiveSession()
      SparkSession.clearDefaultSession()
    } finally {
      sparkSession = null
    }

    System.clearProperty("spark.driver.port")
    hdfsCluster.shutdown(true)

    super.afterAll()
  }
}
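A hypothetical usage sketch (the suite name and test body are not from the project): mixing the trait into a ScalaTest suite gives each suite a SparkSession backed by a throwaway MiniDFSCluster, torn down in afterAll. Depending on the ScalaTest version, FunSuite may live at org.scalatest.funsuite.AnyFunSuite instead.

package com.hortonworks.spark.atlas

import org.scalatest.FunSuite

// Hypothetical suite; any org.scalatest.Suite works because of the trait's self-type.
class ExampleHdfsSuite extends FunSuite with WithHDFSSupport {
  test("round-trips a small DataFrame through the mini HDFS cluster") {
    val path = s"${hdfsURI}tmp/ids"
    sparkSession.range(10).toDF("id").write.mode("overwrite").parquet(path)
    assert(sparkSession.read.parquet(path).count() == 10)
  }
}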
Example 2
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.storage

import java.io.File

import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.sql.SparkSession

object StorageHelper {

  def resolveStorageName(database: String, storageRef: String): String =
    new Path(database + "_" + storageRef).toString

  def load(
      storageSourcePath: String,
      spark: SparkSession,
      database: String,
      storageRef: String,
      withinStorage: Boolean
  ): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef)
    val src = StorageLocator.getStorageSerializedPath(
      storageSourcePath.replaceAllLiterally("\\", "/"),
      dbFolder,
      withinStorage
    )

    val locator = StorageLocator(database, storageRef, spark)

    sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

  def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = {
    val indexUri = "file://" + (new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path + { if (withinStorage) "/storage/" else "" })

    save(fs, index, dst)
  }

  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst))
      fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

  def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String,
                    sparkContext: SparkContext): Unit = {
    if (destinationScheme == "file") {
      copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext)
    } else {
      copyIndexToCluster(source, clusterFilePath, sparkContext)
    }
  }

  private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration)
      val dstFS = dst.getFileSystem(spark.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration)
      }

      spark.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

  private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = {
    val fs = source.getFileSystem(context.hadoopConfiguration)
    if (!fs.exists(destination))
      fs.copyFromLocalFile(false, true, source, destination)
  }
}
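The cluster copy in copyIndexToCluster boils down to one decision: sources on the local filesystem go through copyFromLocalFile, everything else goes through FileUtil.copy. A standalone sketch of that pattern (the object and method names are illustrative, not part of spark-nlp):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileUtil, Path}

object CopyBetweenFileSystems {
  // Copy `source` to `destination`, resolving each path's FileSystem from its scheme.
  def copy(source: Path, destination: Path, conf: Configuration): Unit = {
    val srcFs = source.getFileSystem(conf)
    val dstFs = destination.getFileSystem(conf)
    if (srcFs.getScheme == "file") {
      // Local source: let the destination FileSystem pull it in directly.
      dstFs.copyFromLocalFile(false, true, source, destination)
    } else {
      // Generic FS-to-FS copy: deleteSource = false, overwrite = true.
      FileUtil.copy(srcFs, source, dstFs, destination, false, true, conf)
    }
  }
}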
Example 3
Source File: LocalFileSystemClient.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import ai.deepsense.commons.datetime.DateTimeConverter
import ai.deepsense.commons.resources.ManagedResource
import ai.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
      localFilePath: String,
      remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
}
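A hypothetical usage sketch, assuming the deepsense FileSystemClient, FileInfo, ManagedResource and Serialization types above are on the classpath; the Greeting case class and the demo object are made up for illustration.

package ai.deepsense.deeplang

// Case classes mix in scala.Serializable, so they satisfy the T <: Serializable bounds above.
case class Greeting(text: String)

object LocalFileSystemClientDemo {
  def main(args: Array[String]): Unit = {
    val client = LocalFileSystemClient()
    // Serialize an object to a local file and read it back.
    client.saveObjectToFile("/tmp/greeting.ser", Greeting("hello"))
    val restored = client.readFileAsObject[Greeting]("/tmp/greeting.ser")
    println(restored.text)
    // delete() delegates to FileUtil.fullyDelete, so directories are removed recursively too.
    client.delete("/tmp/greeting.ser")
  }
}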
Example 4
Source File: LocalFileSystemClient.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang

import java.io._
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.FileUtil

import io.deepsense.commons.datetime.DateTimeConverter
import io.deepsense.commons.resources.ManagedResource
import io.deepsense.commons.serialization.Serialization

case class LocalFileSystemClient() extends FileSystemClient with Serialization {

  override def fileExists(path: String): Boolean = Files.exists(Paths.get(path))

  override def copyLocalFile[T <: Serializable](
      localFilePath: String,
      remoteFilePath: String): Unit = {
    def copyFile(f: File, dest: String): Unit = {
      ManagedResource(new FileInputStream(f)) { fis =>
        saveInputStreamToFile(fis, dest)
      }
    }
    val input = new File(localFilePath)
    if (input.isDirectory) {
      input.listFiles().foreach { f => copyFile(f, remoteFilePath + "/" + f.getName) }
    } else {
      copyFile(input, remoteFilePath)
    }
  }

  override def saveObjectToFile[T <: Serializable](path: String, instance: T): Unit = {
    val inputStream = new BufferedInputStream(new ByteArrayInputStream(serialize(instance)))
    ManagedResource(inputStream) { inputStream =>
      saveInputStreamToFile(inputStream, path)
    }
  }

  override def saveInputStreamToFile(inputStream: InputStream, destinationPath: String): Unit =
    ManagedResource(new BufferedOutputStream(new FileOutputStream(destinationPath))) { fos =>
      org.apache.commons.io.IOUtils.copy(inputStream, fos)
    }

  override def readFileAsObject[T <: Serializable](path: String): T =
    ManagedResource(new FileInputStream(path)) { inputStream =>
      deserialize(org.apache.commons.io.IOUtils.toByteArray(inputStream))
    }

  override def getFileInfo(path: String): Option[FileInfo] = {
    val file = new File(path)
    if (file.exists()) {
      Some(FileInfo(file.length(), DateTimeConverter.fromMillis(file.lastModified())))
    } else {
      None
    }
  }

  override def delete(path: String): Unit = FileUtil.fullyDelete(new File(path))
}
Example 5
Source File: Neo4jPersistence.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.{File, PrintWriter}

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.Graph

object Neo4jPersistence extends GraphPersistence {
  private val vertices_suffix = "_nodes"
  private val edges_suffix = "_relationships"

  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath + "-header"))
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath + "-header"))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    val nodeHeader = s"name:ID($graphName),:LABEL\n"

    val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header"))
    nodeHeaderWriter.write(nodeHeader)
    nodeHeaderWriter.close()

    graph.vertices.map { case (id, _) => s"$id,$graphName" }.saveAsTextFile(verticesTmpPath)

    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n"

    val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header"))
    relHeaderWriter.write(relationshipHeader)
    relHeaderWriter.close()

    graph.edges.map(edge =>
      edge.attr match {
        case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}"
        case _ => s"${edge.srcId},${edge.dstId},EDGE"
      }
    ).saveAsTextFile(edgesTmpPath)

    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
}
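A hypothetical driver for this export (the graph, object name and Spark setup are illustrative and not part of csb; edge attributes are left at their default so the attribute-less CSV branch above is taken, and VertexData() is assumed to have a no-argument constructor, as in the Util example further down):

package edu.msstate.dasi.csb.persistence

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.sql.SparkSession

object Neo4jExportDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("neo4j-export").getOrCreate()
    val sc = spark.sparkContext

    // Two edges with default (null) EdgeData attributes, default VertexData for every vertex.
    val edges = sc.parallelize(Seq(Edge[EdgeData](1L, 2L), Edge[EdgeData](2L, 3L)))
    val graph: Graph[VertexData, EdgeData] = Graph.fromEdges(edges, VertexData())

    // Writes "social_nodes(-header)" and "social_relationships(-header)" in the working directory.
    Neo4jPersistence.saveAsText(graph, "social", overwrite = true)

    spark.stop()
  }
}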
Example 6
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
}
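Both persistence objects clear previous output with FileUtil.fullyDelete(new File(...)), which only ever touches the local filesystem. If the output lived on HDFS, the scheme-aware FileSystem.delete would be needed instead. A small sketch of that alternative (illustrative, not part of csb):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object DeleteOnAnyFileSystem {
  def deleteRecursively(path: String, conf: Configuration): Boolean = {
    val p = new Path(path)
    // Resolves the FileSystem from the path's scheme (file://, hdfs://, ...),
    // then deletes recursively; returns false if nothing was deleted.
    p.getFileSystem(conf).delete(p, true)
  }
}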
Example 7
Source File: Util.scala From csb with GNU General Public License v3.0
package edu.msstate.dasi.csb.util

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

import scala.collection.mutable
import scala.reflect.ClassTag

object Util {

  def time[R](taskName: String, task: => R): R = {
    println(s"[TIME] $taskName started...")
    val start = System.nanoTime
    val ret = task // call-by-name
    val end = System.nanoTime
    println(s"[TIME] $taskName completed in ${(end - start) / 1e9} s")
    ret
  }

  def convertLabelsToStandardForm[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VertexData, EdgeData] = {
    val nodeList = G.vertices
    val edgeList = G.edges
    val hash = new mutable.HashMap[Long, Long]
    val nodes = nodeList.map(record => record._1).collect()
    var counter = 0
    for (entry <- nodes) {
      hash.put(entry, counter)
      counter += 1
    }
    val newNodes = nodeList.map(record => hash.get(record._1).head).sortBy(record => record, ascending = true)
    val newEdges = edgeList.map(record => (hash.get(record.srcId).head, hash.get(record.dstId).head))
    val newEdgesRDD: RDD[Edge[EdgeData]] = newEdges.map(record => Edge(record._1, record._2))
    // val newEdges = edgeList.flatMap(record => Array((hash.get(record._1).head, hash.get(record._2).head), (hash.get(record._2).head, hash.get(record._1).head)))
    return Graph.fromEdges(newEdgesRDD, VertexData())
  }

  def stripMultiEdges[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VD, ED] = {
    G.groupEdges(mergeEdges[ED])
    // val stripedEdges = G.edges.groupBy(record => (record.srcId, record.dstId)).map(record => record._2.head)
    // return Graph.fromEdges(EdgeRDD.fromEdges(stripedEdges), VertexData())
  }

  def mergeEdges[ED: ClassTag](e1: ED, e2: ED): ED = {
    null.asInstanceOf[ED]
  }
}
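Util.merge, called by both persistence objects above, is not included in this excerpt. A plausible stand-in (an assumption, not the project's actual code) concatenates Spark's part-files into a single output with FileUtil.copyMerge, which exists in Hadoop 2.x but was removed in Hadoop 3:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}

object MergeSketch {
  // Assumption: merge every part-file under srcDir into the single file dstFile.
  def merge(srcDir: String, dstFile: String): Unit = {
    val conf = new Configuration()
    val fs = FileSystem.get(conf)
    // deleteSource = false: the callers above delete the temporary directory themselves.
    FileUtil.copyMerge(fs, new Path(srcDir), fs, new Path(dstFile), false, conf, null)
  }
}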