org.apache.commons.io.FileUtils Scala Examples
The following examples show how to use org.apache.commons.io.FileUtils.
Each example is taken from an open-source project; the source file, project name, and license are noted above each listing.
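Before the project examples, here is a minimal, self-contained sketch of the FileUtils calls that recur throughout them (forceMkdir, writeStringToFile, readFileToString, copyFileToDirectory, deleteDirectory). The object name and file names are placeholders chosen for illustration and are not part of any project shown below.

import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.commons.io.FileUtils

object FileUtilsQuickStart extends App {
  // work in a throwaway directory under the system temp dir
  val workDir = new File(System.getProperty("java.io.tmpdir"), "fileutils-demo")
  FileUtils.forceMkdir(workDir) // create the directory (and any missing parents)

  // write a small text file, creating it if it does not exist
  val config = new File(workDir, "app.conf")
  FileUtils.writeStringToFile(config, "maxColumn = 120\n", StandardCharsets.UTF_8)

  // read it back as a String
  println(FileUtils.readFileToString(config, StandardCharsets.UTF_8))

  // copy the file into a sibling directory, keeping its name
  FileUtils.copyFileToDirectory(config, new File(workDir, "backup"))

  // recursively delete the whole working directory when done
  FileUtils.deleteDirectory(workDir)
}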
Example 1
Source File: Releaser.scala From releaser with Apache License 2.0
package uk.gov.hmrc.releaser

import java.io.File
import java.nio.file.{Files, Path}

import org.apache.commons.io.FileUtils
import uk.gov.hmrc.releaser.bintray.{BintrayHttp, BintrayRepoConnector, DefaultBintrayRepoConnector}
import uk.gov.hmrc.releaser.github.{GithubConnector, Repo}
import uk.gov.hmrc.{CredentialsFinder, FileDownloader, Logger}

import scala.util.{Failure, Success, Try}

object ReleaserMain {
  def main(args: Array[String]): Unit = {
    val result = Releaser(args)
    System.exit(result)
  }
}

object Releaser extends Logger {

  import ArgParser._

  def apply(args: Array[String]): Int = {
    parser.parse(args, Config()) match {
      case Some(config) =>
        val githubName = config.githubNameOverride.getOrElse(config.artefactName)
        run(config.artefactName, ReleaseCandidateVersion(config.rcVersion), config.releaseType, githubName, config.releaseNotes, config.dryRun)
      case None => -1
    }
  }

  def run(artefactName: String,
          rcVersion: ReleaseCandidateVersion,
          releaseType: ReleaseType.Value,
          gitHubName: String,
          releaseNotes: Option[String],
          dryRun: Boolean = false): Int = {
    val githubCredsFile = System.getProperty("user.home") + "/.github/.credentials"
    val bintrayCredsFile = System.getProperty("user.home") + "/.bintray/.credentials"

    val githubCredsOpt = CredentialsFinder.findGithubCredsInFile(new File(githubCredsFile).toPath)
    val bintrayCredsOpt = CredentialsFinder.findBintrayCredsInFile(new File(bintrayCredsFile).toPath)

    doReleaseWithCleanup { directories =>
      if (githubCredsOpt.isEmpty) {
        log.info(s"Didn't find github credentials in $githubCredsFile")
        -1
      } else if (bintrayCredsOpt.isEmpty) {
        log.info(s"Didn't find Bintray credentials in $bintrayCredsFile")
        -1
      } else {
        val releaserVersion = getClass.getPackage.getImplementationVersion
        val metaDataProvider = new ArtefactMetaDataProvider()
        val gitHubDetails = if (dryRun) GithubConnector.dryRun(githubCredsOpt.get, releaserVersion) else GithubConnector(githubCredsOpt.get, releaserVersion)
        val bintrayDetails = if (dryRun) BintrayRepoConnector.dryRun(bintrayCredsOpt.get, directories.workDir) else BintrayRepoConnector(bintrayCredsOpt.get, directories.workDir)
        val bintrayRepoConnector = new DefaultBintrayRepoConnector(directories.workDir, new BintrayHttp(bintrayCredsOpt.get), new FileDownloader)

        val coordinator = new Coordinator(directories.stageDir, metaDataProvider, gitHubDetails, bintrayRepoConnector)
        val result = coordinator.start(artefactName, Repo(gitHubName), rcVersion, releaseType, releaseNotes)

        result match {
          case Success(targetVersion) =>
            log.info(s"Releaser successfully released $artefactName $targetVersion")
            0
          case Failure(e) =>
            e.printStackTrace()
            log.info(s"Releaser failed to release $artefactName $rcVersion with error '${e.getMessage}'")
            1
        }
      }
    }
  }

  def doReleaseWithCleanup[T](f: ReleaseDirectories => T): T = {
    val directories = ReleaseDirectories()
    try {
      f(directories)
    } finally {
      log.info("cleaning releaser work directory")
      directories.delete().recover { case t => log.warn(s"failed to delete releaser work directory ${t.getMessage}") }
    }
  }
}

case class ReleaseDirectories(tmpDirectory: Path = Files.createTempDirectory("releaser")) {

  lazy val workDir = Files.createDirectories(tmpDirectory.resolve("work"))
  lazy val stageDir = Files.createDirectories(tmpDirectory.resolve("stage"))

  def delete() = Try {
    FileUtils.forceDelete(tmpDirectory.toFile)
  }
}
Example 2
Source File: RemoteConfigWriter.scala From mvn_scalafmt with Apache License 2.0
package org.antipathy.mvn_scalafmt.io

import org.antipathy.mvn_scalafmt.model.RemoteConfig
import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.commons.io.FileUtils
import org.apache.maven.plugin.logging.Log
import java.nio.file.{Files, Path}

// Note: the enclosing class declaration is elided in this excerpt; only the write method is shown.
  override def write(input: RemoteConfig): Path = {
    log.info(s"Writing remote config to ${input.location.toAbsolutePath}")

    if (Files.exists(input.location)) {
      Files.delete(input.location)
    }

    val newConfig = new File(input.location.toAbsolutePath.toString)
    FileUtils.writeStringToFile(
      newConfig,
      input.contents,
      StandardCharsets.UTF_8
    )
    newConfig.toPath
  }
}
Example 3
Source File: RemoteConfigWriterSpec.scala From mvn_scalafmt with Apache License 2.0
package org.antipathy.mvn_scalafmt.io

import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.antipathy.mvn_scalafmt.model.RemoteConfig
import org.apache.commons.io.FileUtils
import org.apache.maven.plugin.logging.SystemStreamLog
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.GivenWhenThen
import org.scalatest.matchers.should.Matchers

class RemoteConfigWriterSpec extends AnyFlatSpec with GivenWhenThen with Matchers {

  behavior of "RemoteConfigWriter"

  it should "Write a config to a local path" in {
    val localPath = s"${System.getProperty("java.io.tmpdir")}${File.separator}.scalafmt.conf"
    val contents =
      """version = "1.5.1"
        |maxColumn = 120
        |align = false
        |rewrite.rules = [SortImports]
        |danglingParentheses = true
        |importSelectors = singleLine
        |binPack.parentConstructors = true
        |includeCurlyBraceInSelectChains = false""".stripMargin
    val writer = new RemoteConfigWriter(new SystemStreamLog)
    val input = RemoteConfig(contents, Paths.get(localPath))

    writer.write(input)

    new String(Files.readAllBytes(new File(localPath).toPath))
    Files.delete(input.location)
  }

  it should "Overwrite a config in a local path" in {
    val localPath = s"${System.getProperty("java.io.tmpdir")}${File.separator}.scalafmt2.conf"
    val contents =
      """version = "1.5.1"
        |maxColumn = 120
        |align = false
        |rewrite.rules = [SortImports]
        |danglingParentheses = true
        |importSelectors = singleLine
        |binPack.parentConstructors = true
        |includeCurlyBraceInSelectChains = false""".stripMargin
    val oldContents = "SomeOldConfig"

    val writer = new RemoteConfigWriter(new SystemStreamLog)
    val input = RemoteConfig(contents, Paths.get(localPath))

    FileUtils.writeStringToFile(new File(localPath), oldContents, StandardCharsets.UTF_8)
    new String(Files.readAllBytes(new File(localPath).toPath)) should be(oldContents)

    writer.write(input)

    new String(Files.readAllBytes(new File(localPath).toPath)) should be(contents)
    Files.delete(input.location)
  }
}
Example 4
Source File: TaglessFinal.scala From Mastering-Functional-Programming with MIT License
package jvm

import scala.concurrent.{Future, Await}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration

import cats._, cats.implicits._

trait Capabilities[F[_]] {
  def resource(name: String): F[String]
  def notify(target: String, text: String): F[Unit]
}

object TaglessFinalExample extends App {

  implicit val capabilities: Capabilities[Future] = new Capabilities[Future] {
    import java.io.File
    import org.apache.commons.io.FileUtils

    def resource(name: String): Future[String] =
      Future { FileUtils.readFileToString(new File(name), "utf8") }

    def notify(target: String, text: String): Future[Unit] =
      Future { println(s"Notifying $target: $text") }
  }

  implicit val anotherEnvironmentCapabilities: Capabilities[Future] = new Capabilities[Future] {
    def resource(name: String): Future[String] = ???
    def notify(target: String, text: String): Future[Unit] = ???
  }

  implicit val logMonad: Monad[Future] = new Monad[Future] {
    def flatMap[A, B](fa: Future[A])(f: (A) ⇒ Future[B]): Future[B] =
      fa.flatMap { x =>
        println(s"Trace of the Future's result: $x")
        f(x)
      }

    def pure[A](x: A): Future[A] = Future(x)

    def tailRecM[A, B](a: A)(f: (A) ⇒ Future[Either[A, B]]): Future[B] = ???
  }

  def income[F[_]](implicit M: Monad[F], C: Capabilities[F]): F[Unit] =
    for {
      contents <- C.resource("sales.csv")
      total = contents
        .split("\n").toList.tail  // Collection of lines, drop the CSV header
        .map { _.split(",").toList match  // List[Double] - prices of each of the entries
          { case name :: price :: Nil => price.toDouble }
        }
        .sum
      _ <- C.notify("[email protected]", s"Total income made today: $total")
    } yield ()

  Await.result(income[Future](logMonad, capabilities), Duration.Inf)  // Block so that the application does not exit prematurely
}

object FacadeExample {
  trait Capabilities {
    def resource(name: String): String
    def notify(target: String, text: String): Unit
  }

  def income(c: Capabilities): Unit = {
    val contents = c.resource("sales.csv")
    val total = contents
      .split("\n").toList.tail  // Collection of lines, drop the CSV header
      .map { _.split(",").toList match  // List[Double] - prices of each of the entries
        { case name :: price :: Nil => price.toDouble }
      }
      .sum
    c.notify("[email protected]", s"Total income made today: $total")
  }
}
Example 5
Source File: TilingServiceSpec.scala From recogito2 with Apache License 2.0
package transform.tiling

import java.io.File

import org.apache.commons.io.FileUtils
import org.specs2.mutable._
import org.specs2.runner._
import org.junit.runner._

import play.api.test._
import play.api.test.Helpers._

@RunWith(classOf[JUnitRunner])
class TilingServiceSpec extends Specification {

  val TEST_IMAGE = new File("test/resources/transform/tiling/Ptolemy_map_15th_century.jpg")

  val TMP_DIR = {
    val dir = new File("test/resources/transform/tiling/tmp")
    if (dir.exists) FileUtils.deleteDirectory(dir)
    dir
  }

  "The Tiling function" should {

    "create proper Zoomify tiles from the test image" in {
      TilingService.createZoomify(TEST_IMAGE, TMP_DIR)

      TMP_DIR.exists must equalTo(true)
      TMP_DIR.list.size must equalTo(2)

      new File(TMP_DIR, "ImageProperties.xml").exists must equalTo(true)

      val tileGroup0 = new File(TMP_DIR, "TileGroup0")
      tileGroup0.exists must equalTo(true)
      tileGroup0.list.size must equalTo(65)
      tileGroup0.list.filter(_.endsWith(".jpg")).size must equalTo(65)

      FileUtils.deleteDirectory(TMP_DIR)
      success
    }
  }
}
Example 6
Source File: TestNewApiWithCaseClass.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.newapi

import java.io.File

import io.gzet.newapi.CreateAvroWithCase.{V21EnhancedDate, GkgRecordCase}
import io.gzet.test.SparkFunSuite
import com.databricks.spark.avro._
import org.apache.commons.io.FileUtils

class TestNewApiWithCaseClass extends SparkFunSuite {

  val inputFilePath = getClass.getResource("/20160101020000.gkg.csv")
  val avroStructPath = "target/20160101020000.gkg.case.avro"

  localTest("Create and write Avro using spark-avro lib and case") { spark =>
    val gdeltRDD = spark.sparkContext.textFile(inputFilePath.toString)
    val gdeltRowRDD = gdeltRDD.map(_.split("\t", -1))
    val gkgRecordRDD = gdeltRowRDD.map(attributes =>
      GkgRecordCase(CreateAvroWithCase.createGkgRecordId(attributes(0)),
        attributes(1).toLong,
        attributes(2),
        attributes(3),
        attributes(4),
        CreateAvroWithCase.createV1Counts(attributes(5)),
        CreateAvroWithCase.createV21Counts(attributes(6)),
        CreateAvroWithCase.createV1Themes(attributes(7)),
        CreateAvroWithCase.createV2EnhancedThemes(attributes(8)),
        CreateAvroWithCase.createV1Locations(attributes(9)),
        CreateAvroWithCase.createV2Locations(attributes(10)),
        CreateAvroWithCase.createV1Persons(attributes(11)),
        CreateAvroWithCase.createV2Persons(attributes(12)),
        CreateAvroWithCase.createV1Orgs(attributes(13)),
        CreateAvroWithCase.createV2Orgs(attributes(14)),
        CreateAvroWithCase.createV1Stone(attributes(15)),
        CreateAvroWithCase.createEnhancedDate(attributes(16)),
        CreateAvroWithCase.createV2GCAM(attributes(17)),
        attributes(18),
        CreateAvroWithCase.createV21RelImgAndVid(attributes(19)),
        CreateAvroWithCase.createV21RelImgAndVid(attributes(20)),
        CreateAvroWithCase.createV21RelImgAndVid(attributes(21)),
        CreateAvroWithCase.createV21Quotations(attributes(22)),
        CreateAvroWithCase.createV21AllNames(attributes(23)),
        CreateAvroWithCase.createV21Amounts(attributes(24)),
        CreateAvroWithCase.createV21TransInfo(attributes(25)),
        attributes(26))
    )

    FileUtils.deleteDirectory(new File(avroStructPath))
    val gdeltDF = spark.createDataFrame(gkgRecordRDD)
    gdeltDF.write.avro(avroStructPath)
    assertResult(4)(new File(avroStructPath).listFiles.length)
  }

  localTest("Read Avro into Dataframe using spark-avro") { spark =>
    val gdeltAvroDF = spark.read.format("com.databricks.spark.avro").load(avroStructPath)
    assertResult(10)(gdeltAvroDF.count)
    gdeltAvroDF.show
  }
}
Example 7
Source File: TestNewApiWithStructs.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.newapi

import java.io.File

import io.gzet.test.SparkFunSuite
import com.databricks.spark.avro._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Row

class TestNewApiWithStructs extends SparkFunSuite {

  val inputFilePath = getClass.getResource("/20160101020000.gkg.csv")
  val avroStructPath = "target/20160101020000.gkg.struct.avro"

  localTest("Create and write Avro using spark-avro lib and Structs") { spark =>
    val gdeltRDD = spark.sparkContext.textFile(inputFilePath.toString)
    val gdeltRowRDD = gdeltRDD.map(_.split("\t", -1))
      .map(attributes => Row(
        CreateAvroWithStructs.createGkgRecordID(attributes(0)),
        attributes(1).toLong,
        attributes(2),
        attributes(3),
        attributes(4),
        CreateAvroWithStructs.createV1Counts(attributes(5)),
        CreateAvroWithStructs.createV21Counts(attributes(6)),
        CreateAvroWithStructs.createV1Themes(attributes(7)),
        CreateAvroWithStructs.createV2EnhancedThemes(attributes(8)),
        CreateAvroWithStructs.createV1Locations(attributes(9)),
        CreateAvroWithStructs.createV2Locations(attributes(10)),
        CreateAvroWithStructs.createV1Persons(attributes(11)),
        CreateAvroWithStructs.createV2Persons(attributes(12)),
        CreateAvroWithStructs.createV1Orgs(attributes(13)),
        CreateAvroWithStructs.createV2Orgs(attributes(14)),
        CreateAvroWithStructs.createV1Stone(attributes(15)),
        CreateAvroWithStructs.createV21Dates(attributes(16)),
        CreateAvroWithStructs.createV2GCAM(attributes(17)),
        attributes(18),
        CreateAvroWithStructs.createV21RelImgAndVid(attributes(19)),
        CreateAvroWithStructs.createV21RelImgAndVid(attributes(20)),
        CreateAvroWithStructs.createV21RelImgAndVid(attributes(21)),
        CreateAvroWithStructs.createV21Quotations(attributes(22)),
        CreateAvroWithStructs.createV21AllNames(attributes(23)),
        CreateAvroWithStructs.createV21Amounts(attributes(24)),
        CreateAvroWithStructs.createV21TransInfo(attributes(25)),
        attributes(26)
      ))

    FileUtils.deleteDirectory(new File(avroStructPath))
    val gdeltDF = spark.createDataFrame(gdeltRowRDD, CreateAvroWithStructs.GkgSchema)
    gdeltDF.write.avro(avroStructPath)
    assertResult(4)(new File(avroStructPath).listFiles.length)
  }

  localTest("Read Avro into Dataframe using spark-avro") { spark =>
    val gdeltAvroDF = spark.read.format("com.databricks.spark.avro").load(avroStructPath)
    assertResult(10)(gdeltAvroDF.count)
    gdeltAvroDF.show
  }
}
Example 8
Source File: CryptoTest.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.hadoop.io.compress.CryptoCodec
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{Matchers, FunSuite}

class CryptoTest extends FunSuite with Matchers {

  val cryptoDir = System.getProperty("java.io.tmpdir") + "cryptTestDir"

  test("Crypto encrypt then decrypt file") {
    val conf = new SparkConf()
      .setAppName("Test Crypto")
      .setMaster("local")
      .set("spark.default.parallelism", "1")
      .set("spark.hadoop.io.compression.codecs", "org.apache.hadoop.io.compress.CryptoCodec")
    val sc = new SparkContext(conf)

    val testFile = getClass.getResource("/gdeltTestFile.csv")
    val rdd = sc.textFile(testFile.getPath)
    rdd.saveAsTextFile(cryptoDir, classOf[CryptoCodec])

    val read = sc.textFile(cryptoDir)
    val allLines = read.collect

    allLines.size should be(20)
    allLines(0).startsWith("331150686") should be (true)
    allLines(allLines.length - 1).endsWith("polytrack/") should be (true)

    FileUtils.deleteDirectory(new File(cryptoDir))
    sc.stop
  }
}
Example 9
Source File: StorageSpec.scala From piglet with Apache License 2.0
package dbis.piglet.backends.spark

import java.io.File

import dbis.piglet.backends.{Record, SchemaClass}
import org.apache.commons.io.FileUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest._

case class DataRecord(col1: Int, col2: String) extends java.io.Serializable with SchemaClass {
  override def mkString(delim: String) = s"$col1$delim$col2"
}

case class DoubleRecord(col1: Double, col2: Double) extends java.io.Serializable with SchemaClass {
  override def mkString(delim: String) = s"$col1$delim$col2"
}

class StorageSpec extends FlatSpec with Matchers with BeforeAndAfter {

  var sc: SparkContext = _
  var conf: SparkConf = _

  before {
    // to avoid Akka rebinding to the same port, since it doesn't unbind
    // immediately after shutdown
    System.clearProperty("spark.driver.port")
    System.clearProperty("spark.hostPort")
    conf = new SparkConf().setMaster("local").setAppName(getClass.getSimpleName)
    sc = new SparkContext(conf)
  }

  after {
    // cleanup SparkContext data
    sc.stop()
    sc = null
    conf = null
    System.clearProperty("spark.driver.port")
    System.clearProperty("spark.hostPort")
  }

  "PigStorage" should "load objects using an extractor" in {
    val res = PigStorage[Person]().load(sc, "sparklib/src/test/resources/person.csv",
      (data: Array[String]) => Person(data(0), data(1).toInt), ",")
    res.collect() should be (Array(Person("Anna", 21), Person("John", 53), Person("Mike", 32)))
  }

  it should "save and load records" in {
    val res = PigStorage[Person]().load(sc, "sparklib/src/test/resources/person.csv",
      (data: Array[String]) => Person(data(0), data(1).toInt), ",")
    PigStorage[Person]().write("person.data", res, "|")
    val otherRes = PigStorage[Person]().load(sc, "person.data",
      (data: Array[String]) => Person(data(0), data(1).toInt), "[|]")
    res.collect() should be (otherRes.collect())
    FileUtils.deleteDirectory(new File("person.data"))
  }
}
Example 10
Source File: FlinkStreamingCEPTest.scala From piglet with Apache License 2.0
package dbis.cep.test.flink

import java.io.File

import dbis.piglet.backends.{Record, SchemaClass}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.scalatest._
import org.apache.commons.io.FileUtils
import org.apache.flink.api.scala._
import dbis.piglet.cep.nfa._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.flink.CustomDataStreamMatcher._

import scala.collection.mutable.ArrayBuffer
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows

case class StreamingDoubleRecord(col1: Int, col2: Int) extends java.io.Serializable with SchemaClass {
  override def mkString(delim: String) = s"$col1$delim$col2"
}

object OurStreamingNFA {

  def filter1(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 1
  def filter2(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 2
  def filter3(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 3

  def createNFA = {
    val testNFA: NFAController[StreamingDoubleRecord] = new NFAController()
    val firstState = testNFA.createAndGetStartState("First")
    val secondState = testNFA.createAndGetNormalState("Second")
    val thirdState = testNFA.createAndGetNormalState("Third")
    val finalState = testNFA.createAndGetFinalState("Final")

    val firstEdge = testNFA.createAndGetForwardEdge(filter1)
    val secondEdge = testNFA.createAndGetForwardEdge(filter2)
    val thirdEdge = testNFA.createAndGetForwardEdge(filter3)

    testNFA.createForwardTransition(firstState, firstEdge, secondState)
    testNFA.createForwardTransition(secondState, secondEdge, thirdState)
    testNFA.createForwardTransition(thirdState, thirdEdge, finalState)
    testNFA
  }
}

class FlinkStreamingCEPTest extends FlatSpec with Matchers with BeforeAndAfterEach {

  var resultArray = new ArrayBuffer[StreamingDoubleRecord]

  override def beforeEach() {
    resultArray.clear()
  }

  val sample = Seq(
    StreamingDoubleRecord(1,1),
    StreamingDoubleRecord(2,2),
    StreamingDoubleRecord(1,3),
    StreamingDoubleRecord(2,4),
    StreamingDoubleRecord(3,5),
    StreamingDoubleRecord(1,6),
    StreamingDoubleRecord(2,7),
    StreamingDoubleRecord(3,8))

  "Flink Streaming CEP" should "detect the pattern SEQ(A, B, C) with first match" in {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val data = env.fromCollection(sample)
    val res = data.matchNFA(OurStreamingNFA.createNFA, env, FirstMatch)
  }

  it should "detect the pattern SEQ(A, B, C) with any match" in {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val data = env.fromCollection(sample)
    val res = data.matchNFA(OurStreamingNFA.createNFA, env, AllMatches)
  }

  it should "detect the pattern SEQ(A, B, C) with next match" in {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val data = env.fromCollection(sample)
    val res = data.matchNFA(OurStreamingNFA.createNFA, env, NextMatches)
  }

  it should "detect the pattern SEQ(A, B, C) with contiguity match" in {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val data = env.fromCollection(sample)
    val res = data.matchNFA(OurStreamingNFA.createNFA, env, ContiguityMatches)
  }
}
Example 11
Source File: ArchiveUtils.scala From dl4scala with MIT License
package org.dl4scala.util

import org.slf4j.LoggerFactory
import org.apache.commons.compress.archivers.tar.TarArchiveEntry
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import org.apache.commons.io.FileUtils
import org.apache.commons.io.IOUtils
import java.io._
import java.util.zip.GZIPInputStream
import java.util.zip.ZipInputStream

// Note: this excerpt is truncated in the original; the object declaration, the method signature,
// and the preceding tar-extraction branch are not shown.
      tarIn.close()
    } else if (file.endsWith(".gz")) {
      val is2 = new GZIPInputStream(fin)
      val extracted = new File(target.getParent, target.getName.replace(".gz", ""))
      if (extracted.exists) extracted.delete
      extracted.createNewFile
      val fos = FileUtils.openOutputStream(extracted)
      IOUtils.copyLarge(is2, fos)
      is2.close()
      fos.flush()
      fos.close()
    }
    target.delete
  }
}
Example 12
Source File: FlowerDataSetIterator.scala From dl4scala with MIT License
package org.dl4scala.examples.transferlearning.vgg16.dataHelpers

import java.io.{File, IOException}
import java.net.URL

import org.datavec.api.io.filters.BalancedPathFilter
import org.datavec.api.io.labels.ParentPathLabelGenerator
import org.datavec.api.split.{FileSplit, InputSplit}
import org.datavec.image.loader.BaseImageLoader
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
import java.util
import java.util.Random

import org.apache.commons.io.FileUtils
import org.datavec.api.util.ArchiveUtils
import org.datavec.image.recordreader.ImageRecordReader
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator
import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModels

object FlowerDataSetIterator {

  private val log = org.slf4j.LoggerFactory.getLogger(FlowerDataSetIterator.getClass)

  private val DATA_DIR = new File(System.getProperty("user.home")) + "/dl4jDataDir"
  private val DATA_URL = "http://download.tensorflow.org/example_images/flower_photos.tgz"
  private val FLOWER_DIR = DATA_DIR + "/flower_photos"

  private val allowedExtensions = BaseImageLoader.ALLOWED_FORMATS
  private val rng = new Random(13)

  private val height = 224
  private val width = 224
  private val channels = 3
  private val numClasses = 5

  private val labelMaker = new ParentPathLabelGenerator
  private var trainData: InputSplit = _
  private var testData: InputSplit = _
  private var batchSize = 0

  @throws(classOf[IOException])
  def trainIterator: DataSetIterator = makeIterator(trainData)

  @throws(classOf[IOException])
  def testIterator: DataSetIterator = makeIterator(testData)

  @throws(classOf[IOException])
  def setup(batchSizeArg: Int, trainPerc: Int): Unit = {
    try downloadAndUntar()
    catch {
      case e: IOException =>
        e.printStackTrace()
        log.error("IOException : ", e)
    }

    batchSize = batchSizeArg

    val parentDir = new File(FLOWER_DIR)
    val filesInDir = new FileSplit(parentDir, allowedExtensions, rng)
    val pathFilter = new BalancedPathFilter(rng, allowedExtensions, labelMaker)
    if (trainPerc >= 100)
      throw new IllegalArgumentException("Percentage of data set aside for training has to be less than 100%." +
        " Test percentage = 100 - training percentage, has to be greater than 0")

    val filesInDirSplit = filesInDir.sample(pathFilter, trainPerc, 100 - trainPerc)
    trainData = filesInDirSplit(0)
    testData = filesInDirSplit(1)
  }

  @throws(classOf[IOException])
  private def makeIterator(split: InputSplit) = {
    val recordReader = new ImageRecordReader(height, width, channels, labelMaker)
    recordReader.initialize(split)
    val iter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numClasses)
    iter.setPreProcessor(TrainedModels.VGG16.getPreProcessor)
    iter
  }

  @throws(classOf[IOException])
  def downloadAndUntar(): Unit = {
    val rootFile = new File(DATA_DIR)
    if (!rootFile.exists) rootFile.mkdir
    val tarFile = new File(DATA_DIR, "flower_photos.tgz")
    if (!tarFile.isFile) {
      log.info("Downloading the flower dataset from " + DATA_URL + "...")
      FileUtils.copyURLToFile(new URL(DATA_URL), tarFile)
    }
    ArchiveUtils.unzipFileTo(tarFile.getAbsolutePath, rootFile.getAbsolutePath)
  }
}
Example 13
Source File: PerTestSparkSession.scala From Spark-RSVD with Apache License 2.0
package com.criteo.rsvd

import java.io.File
import java.nio.file.{Files, Path}
import java.util.concurrent.locks.ReentrantLock

import org.apache.commons.io.FileUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.scalatest.{BeforeAndAfterEach, Suite}

import scala.reflect.ClassTag
import scala.util.control.NonFatal

object LocalSparkSession {
  private[this] val lock = new ReentrantLock()

  def acquire(): Unit = lock.lock()

  def release(): Unit = lock.unlock()

  def builder: SparkSession.Builder = {
    SparkSession
      .builder()
      .master("local[*]")
      .appName("test")
      .config("spark.ui.enabled", false)
  }
}

// Note: the declaration of the enclosing trait (PerTestSparkSession, per the file name) and its
// currentSession, checkpointDir and sc members are elided in this excerpt; only selected members are shown.

  def sparkConf: Map[String, Any] = Map()

  def toRDD[T: ClassTag](input: Seq[T]): RDD[T] = sc.parallelize(input)

  def toArray[T](input: RDD[T]): Array[T] = input.collect()

  protected def closeSession() = {
    currentSession.foreach(_.stop())
    currentSession = None

    try {
      checkpointDir.foreach(path => FileUtils.deleteDirectory(new File(path.toString)))
    } catch {
      case NonFatal(_) =>
    }
    checkpointDir = None

    LocalSparkSession.release()
  }

  private def getOrCreateSession = synchronized {
    if (currentSession.isEmpty) {
      val builder = LocalSparkSession.builder
      for ((key, value) <- sparkConf) {
        builder.config(key, value.toString)
      }
      currentSession = Some(builder.getOrCreate())
      checkpointDir = Some(Files.createTempDirectory("spark-unit-test-checkpoint-"))
      currentSession.get.sparkContext
        .setCheckpointDir(checkpointDir.get.toString)
      currentSession.get.sparkContext.setLogLevel("WARN")
    }
    currentSession.get
  }

  override def beforeEach(): Unit = {
    LocalSparkSession.acquire()
    super.beforeEach()
  }

  override def afterEach(): Unit = {
    try {
      super.afterEach()
    } finally {
      closeSession()
    }
  }
}
Example 14
Source File: DockerCopyBuildAction.scala From berilia with Apache License 2.0
package com.criteo.dev.cluster.docker

import java.io.File

import com.criteo.dev.cluster.{GeneralConstants, GeneralUtilities}
import org.apache.commons.io.FileUtils

class DockerCopyBuildAction(dockerFile: String, dockerImage: String, resourcePath: String)
  extends DockerBuildAction(dockerFile, dockerImage) {

  val tempDir = "tmpResources"

  override def run(): Unit = {
    val tmpResourcePath = s"${GeneralUtilities.getHomeDir}/${DockerConstants.dockerBaseDir}/$tempDir"
    val tmpResource = new File(tmpResourcePath)
    GeneralUtilities.prepareDir(tmpResourcePath)

    val resource = new File(s"${GeneralUtilities.getHomeDir}/$resourcePath")
    require(resource.exists(), s"Internal error, resource to copy does not exist: $resourcePath")

    if (resource.isFile()) {
      FileUtils.copyFileToDirectory(resource, tmpResource)
    } else if (resource.isDirectory()) {
      FileUtils.copyDirectory(resource, tmpResource)
    }

    super.addArg(DockerConstants.resource, s"$tempDir")
    super.run()

    FileUtils.deleteDirectory(tmpResource)
  }
}

object DockerCopyBuildAction {
  def apply(dockerFile: String, dockerImage: String, resourcePath: String) = {
    val obj = new DockerCopyBuildAction(dockerFile, dockerImage, resourcePath)
    obj.run
  }
}
Example 15
Source File: SourceFileSequenceBuilder.scala From mvn_scalafmt with Apache License 2.0
package org.antipathy.mvn_scalafmt.builder

import java.io.File
import java.nio.file.{Files, Paths}

import org.apache.commons.io.FileUtils
import org.apache.maven.plugin.logging.Log

import scala.jdk.CollectionConverters._

// Note: the enclosing class declaration is elided in this excerpt; only the build method is shown.
  override def build(paths: Seq[File]): Seq[File] =
    if (paths == null) {
      log.warn("Could not locate any scala sources to format")
      Seq.empty[File]
    } else {
      val files = paths.map(_.getCanonicalPath).flatMap { p =>
        if (Files.exists(Paths.get(p))) {
          Some(new File(p))
        } else {
          log.warn(s"Could not locate Scala source at $p")
          None
        }
      }
      files.flatMap(file => FileUtils.listFiles(file, Array("scala", "sc", "sbt"), true).asScala)
    }
}
Example 16
Source File: TgzTransformerSpec.scala From releaser with Apache License 2.0
package uk.gov.hmrc.releaser

import java.io._
import java.nio.file.{Files, Path}

import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream}
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import org.apache.commons.io.FileUtils
import org.scalatest._

import scala.collection.mutable.ListBuffer
import scala.util.{Failure, Success}

class TgzTransformerSpec extends WordSpec with Matchers with BeforeAndAfterEach with OptionValues with TryValues {

  val tgzPath = new File(this.getClass.getResource("/help-frontend/uk/gov/hmrc/help-frontend_2.11/1.26.0-3-gd7ed03c/help-frontend_2.11-1.26.0-3-gd7ed03c.tgz").toURI).toPath

  var transformer: TgzTransformer = _
  val candidate_1_26_0_3_gd7ed03c = ReleaseCandidateVersion("1.26.0-3-gd7ed03c")
  val release_1_4_0 = ReleaseVersion("1.4.0")

  var tmpDir: Path = _

  override def beforeEach() {
    tmpDir = Files.createTempDirectory("tmp")
    transformer = new TgzTransformer()
    FileUtils.copyFileToDirectory(tgzPath.toFile, tmpDir.toFile)
  }

  override def afterEach() {
    FileUtils.deleteDirectory(tmpDir.toFile)
  }

  "the transformer" should {

    "decompress the tgz, rename the main folder and compress it back" in {
      val inFile = new File(tmpDir.toFile, tgzPath.getFileName.toString).toPath
      val targetFilePath = tmpDir.resolve("help-frontend-1.4.0.tgz")

      val originalTarEntries = listTgzEntries(inFile)
      assertTarEntry(originalTarEntries, "./help-frontend-1.26.0-3-gd7ed03c/")
      assertTarEntry(originalTarEntries, "./help-frontend-1.4.0/", exists = false)
      assertTarEntry(originalTarEntries, "./start-docker.sh", mode = Some(493))

      val outFileTry = transformer(inFile, "help-frontend", candidate_1_26_0_3_gd7ed03c, release_1_4_0, targetFilePath)
      outFileTry match {
        case Success(outFile) =>
          val tarEntries = listTgzEntries(targetFilePath)
          assertTarEntry(tarEntries, "./help-frontend-1.26.0-3-gd7ed03c/", exists = false)
          assertTarEntry(tarEntries, "./help-frontend-1.4.0/")
          assertTarEntry(tarEntries, "./start-docker.sh", mode = Some(493))
        case Failure(e) => fail("Caught exception: " + e.getMessage, e)
      }
    }
  }

  private def listTgzEntries(localTgzFile: Path): List[TarArchiveEntry] = {
    val bytes = new Array[Byte](2048)
    val fin = new BufferedInputStream(new FileInputStream(localTgzFile.toFile))
    val gzIn = new GzipCompressorInputStream(fin)
    val tarIn = new TarArchiveInputStream(gzIn)

    val entries = ListBuffer[TarArchiveEntry]()

    Iterator continually tarIn.getNextTarEntry takeWhile (null !=) foreach { tarEntry =>
      entries += tarEntry
    }

    tarIn.close()

    entries.toList
  }

  private def assertTarEntry(tarEntries: List[TarArchiveEntry], entryName: String, exists: Boolean = true, mode: Option[Int] = None) = {
    val entryOption = tarEntries.find(_.getName == entryName)
    entryOption match {
      case Some(entry) =>
        exists shouldBe true
        mode.foreach { m => m shouldBe entry.getMode }
      case None => exists shouldBe false
    }
  }
}
Example 17
Source File: HttpSlippyTileReader.scala From geotrellis-osm-elevation with Apache License 2.0
package geotrellis.osme.core

import geotrellis.vector._
import geotrellis.raster._
import geotrellis.raster.io.geotiff._
import geotrellis.spark._
import geotrellis.spark.io.s3._
import geotrellis.spark.io.slippy._
import geotrellis.util.Filesystem

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter._
import org.apache.commons.io.IOUtils._

import org.apache.spark._
import org.apache.spark.rdd._

import java.net._
import java.io.File

class HttpSlippyTileReader[T](pathTemplate: String)(fromBytes: (SpatialKey, Array[Byte]) => T) extends SlippyTileReader[T] {

  def getURL(template: String, z: Int, x: Int, y: Int) =
    template.replace("{z}", z.toString).replace("{x}", x.toString).replace("{y}", y.toString)

  def getByteArray(url: String) = {
    val inStream = new URL(url).openStream()
    try {
      toByteArray(inStream)
    } finally {
      inStream.close()
    }
  }

  def read(zoom: Int)(implicit sc: SparkContext): RDD[(SpatialKey, T)] = ???

  def read(zoom: Int, key: SpatialKey): T =
    fromBytes(key, getByteArray(getURL(pathTemplate, zoom, key.col, key.row)))

  override def read(zoom: Int, x: Int, y: Int): T = read(zoom, SpatialKey(x, y))
}
Example 18
Source File: QueryCsvTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import com.github.dnvriend.TestSpec
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime}
import org.apache.spark.sql.types._
import org.scalatest.Ignore

import scala.concurrent.duration._
import scala.language.implicitConversions

@Ignore
class QueryCsvTest extends TestSpec {

  def copyFiles(nrTimes: Int = 10): Unit = {
    FileUtils.deleteDirectory("/tmp/csv")
    FileUtils.forceMkdir("/tmp/csv")
    (1 to nrTimes).foreach { x =>
      FileUtils.copyFile(TestSpec.PeopleCsv, s"/tmp/csv/people-$x")
    }
  }

  val schema: StructType = StructType(Array(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))

  it should "query csv file" in withSparkSession { spark =>
    copyFiles()

    val csv = spark.readStream
      .schema(schema)
      .format("csv")
      .option("maxFilesPerTrigger", 1)
      .option("header", "false") // Use first line of all files as header
      .option("inferSchema", "false") // Automatically infer data types
      .option("delimiter", ";")
      .load("/tmp/csv")

    csv.printSchema()

    println("Is the query streaming: " + csv.isStreaming)
    println("Are there any streaming queries? " + spark.streams.active.isEmpty)

    val query = csv
      .writeStream
      .format("console")
      .trigger(ProcessingTime(5.seconds))
      .queryName("consoleStream")
      .outputMode(OutputMode.Append())
      .start()

    // waiting for data
    sleep(3.seconds)

    spark.streams
      .active
      .foreach(println)

    spark.streams
      .active
      .foreach(_.explain(extended = true))

    query.awaitTermination(20.seconds)
  }
}
Example 19
Source File: SharedSparkSessionSuite.scala From ecosystem with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.SharedSparkSession
import org.junit.{After, Before}
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike}

trait BaseSuite extends WordSpecLike with Matchers with BeforeAndAfterAll

class SharedSparkSessionSuite extends SharedSparkSession with BaseSuite {

  val TF_SANDBOX_DIR = "tf-sandbox"
  val file = new File(TF_SANDBOX_DIR)

  @Before
  override def beforeAll() = {
    super.setUp()
    FileUtils.deleteQuietly(file)
    file.mkdirs()
  }

  @After
  override def afterAll() = {
    FileUtils.deleteQuietly(file)
    super.tearDown()
  }
}
Example 20
Source File: LocalWriteSuite.scala From ecosystem with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import java.nio.file.Files
import java.nio.file.Paths

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types._
import org.apache.commons.io.FileUtils

class LocalWriteSuite extends SharedSparkSessionSuite {

  val testRows: Array[Row] = Array(
    new GenericRow(Array[Any](11, 1, 23L, 10.0F, 14.0, List(1.0, 3.0), "r1")),
    new GenericRow(Array[Any](21, 2, 24L, 12.0F, 15.0, List(2.0, 3.0), "r2")),
    new GenericRow(Array[Any](31, 3, 25L, 14.0F, 16.0, List(3.0, 3.0), "r3")))

  val schema = StructType(List(StructField("id", IntegerType),
    StructField("IntegerTypeLabel", IntegerType),
    StructField("LongTypeLabel", LongType),
    StructField("FloatTypeLabel", FloatType),
    StructField("DoubleTypeLabel", DoubleType),
    StructField("VectorLabel", ArrayType(DoubleType, true)),
    StructField("name", StringType)))

  "Propagate" should {
    "write data locally" in {
      // Create a dataframe with 2 partitions
      val rdd = spark.sparkContext.parallelize(testRows, numSlices = 2)
      val df = spark.createDataFrame(rdd, schema)

      // Write the partitions onto the local hard drive. Since it is going to be the
      // local file system, the partitions will be written in the same directory of the
      // same machine.
      // In a distributed setting though, two different machines would each hold a single
      // partition.
      val localPath = Files.createTempDirectory("spark-connector-propagate").toAbsolutePath.toString
      val savePath = localPath + "/testResult"
      df.write.format("tfrecords")
        .option("recordType", "Example")
        .option("writeLocality", "local")
        .save(savePath)

      // Read again this directory, this time using the Hadoop file readers, it should
      // return the same data.
      // This only works in this test and does not hold in general, because the partitions
      // will be written on the workers. Everything runs locally for tests.
      val df2 = spark.read.format("tfrecords").option("recordType", "Example")
        .load(savePath).sort("id").select("id", "IntegerTypeLabel", "LongTypeLabel",
          "FloatTypeLabel", "DoubleTypeLabel", "VectorLabel", "name") // Correct column order.

      assert(df2.collect().toSeq === testRows.toSeq)
    }
  }
}
Example 21
Source File: JsonIOTest.scala From scio with Apache License 2.0
package com.spotify.scio.extra.json

import java.nio.file.Files

import io.circe.Printer
import com.spotify.scio._
import com.spotify.scio.io.TapSpec
import com.spotify.scio.testing._
import com.spotify.scio.util.ScioUtil
import org.apache.beam.sdk.Pipeline.PipelineExecutionException
import org.apache.commons.io.FileUtils

import scala.jdk.CollectionConverters._
import scala.io.Source

object JsonIOTest {
  case class Record(i: Int, s: String, o: Option[Int])
}

class JsonIOTest extends ScioIOSpec with TapSpec {
  import JsonIOTest._

  private val xs = (1 to 100).map(x => Record(x, x.toString, if (x % 2 == 0) Some(x) else None))

  "JsonIO" should "work" in {
    testTap(xs)(_.saveAsJsonFile(_))(".json")
    testJobTest(xs)(JsonIO(_))(_.jsonFile(_))(_.saveAsJsonFile(_))
  }

  it should "support custom printer" in {
    val dir = tmpDir
    val t = runWithFileFuture {
      _.parallelize(xs)
        .saveAsJsonFile(dir.getPath, printer = Printer.noSpaces.copy(dropNullValues = true))
    }
    verifyTap(t, xs.toSet)
    val result = Files
      .list(dir.toPath)
      .iterator()
      .asScala
      .flatMap(p => Source.fromFile(p.toFile).getLines())
      .toSeq
    val expected = (1 to 100).map { x =>
      s"""{"i":$x,"s":"$x"${if (x % 2 == 0) s""","o":$x""" else ""}}"""
    }
    result should contain theSameElementsAs expected
    FileUtils.deleteDirectory(dir)
  }

  it should "handle invalid JSON" in {
    val badData = Seq(
      """{"i":1, "s":hello}""",
      """{"i":1}""",
      """{"s":"hello"}""",
      """{"i":1, "s":1}""",
      """{"i":"hello", "s":1}"""
    )
    val dir = tmpDir
    runWithFileFuture {
      _.parallelize(badData).saveAsTextFile(dir.getPath)
    }

    val sc = ScioContext()
    sc.jsonFile[Record](ScioUtil.addPartSuffix(dir.getPath))

    a[PipelineExecutionException] should be thrownBy { sc.run() }

    FileUtils.deleteDirectory(dir)
  }
}
Example 22
Source File: ConverterProviderTest.scala From scio with Apache License 2.0
package com.spotify.scio.avro.types

import java.nio.file.Files

import com.spotify.scio._
import com.spotify.scio.avro._
import org.apache.commons.io.FileUtils
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class ConverterProviderTest extends AnyFlatSpec with Matchers {
  import ConverterProviderTest._

  "ConverterProvider" should "#1831: handle Avro map" in {
    val dir = Files.createTempDirectory("avro-")
    val data = Seq(Record(Map("a" -> 1), Some(Map("b" -> 2)), List(Map("c" -> 3))))

    val sc1 = ScioContext()
    sc1.parallelize(data).saveAsTypedAvroFile(dir.toString)
    sc1.run()

    val sc2 = ScioContext()
    val t = sc2.typedAvroFile[Record](s"$dir/*.avro").materialize
    sc2.run()

    t.underlying.value.toSeq should contain theSameElementsAs data

    FileUtils.deleteDirectory(dir.toFile)
  }
}

object ConverterProviderTest {
  @AvroType.toSchema
  case class Record(a: Map[String, Int], b: Option[Map[String, Int]], c: List[Map[String, Int]])
}
Example 23
Source File: TFTapTest.scala From scio with Apache License 2.0
package com.spotify.scio.tensorflow

import java.util.UUID

import com.spotify.scio.io.TapSpec
import org.apache.commons.io.FileUtils

class TFTapTest extends TapSpec {

  "SCollection" should "support saveAsTFRecordFile" in {
    val data = Seq.fill(100)(UUID.randomUUID().toString)
    import org.apache.beam.sdk.io.{Compression => CType}
    for (compressionType <- Seq(CType.UNCOMPRESSED, CType.DEFLATE, CType.GZIP)) {
      val dir = tmpDir
      val t = runWithFileFuture {
        _.parallelize(data)
          .map(_.getBytes)
          .saveAsTfRecordFile(dir.getPath, compression = compressionType)
      }
      verifyTap(t.map(new String(_)), data.toSet)
      FileUtils.deleteDirectory(dir)
    }
  }
}
Example 24
Source File: UDFBuilder.scala From sope with Apache License 2.0
package com.sope.etl.register

import java.io.File
import java.net.URLClassLoader

import com.sope.etl.getObjectInstance
import com.sope.etl.transform.exception.YamlDataTransformException
import com.sope.etl.utils.JarUtils
import com.sope.utils.Logging
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.expressions.UserDefinedFunction

import scala.tools.nsc.Settings
import scala.tools.nsc.interpreter.IMain

object UDFBuilder extends Logging {

  val DefaultClassLocation = "/tmp/sope/dynamic/"
  val DefaultJarLocation = "/tmp/sope/sope-dynamic-udf.jar"

  // Note: the evalUDF helper referenced below is elided in this excerpt.
  def buildDynamicUDFs(udfCodeMap: Map[String, String]): Map[String, UserDefinedFunction] = {
    val file = new java.io.File(UDFBuilder.DefaultClassLocation)
    FileUtils.deleteDirectory(file)
    file.mkdirs()
    val udfMap = evalUDF(udfCodeMap)
    JarUtils.buildJar(DefaultClassLocation, DefaultJarLocation)
    udfMap
  }
}
Example 25
Source File: BruteForceSequenceMatcher.scala From sonar-scala with GNU Lesser General Public License v3.0
package com.buransky.plugins.scoverage.pathcleaner

import java.io.File

import org.apache.commons.io.FileUtils
import BruteForceSequenceMatcher._
import com.buransky.plugins.scoverage.util.PathUtil

import scala.collection.JavaConversions._
import org.sonar.api.utils.log.Loggers

object BruteForceSequenceMatcher {
  val extensions = Array[String]("java", "scala")
  type PathSeq = Seq[String]
}

class BruteForceSequenceMatcher(baseDir: File, sourcePath: String) extends PathSanitizer {

  private val sourceDir = initSourceDir()
  require(sourceDir.isAbsolute)
  require(sourceDir.isDirectory)

  private val log = Loggers.get(classOf[BruteForceSequenceMatcher])
  private val sourcePathLength = PathUtil.splitPath(sourceDir.getAbsolutePath).size
  private val filesMap = initFilesMap()

  def getSourceRelativePath(reportPath: PathSeq): Option[PathSeq] = {
    // match with file system map of files
    val relPathOption = for {
      absPathCandidates <- filesMap.get(reportPath.last)
      path <- absPathCandidates.find(absPath => absPath.endsWith(reportPath))
    } yield path.drop(sourcePathLength)

    relPathOption
  }

  // mock able helpers that allow us to remove the dependency to the real file system during tests
  private[pathcleaner] def initSourceDir(): File = {
    sourcePath.split(",").headOption.map { first =>
      val firstFile = new File(first)
      if (firstFile.isAbsolute) {
        firstFile
      } else {
        val sourceDir = new File(baseDir, first)
        sourceDir
      }
    }.orNull
  }

  private[pathcleaner] def initFilesMap(): Map[String, Seq[PathSeq]] = {
    val srcFiles = FileUtils.iterateFiles(sourceDir, extensions, true)
    val paths = srcFiles.map(file => PathUtil.splitPath(file.getAbsolutePath)).toSeq

    // group them by filename, in case multiple files have the same name
    paths.groupBy(path => path.last)
  }
}
Example 26
Source File: GeneratorTest.scala From courier with Apache License 2.0
package org.coursera.courier.generator

import java.io.File
import java.io.IOException

import com.linkedin.data.DataList
import com.linkedin.data.DataMap
import com.linkedin.data.codec.JacksonDataCodec
import com.linkedin.data.template.DataTemplate
import com.linkedin.data.template.JacksonDataTemplateCodec
import com.linkedin.data.template.PrettyPrinterJacksonDataTemplateCodec
import org.apache.commons.io.FileUtils
import org.scalatest.junit.AssertionsForJUnit
import org.scalatest.junit.JUnitSuite

abstract class GeneratorTest extends JUnitSuite with AssertionsForJUnit {

  def printJson(dataTemplate: DataTemplate[DataMap]): Unit = printJson(dataTemplate.data)

  def printJson(dataMap: DataMap): Unit = println(mapToJson(dataMap))

  def assertJson(left: DataTemplate[DataMap], right: String): Unit = {
    val leftMap = readJsonToMap(mapToJson(left.data()))
    val rightMap = readJsonToMap(right)
    assert(leftMap === rightMap)
  }

  def roundTrip(complex: DataMap): DataMap = {
    readJsonToMap(mapToJson(complex))
  }

  def roundTrip(complex: DataList): DataList = {
    readJsonToList(listToJson(complex))
  }

  private val jsonPath = new File(
    System.getProperty("referencesuite.srcdir") +
      File.separator + "main" + File.separator + "json")

  protected def load(filename: String): String = {
    FileUtils.readFileToString(new File(jsonPath, filename))
  }

  private val prettyPrinter = new PrettyPrinterJacksonDataTemplateCodec
  private val codec = new JacksonDataTemplateCodec
  private val dataCodec = new JacksonDataCodec

  private def mapToJson(dataTemplate: DataTemplate[DataMap]): String = mapToJson(dataTemplate.data)

  private def listToJson(dataTemplate: DataTemplate[DataList]): String = {
    listToJson(dataTemplate.data)
  }

  private def mapToJson(dataMap: DataMap): String = prettyPrinter.mapToString(dataMap)

  private def listToJson(dataList: DataList): String = prettyPrinter.listToString(dataList)

  private def readJsonToMap(string: String): DataMap = dataCodec.stringToMap(string)

  private def readJsonToList(string: String): DataList = dataCodec.stringToList(string)
}
Example 27
Source File: BillerCache.scala From apple-of-my-iap with MIT License
package com.meetup.iap

import com.meetup.iap.receipt.Subscription
import org.slf4j.LoggerFactory

import java.io.File
import scala.io.Source

import org.json4s.DefaultFormats
import org.json4s.native.Serialization.{read, writePretty}
import org.apache.commons.io.FileUtils

object BillerCache {

  val log = LoggerFactory.getLogger(BillerCache.getClass)

  implicit val formats = DefaultFormats

  private val ProjectName = "iap-service"
  private val inProject = new File(".").getCanonicalPath.endsWith(ProjectName)

  private val Folder = {
    val base = if (inProject) "" else "iap-service/"
    new File(s"${base}tmp/")
  }
  if (!Folder.exists) {
    Folder.mkdirs
  }

  private val TempFile = new File(Folder, "subscriptions.json")
  if (!TempFile.exists) {
    TempFile.createNewFile
  }

  private val PlansFile = new File(Folder, "plans.json")
  if (!PlansFile.exists) {
    PlansFile.createNewFile
  }

  def readFromCache(): Map[String, Subscription] = {
    log.info("Reading from file: " + TempFile.getAbsolutePath)
    val raw = Source.fromFile(TempFile).mkString.trim

    if (raw.nonEmpty) {
      Map(read[Map[String, Subscription]](raw).toSeq: _*)
    } else Map.empty
  }

  def writeToCache(subs: Map[String, Subscription]) {
    val json = writePretty(subs)
    FileUtils.writeStringToFile(TempFile, json, "UTF-8")
  }

  def readPlansFromFile(): List[Plan] = {
    log.info(s"Reading from plans file: ${PlansFile.getAbsolutePath}")
    val raw = Source.fromFile(PlansFile).mkString.trim

    if (raw.nonEmpty) {
      log.info("Found some plans")
      List(read[List[Plan]](raw).toSeq: _*)
    } else List.empty
  }
}
Example 28
Source File: TransformerSerialization.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers

import java.nio.file.{Files, Path}

import org.apache.commons.io.FileUtils
import org.scalatest.{BeforeAndAfter, Suite}

import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.{DeeplangIntegTestSupport, ExecutionContext}

trait TransformerSerialization extends Suite with BeforeAndAfter {

  var tempDir: Path = _

  before {
    tempDir = Files.createTempDirectory("writeReadTransformer")
  }

  after {
    FileUtils.deleteDirectory(tempDir.toFile)
  }
}

object TransformerSerialization {

  implicit class TransformerSerializationOps(private val transformer: Transformer) {

    def applyTransformationAndSerialization(path: Path, df: DataFrame)(
        implicit executionContext: ExecutionContext): DataFrame = {
      val result = transformer._transform(executionContext, df)
      val deserialized = loadSerializedTransformer(path)
      val resultFromSerializedTransformer = deserialized._transform(executionContext, df)
      DeeplangIntegTestSupport.assertDataFramesEqual(result, resultFromSerializedTransformer)
      result
    }

    def loadSerializedTransformer(path: Path)(
        implicit executionContext: ExecutionContext): Transformer = {
      val outputPath: Path = path.resolve(this.getClass.getName)
      transformer.save(executionContext, outputPath.toString)
      Transformer.load(executionContext, outputPath.toString)
    }
  }
}
Example 29
Source File: ParquetIOTest.scala From ratatool with Apache License 2.0
package com.spotify.ratatool.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File}
import java.nio.file.Files

import com.spotify.ratatool.Schemas
import com.spotify.ratatool.avro.specific.TestRecord
import com.spotify.ratatool.scalacheck._
import org.apache.commons.io.FileUtils
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class ParquetIOTest extends AnyFlatSpec with Matchers {

  private val genericSchema = Schemas.avroSchema
  private val genericGen = genericRecordOf(genericSchema)
  private val genericData = (1 to 100).flatMap(_ => genericGen.sample)

  private val specificSchema = TestRecord.getClassSchema
  private val specificGen = specificRecordOf[TestRecord]
  private val specificData = (1 to 100).flatMap(_ => specificGen.sample)

  "ParquetIO" should "work with generic record and stream" in {
    val out = new ByteArrayOutputStream()
    ParquetIO.writeToOutputStream(genericData, genericSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = ParquetIO.readFromInputStream(in).toList
    result should equal (genericData)
  }

  it should "work with generic record and file" in {
    val dir = Files.createTempDirectory("ratatool-")
    val file = new File(dir.toString, "temp.parquet")
    ParquetIO.writeToFile(genericData, genericSchema, file)
    val result = ParquetIO.readFromFile(file).toList
    result should equal (genericData)
    FileUtils.deleteDirectory(dir.toFile)
  }

  it should "work with specific record and stream" in {
    val out = new ByteArrayOutputStream()
    ParquetIO.writeToOutputStream(specificData, specificSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = ParquetIO.readFromInputStream[TestRecord](in).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
  }

  it should "work with specific record and file" in {
    val dir = Files.createTempDirectory("ratatool-")
    val file = new File(dir.toString, "temp.parquet")
    ParquetIO.writeToFile(specificData, specificSchema, file)
    val result = ParquetIO.readFromFile[TestRecord](file).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
    FileUtils.deleteDirectory(dir.toFile)
  }
}
Example 30
Source File: ModifyFilesRuleTest.scala From RTran with Apache License 2.0
package com.ebay.rtran.generic

import java.io.File

import org.apache.commons.io.FileUtils
import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers}

import scala.io.Source

class ModifyFilesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach {

  val projectRoot = new File(getClass.getClassLoader.getResource("someproject").getFile)
  val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak")

  override def beforeEach = {
    FileUtils.deleteQuietly(destProjectRoot)
    FileUtils.copyDirectory(projectRoot, destProjectRoot)
  }

  "ModifyFilesRule" should "modify the file correctly" in {
    val ruleConfig = ModifyFilesRuleConfig(
      "**/fileA.txt",
      None,
      List(
        ContentMapping("hello\\s(.+)\\n", "hallo $1\n"),
        ContentMapping("(.+)\\sBob", "$1 Alice")
      )
    )
    val projectCtx = new GenericProjectCtx(destProjectRoot)
    val provider = new AllFilesModelProvider
    val model = provider create projectCtx
    val rule = new ModifyFilesRule(ruleConfig)
    val result = rule transform model
    val file = result.files.find(_.getName == "fileA.txt")
    file.nonEmpty should be (true)
    Source.fromFile(file.get).getLines.toList should be (List("hallo world", "hi Alice"))
  }
}
Example 31
Source File: MoveFilesRuleTest.scala From RTran with Apache License 2.0
package com.ebay.rtran.generic

import java.io.File

import org.apache.commons.io.FileUtils
import org.json4s.jackson.JsonMethods._
import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers}

class MoveFilesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach {

  val projectRoot = new File(getClass.getClassLoader.getResource("someproject").getFile)
  val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak")

  override def beforeEach = {
    FileUtils.deleteQuietly(destProjectRoot)
    FileUtils.copyDirectory(projectRoot, destProjectRoot)
  }

  "MoveFilesRule" should "move file to the dest directory" in {
    // Note: this test is garbled in the original excerpt - the JSON literal passed to
    // asJsonNode(parse(...)) is truncated, and only the tail of the ruleConfig construction
    // (involving Move("**.txt", "otherdirectory/dest") and Move("*.txt", "otherdirectory")) survives.
    val ruleConfigJson = asJsonNode(parse(
      """
        |{
        |  "moves":[
        |    {
        |      "pathPattern":"**.txt",
        "otherdirectory/dest"),
        Move("*.txt", "otherdirectory")
      )
    )
    val projectCtx = new GenericProjectCtx(destProjectRoot)
    val provider = new AllFilesModelProvider
    val model = provider create projectCtx
    val rule = new MoveFilesRule(ruleConfig)
    val result = rule transform model
    result.files forall (_.exists) should be (true)
  }
}
Example 32
Source File: ModifyXMLFilesRuleTest.scala From RTran with Apache License 2.0
package com.ebay.rtran.generic

import java.io.File

import org.apache.commons.io.FileUtils
import com.ebay.rtran.xml._
import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers}

import scala.io.Source
import scala.language.postfixOps

class ModifyXMLFilesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach {

  val projectRoot = new File(getClass.getClassLoader.getResource("someproject").getFile)
  val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak")

  override def beforeEach = {
    FileUtils.deleteQuietly(destProjectRoot)
    FileUtils.copyDirectory(projectRoot, destProjectRoot)
  }

  "ModifyXMLFilesRuleTest" should "able to delete nodes" in {
    // Note: the original excerpt declares `val provider = new XMLFilesModelProvider` twice in this
    // test (likely residue of a collapsed section); the duplicate declaration has been dropped here.
    val ruleConfig = ModifyXMLFilesRuleConfig(
      Some("***.xml"),
      List(
        ModifyXMLOperation(
          "//person[@name=\'Bob\']/job",
          OperationType.Replace,
          Some("<job>Software Engineer</job>")
        )
      )
    )
    val provider = new XMLFilesModelProvider
    val rule = new ModifyXMLFilesRule(ruleConfig)
    val transformedModel = rule.transform(provider.create(new GenericProjectCtx(destProjectRoot)))
    provider save transformedModel
    val transformedContent = Source.fromFile(new File(destProjectRoot, "somedirectory/someXML.xml")).getLines.mkString("\n")
    transformedContent should include ("Bob")
    transformedContent should include ("Software Engineer")
    transformedContent should not include "Salesman"
  }
}
Example 33
Source File: RuleEngineTest.scala From RTran with Apache License 2.0
package com.ebay.rtran.core

import java.io.File

import org.apache.commons.io.FileUtils
import org.json4s.jackson.JsonMethods._
import com.ebay.rtran.core.mock.{MyModifyFileRule, MyProject, MyRenameFileRule, MyRenameFileRuleConfig}
import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers}

import scala.io.Source
import scala.collection.JavaConversions._

class RuleEngineTest extends FlatSpecLike with Matchers with BeforeAndAfterEach {

  val projectDir = new File(getClass.getClassLoader.getResource("myproject").getFile)
  val backupDir = new File(projectDir.getParentFile, projectDir.getName + "-bak")

  override def beforeEach = {
    FileUtils.copyDirectory(projectDir, backupDir)
  }

  override def afterEach = {
    FileUtils.deleteQuietly(backupDir)
  }

  "RuleEngine" should "execute rules from UpgradeConfiguration" in {
    val engine = new RuleEngine
    val projectRoot = backupDir
    val configuration = JsonUpgradeConfiguration(
      List(
        JsonRuleConfiguration("ModifyFileRule", None),
        JsonRuleConfiguration("RenameFileRule", Some(parse("""{"newName":"anotherfile"}""")))
      ))
    engine.execute(new MyProject(projectRoot), configuration)
    new File(projectRoot, "somefile").exists should be (false)
    new File(projectRoot, "anotherfile").exists should be (true)
    Source.fromFile(new File(projectRoot, "anotherfile")).getLines.toList should be (List("hi world", "hi Bob"))
  }

  "RuleEngine" should "execute rules from code" in {
    val engine = new RuleEngine
    val projectRoot = backupDir
    engine.execute(
      new MyProject(projectRoot),
      List(
        new MyModifyFileRule(),
        new MyRenameFileRule(MyRenameFileRuleConfig("anotherfile"))
      )
    )
    new File(projectRoot, "somefile").exists should be (false)
    new File(projectRoot, "anotherfile").exists should be (true)
    Source.fromFile(new File(projectRoot, "anotherfile")).getLines.toList should be (List("hi world", "hi Bob"))
  }
}
Example 34
Source File: CliExec.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.cli // scalastyle:off // TODO(vlad): make sure that a simple intellij run fills in the resources // @see https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala#L54 // scalastyle:on import java.io.File import com.salesforce.op.cli.gen.Ops import org.apache.commons.io.FileUtils class CliExec { protected val DEBUG = false private[cli] def delete(dir: File): Unit = { FileUtils.deleteDirectory(dir) if (dir.exists()) { throw new IllegalStateException(s"Directory '${dir.getAbsolutePath}' still exists") } } def main(args: Array[String]): Unit = try { val ops = for { arguments <- CommandParser.parse(args, CliParameters()) if arguments.command == "gen" settings <- arguments.values } yield Ops(settings) ops getOrElse { CommandParser.showUsage() quit("wrong arguments", 1) } val outcome = ops.map (_.run()) outcome getOrElse quit("Generation failed; see error messages", 1) } catch { case x: Exception => if (DEBUG) x.printStackTrace() val msg = Option(x.getMessage).getOrElse(x.getStackTrace.mkString("", "\n", "\n")) quit(msg) } def quit(errorMsg: String, code: Int = -1): Nothing = { System.err.println(errorMsg) sys.exit(code) } } object CLI { def main(args: Array[String]): Unit = (new CliExec).main(args) }
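The delete helper in CliExec pairs FileUtils.deleteDirectory with an explicit existence check, because the directory can still be present afterwards if another process recreates it or holds a lock on it. A small sketch of that delete-and-verify idiom; the object and method names here are ours, not part of TransmogrifAI:

import java.io.File
import org.apache.commons.io.FileUtils

object SafeDelete {
  // Deletes a directory recursively and fails loudly if it is still present afterwards.
  def deleteOrFail(dir: File): Unit = {
    FileUtils.deleteDirectory(dir) // throws IOException if deletion fails outright
    if (dir.exists()) {
      throw new IllegalStateException(s"Directory '${dir.getAbsolutePath}' still exists")
    }
  }
}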
Example 35
Source File: LogFile.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.operation import java.io.{BufferedReader, File, FileInputStream, FileNotFoundException, FileOutputStream, InputStreamReader, IOException, PrintStream} import java.util.ArrayList import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.hadoop.io.IOUtils import org.apache.kyuubi.Logging import org.apache.spark.sql.Row import yaooqinn.kyuubi.KyuubiSQLException class LogFile private ( file: File, private var reader: Option[BufferedReader], writer: PrintStream, @volatile private var isRemoved: Boolean = false) extends Logging { def this(file: File) = { this(file, LogFile.createReader(file, isRemoved = false), new PrintStream(new FileOutputStream(file))) } private def resetReader(): Unit = { reader.foreach(IOUtils.closeStream) reader = None } private def readResults(nLines: Long): Seq[Row] = { reader = reader.orElse(LogFile.createReader(file, isRemoved)) val logs = new ArrayList[Row]() reader.foreach { r => var i = 1 try { var line: String = r.readLine() while ((i < nLines || nLines <= 0) && line != null) { logs.add(Row(line)) line = r.readLine() i += 1 } } catch { case e: FileNotFoundException => val operationHandle = file.getName val path = file.getAbsolutePath val msg = if (isRemoved) { s"Operation[$operationHandle] has been closed and the log file $path has been removed" } else { s"Operation[$operationHandle] Log file $path is not found" } throw new KyuubiSQLException(msg, e) } } logs.asScala } def write(msg: String): Unit = { writer.print(msg) } def close(): Unit = synchronized { try { reader.foreach(_.close()) writer.close() if (!isRemoved) { FileUtils.forceDelete(file) isRemoved = true } } catch { case e: IOException => error(s"Failed to remove corresponding log file of operation: ${file.getName}", e) } } } object LogFile { def createReader(file: File, isRemoved: Boolean): Option[BufferedReader] = try { Option(new BufferedReader(new InputStreamReader(new FileInputStream(file)))) } catch { case e: FileNotFoundException => val operationHandle = file.getName val path = file.getAbsolutePath val msg = if (isRemoved) { s"Operation[$operationHandle] has been closed and the log file $path has been removed" } else { s"Operation[$operationHandle] Log file $path is not found" } throw new KyuubiSQLException(msg, e) } }
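close() above removes the log file with FileUtils.forceDelete, which signals failure by throwing an IOException (a FileNotFoundException if the file is already gone) rather than returning a boolean, so the caller can log the problem. A quick sketch contrasting it with deleteQuietly; the file path is only illustrative:

import java.io.{File, IOException}
import org.apache.commons.io.FileUtils

object DeleteStyles {
  def main(args: Array[String]): Unit = {
    val f = new File("/tmp/operation-123.log") // hypothetical operation log file
    // Quiet variant: never throws, reports success as a boolean.
    val removed: Boolean = FileUtils.deleteQuietly(f)
    println(s"deleteQuietly removed the file: $removed")
    // Forceful variant: throws if the file cannot be deleted or does not exist.
    try FileUtils.forceDelete(f)
    catch { case e: IOException => println(s"forceDelete failed: ${e.getMessage}") }
  }
}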
Example 36
Source File: DefaultSourceSpec.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine import java.nio.file.{Path, Paths} import java.util.UUID import org.apache.commons.io.FileUtils import org.eclipse.jgit.api.Git class DefaultSourceSpec extends BaseSourceSpec("DefaultSource") { var tmpPath: Path = Paths.get(System.getProperty("java.io.tmpdir"), UUID.randomUUID.toString) override protected def beforeAll(): Unit = { super.beforeAll() tmpPath.toFile.mkdir() } "DefaultSource" should "not optimize if the conditions on the " + "join are not the expected ones" in { val repos = engine.getRepositories val references = ss.read.format("tech.sourced.engine").option("table", "references").load() val out = repos.join(references, (references("repository_id") === repos("id")) .and(references("name").startsWith("refs/pull")) ).count() val df = references.limit(1).getCommits df.count() should be(1) } it should "return the remote branches renamed to refs/heads" in { val repoDir = tmpPath.resolve("repo") Git.cloneRepository() .setURI("https://github.com/src-d/jgit-spark-connector.git") .setDirectory(repoDir.toFile) .call() val engine = Engine(ss, tmpPath.toString, "standard") val masters = engine.getRepositories .getMaster .collect() .sortBy(_.getAs[String]("repository_id")) masters.length should be(2) masters(0).getAs[String]("repository_id") should startWith("file") masters(0).getAs[Boolean]("is_remote") should be(false) masters(1).getAs[String]("repository_id") should startWith("github") masters(1).getAs[Boolean]("is_remote") should be(true) engine.getRepositories.getRemoteReferences.getMaster.count() should be(1) } it should "match HEAD and not just refs/heads/HEAD" in { val repoDir = tmpPath.resolve("repo") import tech.sourced.engine.util.RepoUtils._ val repo = createRepo(repoDir) commitFile(repo, "foo", "bar", "baz") Engine(ss, tmpPath.toString, "standard").getRepositories.getHEAD.count() should be(1) } it should "traverse all commits if it's not chained" in { val row = engine.session.sql("SELECT COUNT(*) FROM commits").first() row(0) should be(4444) val row2 = engine.session.sql("SELECT COUNT(*) FROM commits WHERE index > 0").first() row2(0) should be(4390) } override protected def afterAll(): Unit = { super.afterAll() FileUtils.deleteQuietly(tmpPath.toFile) } }
Example 37
Source File: RepositoryRDDProviderSpec.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine.provider import java.nio.file.{Path, Paths} import java.util.UUID import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import tech.sourced.engine.util.RepoUtils import tech.sourced.engine.{BaseSivaSpec, BaseSparkSpec} class RepositoryRDDProviderSpec extends FlatSpec with Matchers with BeforeAndAfterEach with BaseSparkSpec with BaseSivaSpec { private var provider: RepositoryRDDProvider = _ private var tmpPath: Path = _ override def beforeEach(): Unit = { super.beforeEach() provider = RepositoryRDDProvider(ss.sparkContext) tmpPath = Paths.get( System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString ) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteQuietly(tmpPath.toFile) } "RepositoryRDDProvider" should "retrieve bucketized raw repositories" in { tmpPath.resolve("a").toFile.mkdir() createRepo(tmpPath.resolve("a").resolve("repo")) tmpPath.resolve("b").toFile.mkdir() createRepo(tmpPath.resolve("b").resolve("repo")) createRepo(tmpPath.resolve("repo")) val repos = provider.get(tmpPath.toString, "standard").collect() repos.length should be(3) } it should "retrieve non-bucketized raw repositories" in { tmpPath.resolve("a").toFile.mkdir() createRepo(tmpPath.resolve("repo")) tmpPath.resolve("b").toFile.mkdir() createRepo(tmpPath.resolve("repo2")) val repos = provider.get(tmpPath.toString, "standard").collect() repos.length should be(2) } it should "retrieve bucketized siva repositories" in { val repos = provider.get(resourcePath, "siva").collect() repos.length should be(3) } it should "retrieve non-bucketized siva repositories" in { val repos = provider.get(Paths.get(resourcePath, "ff").toString, "siva").collect() repos.length should be(1) } private def createRepo(path: Path) = { val repo = RepoUtils.createRepo(path) RepoUtils.commitFile(repo, "file.txt", "something something", "some commit") } }
Example 38
Source File: RepoUtils.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine.util import java.nio.file.{Path, Paths} import org.apache.commons.io.FileUtils import org.eclipse.jgit.api.CreateBranchCommand.SetupUpstreamMode import org.eclipse.jgit.api.Git import org.eclipse.jgit.revwalk.RevCommit import org.eclipse.jgit.transport.URIish object RepoUtils { def createBareRepo(path: Path): Git = { Git.init().setBare(true).setDirectory(path.toFile).call() } def createRepo(path: Path): Git = { Git.init().setDirectory(path.toFile).call() } def addRemote(repo: Git, name: String, url: String): Unit = { val cmd = repo.remoteAdd() cmd.setName(name) cmd.setUri(new URIish(url)) cmd.call() } def commitFile(repo: Git, name: String, content: String, msg: String): RevCommit = { val file = Paths.get(repo.getRepository.getDirectory.getParent, name) FileUtils.write(file.toFile, content) repo.add().addFilepattern(name).call() repo.commit().setMessage(msg).call() } }
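commitFile writes the file with FileUtils.write(file, content), the charset-less overload that is deprecated in recent commons-io releases because it falls back to the platform default encoding. A sketch of the same call with the charset made explicit; the target path is illustrative:

import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.commons.io.FileUtils

object WriteWithCharset {
  def main(args: Array[String]): Unit = {
    val file = new File("/tmp/repo-utils-demo.txt") // hypothetical target file
    // Creates missing parent directories and overwrites any existing content.
    FileUtils.write(file, "some content", StandardCharsets.UTF_8)
    // Appending instead of overwriting is a boolean flag on the same overload.
    FileUtils.write(file, "\nmore content", StandardCharsets.UTF_8, true)
  }
}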
Example 39
Source File: MetadataIteratorSpec.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine.iterator import java.nio.file.Paths import java.util.{Properties, UUID} import org.apache.commons.io.FileUtils import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.{Metadata, StringType, StructType} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} import tech.sourced.engine.{BaseSparkSpec, Schema} class JDBCQueryIteratorSpec extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec { private val tmpPath = Paths.get( System.getProperty("java.io.tmpdir"), UUID.randomUUID.toString ) private val dbPath = tmpPath.resolve("test.db") override def beforeAll(): Unit = { super.beforeAll() tmpPath.toFile.mkdir() val rdd = ss.sparkContext.parallelize(Seq( Row("id1"), Row("id2"), Row("id3") )) val properties = new Properties() properties.put("driver", "org.sqlite.JDBC") val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head))) df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties) } override def afterAll(): Unit = { super.afterAll() FileUtils.deleteQuietly(tmpPath.toFile) } "JDBCQueryIterator" should "return all rows for the query" in { val iter = new JDBCQueryIterator( Seq(attr("id")), dbPath.toString, "SELECT id FROM repositories ORDER BY id" ) // calling hasNext more than one time does not cause rows to be lost iter.hasNext iter.hasNext val rows = (for (row <- iter) yield row).toArray rows.length should be(3) rows(0).length should be(1) rows(0)(0).toString should be("id1") rows(1)(0).toString should be("id2") rows(2)(0).toString should be("id3") } private def attr(name: String): Attribute = AttributeReference( name, StringType, nullable = false, Metadata.empty )() }
Example 40
Source File: BloopSpec.scala From seed with Apache License 2.0 | 5 votes |
package seed.generation import java.nio.file.{Files, Path} import bloop.config.ConfigEncoderDecoders import minitest.SimpleTestSuite import org.apache.commons.io.FileUtils import seed.generation.util.BuildUtil.tempPath object BloopSpec extends SimpleTestSuite { def parseBloopFile(path: Path): bloop.config.Config.File = { val json = FileUtils.readFileToString(path.toFile, "UTF-8") io.circe.parser.decode(json)(ConfigEncoderDecoders.allDecoder).right.get } test("Inherit javaDeps in child modules") { val projectPath = tempPath.resolve("inherit-javadeps") Files.createDirectory(projectPath) val bloopPath = projectPath.resolve(".bloop") val build = util.ProjectGeneration.generateJavaDepBloopProject(projectPath) assertEquals(build("example").module.jvm.get.moduleDeps, List("base")) val base = parseBloopFile(bloopPath.resolve("base.json")) assert( base.project.classpath .exists(_.toString.contains("/org/postgresql/postgresql/")) ) val example = parseBloopFile(bloopPath.resolve("example.json")) assert( example.project.classpath .exists(_.toString.contains("/org/postgresql/postgresql/")) ) val exampleTest = parseBloopFile(bloopPath.resolve("example-test.json")) assert( exampleTest.project.classpath .exists(_.toString.contains("/org/postgresql/postgresql/")) ) } }
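parseBloopFile reads the whole JSON file into memory with FileUtils.readFileToString before handing it to circe. A minimal reading sketch; the path, and the assumption that the file is small enough to hold in a single String, are ours:

import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.commons.io.FileUtils

object ReadWholeFile {
  def main(args: Array[String]): Unit = {
    val file = new File("/tmp/example.json") // hypothetical small JSON file
    // Loads the entire file into one String; fine for config-sized files.
    val text: String = FileUtils.readFileToString(file, StandardCharsets.UTF_8)
    println(text.take(200))
    // The line-oriented variant returns a java.util.List[String].
    val lines = FileUtils.readLines(file, StandardCharsets.UTF_8)
    println(s"${lines.size()} lines")
  }
}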
Example 41
Source File: PublishSpec.scala From seed with Apache License 2.0 | 5 votes |
package seed.cli import java.io.File import java.nio.file.{Files, Path} import minitest.SimpleTestSuite import org.apache.commons.io.FileUtils import seed.Log import seed.generation.util.BuildUtil import sys.process._ object PublishSpec extends SimpleTestSuite { def testVersionDetection(path: File): Unit = { Process("git init", path).!! FileUtils.write(new File(path, "test.txt"), "test", "UTF-8") Process("git add test.txt", path).!! Process("git commit . -m import", path).!! Process("git tag 0.1.0", path).!! // no 'v' prefix assertEquals( Publish.getVersion(path.toPath, None, Log.silent), Some("0.1.0") ) FileUtils.write(new File(path, "test2.txt"), "test", "UTF-8") Process("git add test2.txt", path).!! Process("git commit . -m import", path).!! Process("git tag v0.1.1", path).!! // 'v' prefix assertEquals( Publish.getVersion(path.toPath, None, Log.silent), Some("0.1.1") ) } test("Determine version number (relative path)") { val relativePath = new File("temp-git-version") if (Files.exists(relativePath.toPath)) FileUtils.deleteDirectory(relativePath) Files.createDirectories(relativePath.toPath) testVersionDetection(relativePath) FileUtils.deleteDirectory(relativePath) } test("Determine version number (absolute path)") { val relativePath = BuildUtil.tempPath.resolve("git-version") if (Files.exists(relativePath)) FileUtils.deleteDirectory(relativePath.toFile) Files.createDirectories(relativePath) testVersionDetection(relativePath.toFile) } }
Example 42
Source File: WatcherSpec.scala From seed with Apache License 2.0 | 5 votes |
package seed.cli.util import java.nio.file.Files import minitest.SimpleTestSuite import org.apache.commons.io.FileUtils import seed.generation.util.BuildUtil import zio.IO import scala.collection.mutable import scala.concurrent.ExecutionContext.Implicits.global object WatcherSpec extends SimpleTestSuite { testAsync("Detect new file in root path") { val rootPath = BuildUtil.tempPath.resolve("watcher") Files.createDirectories(rootPath) val collected = mutable.ListBuffer[Unit]() var stop = false val watcher = Watcher .watchPaths( List(rootPath), () => { // Only consider Scala/Java source files FileUtils.write(rootPath.resolve("test.html").toFile, "test", "UTF-8") stop = true FileUtils .write(rootPath.resolve("test.scala").toFile, "test", "UTF-8") } ) .foreachWhile { v => IO.effectTotal { collected += v !stop } } RTS.unsafeRunToFuture(watcher).map(_ => assertEquals(collected, List(()))) } testAsync("Detect new file in sub-directory") { val rootPath = BuildUtil.tempPath.resolve("watcher2") val subDirectoryPath = rootPath.resolve("sub") Files.createDirectories(subDirectoryPath) val collected = mutable.ListBuffer[Unit]() var stop = false val watcher = Watcher .watchPaths(List(rootPath), { () => stop = true FileUtils.write(rootPath.resolve("test.scala").toFile, "test", "UTF-8") }) .foreachWhile { v => IO.effectTotal { collected += v !stop } } RTS.unsafeRunToFuture(watcher).map(_ => assertEquals(collected, List(()))) } testAsync("Watch file path") { val rootPath = BuildUtil.tempPath.resolve("watcher3") Files.createDirectories(rootPath) val filePath = rootPath.resolve("test.scala") FileUtils.write(filePath.toFile, "test", "UTF-8") val collected = mutable.ListBuffer[Unit]() var stop = false val watcher = Watcher .watchPaths(List(filePath), { () => stop = true FileUtils.write(filePath.toFile, "test2", "UTF-8") }) .foreachWhile { v => IO.effectTotal { collected += v !stop } } RTS.unsafeRunToFuture(watcher).map(_ => assertEquals(collected, List(()))) } }
Example 43
Source File: XMLFilesModelProviderTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.generic import java.io.File import org.apache.commons.io.FileUtils import com.ebay.rtran.xml.XMLFilesModelProvider import org.scalatest.{FlatSpecLike, Matchers} import scala.io.Source class XMLFilesModelProviderTest extends FlatSpecLike with Matchers { val projectRoot = new File(getClass.getClassLoader.getResource("someproject").getFile) "XMLFilesModeProvider" should "get all xml files in the project" in { val provider = new XMLFilesModelProvider val model = provider.create(new GenericProjectCtx(projectRoot)) model.xmlRoots.size should be (1) } "XMLFilesModeProvider" should "be able to save the files that are marked modified" in { val provider = new XMLFilesModelProvider val model = provider.create(new GenericProjectCtx(projectRoot)) val (file, root) = model.xmlRoots.head val newFile = new File(file.getParentFile, file.getName + ".new") provider.save(model.copy(modified = Map(newFile -> Some(root)))) val content = Source.fromFile(newFile).getLines.filterNot(_.matches("\\s+")).map(_.trim).mkString content should not be "" FileUtils.deleteQuietly(newFile) } }
Example 44
Source File: LogWriterSpec.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.master.logging import java.nio.file.{Files, Paths} import akka.actor.ActorSystem import akka.pattern.ask import akka.testkit.{TestActorRef, TestKit} import akka.util.Timeout import com.typesafe.config.ConfigFactory import io.hydrosphere.mist.core.logging.LogEvent import io.hydrosphere.mist.master.LogStoragePaths import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterAll, FunSpecLike, Matchers} import scala.concurrent.Await import scala.concurrent.duration._ class LogWriterSpec extends TestKit(ActorSystem("log-writer-test", ConfigFactory.load("master"))) with FunSpecLike with Matchers with BeforeAndAfterAll { val dirName = "log_writer_test" val dir = Paths.get(".", "target", dirName) override def beforeAll(): Unit = { Files.createDirectories(dir) } override def afterAll(): Unit = { FileUtils.deleteDirectory(dir.toFile) TestKit.shutdownActorSystem(system) } implicit val timeout = Timeout(5 second) describe("writer actor") { it("should write to file") { val path = dir.resolve("test") val f = path.toFile if (f.exists()) f.delete() Files.createFile(path) val actor = TestActorRef(WriterActor.props(path)) val event = LogEvent.mkDebug("id", "message") val future = actor ? WriteRequest("id", Seq(event)) val update = Await.result(future.mapTo[LogUpdate], Duration.Inf) update.jobId shouldBe "id" update.events shouldBe Seq(event) update.bytesOffset shouldBe (event.mkString + "\n").getBytes.length } } describe("writers group") { it("should proxy to writer") { val mappings = new LogStoragePaths(dir) val expectedPath = mappings.pathFor("id") if (Files.exists(expectedPath)) Files.delete(expectedPath) val actor = TestActorRef(WritersGroupActor.props(mappings)) val event = LogEvent.mkDebug("id", "message") val future = actor ? WriteRequest("id", Seq(event)) val update = Await.result(future.mapTo[LogUpdate], Duration.Inf) val expectedSize = (event.mkString + "\n").getBytes.length update.jobId shouldBe "id" update.events shouldBe Seq(event) update.bytesOffset shouldBe expectedSize Files.readAllBytes(expectedPath).length shouldBe expectedSize } } }
Example 45
Source File: FStorageSpec.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.master.data import java.nio.file.Paths import com.typesafe.config.{Config, ConfigValueFactory} import io.hydrosphere.mist.master.models.NamedConfig import org.apache.commons.io.FileUtils import org.scalatest._ class FStorageSpec extends FunSpec with Matchers with BeforeAndAfter { case class TestEntry( name: String, value: Int ) extends NamedConfig val testEntryConfigRepr = new ConfigRepr[TestEntry] { import scala.collection.JavaConverters._ override def fromConfig(config: Config): TestEntry = { TestEntry(config.getString("name"), config.getInt("value")) } override def toConfig(a: TestEntry): Config = { val map = Map("value" -> ConfigValueFactory.fromAnyRef(a.value)) ConfigValueFactory.fromMap(map.asJava).toConfig } } val path = "./target/file_store_test" before { val f = Paths.get(path).toFile if (f.exists()) FileUtils.deleteDirectory(f) } it("should store files") { val storage = FsStorage.create(path, testEntryConfigRepr) storage.write("one", TestEntry("one", 1)) storage.write("two", TestEntry("two", 2)) storage.entries should contain allOf( TestEntry("one", 1), TestEntry("two", 2) ) storage.delete("one") storage.entries should contain allElementsOf(Seq(TestEntry("two", 2))) } }
Example 46
Source File: FunctionConfigStorageSpec.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.master.data import java.nio.file.Paths import io.hydrosphere.mist.master.models.FunctionConfig import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfter, Matchers, FunSpec} class FunctionConfigStorageSpec extends FunSpec with Matchers with BeforeAndAfter { val path = "./target/data/func_store_test" before { val f = Paths.get(path).toFile if (f.exists()) FileUtils.deleteDirectory(f) } import scala.concurrent.ExecutionContext.Implicits.global import io.hydrosphere.mist.master.TestUtils._ it("should update") { val functions = testStorage() functions.all.await.size shouldBe 1 functions.update(FunctionConfig("second", "path", "className", "foo")).await functions.all.await.size shouldBe 2 } it("should get") { val functions = testStorage() functions.get("first").await.isDefined shouldBe true functions.get("second").await.isDefined shouldBe false functions.update(FunctionConfig("second", "path", "className", "foo")).await functions.get("second").await.isDefined shouldBe true } it("should override defaults") { val functions = testStorage() functions.get("first").await.get.className shouldBe "className" functions.update(FunctionConfig("first", "path", "anotherClassName", "foo")).await functions.get("first").await.get.className shouldBe "anotherClassName" } def testStorage( defaults: Seq[FunctionConfig] = Seq(FunctionConfig("first", "path", "className", "foo"))): FunctionConfigStorage = { new FunctionConfigStorage( FsStorage.create(path, ConfigRepr.EndpointsRepr), defaults ) } }
Example 47
Source File: RunnerSelectorSpec.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker.runners import java.io.File import java.nio.file.Paths import io.hydrosphere.mist.worker.SparkArtifact import io.hydrosphere.mist.worker.runners.python.PythonRunner import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfter, FunSpecLike, Matchers} class RunnerSelectorSpec extends FunSpecLike with Matchers with BeforeAndAfter { val basePath = "./target/runner" val pyFile = SparkArtifact(Paths.get(basePath, "test.py").toFile, "url") val jarFile = SparkArtifact(Paths.get(basePath, "test.jar").toFile, "url") val unknown = SparkArtifact(Paths.get(basePath, "test.unknown").toFile, "url") before { val f = new File(basePath) if (f.exists()) FileUtils.deleteDirectory(f) FileUtils.forceMkdir(f) FileUtils.touch(pyFile.local) FileUtils.touch(jarFile.local) } after { FileUtils.deleteQuietly(pyFile.local) FileUtils.deleteQuietly(jarFile.local) } it("should select runner by extension") { val selector = new SimpleRunnerSelector selector.selectRunner(pyFile) shouldBe a[PythonRunner] selector.selectRunner(jarFile) shouldBe a[ScalaRunner] } it("should throw exception when unknown file type is passed") { val selector = new SimpleRunnerSelector intercept[IllegalArgumentException] { selector.selectRunner(unknown) } } }
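The before block above prepares an empty working directory with FileUtils.forceMkdir and then creates zero-byte marker files with FileUtils.touch. A compact sketch of that scaffolding, with an illustrative scratch path of our own:

import java.io.File
import org.apache.commons.io.FileUtils

object Scaffold {
  def main(args: Array[String]): Unit = {
    val base = new File("./target/runner-demo") // illustrative scratch directory
    if (base.exists()) FileUtils.deleteDirectory(base)
    // Creates the directory and any missing parents; throws if a regular file is in the way.
    FileUtils.forceMkdir(base)
    // touch creates an empty file, or updates the timestamp if it already exists.
    FileUtils.touch(new File(base, "job.py"))
    FileUtils.touch(new File(base, "job.jar"))
    println(base.list().mkString(", "))
  }
}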
Example 48
Source File: ForkedSparkContextSpec.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import java.io.File import breeze.linalg import odkl.analysis.spark.TestEnv import odkl.analysis.spark.util.SQLOperations import org.apache.commons.io.FileUtils import org.scalatest.FlatSpec class ForkedSparkContextSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with WithTestData { "Fork " should " support one layer" in { val directory = new File(FileUtils.getTempDirectory, "forkedSpark") try { val estimator = new ForkedSparkEstimator[LinearRegressionModel,LinearRegressionSGD](new LinearRegressionSGD()) .setTempPath(directory.getAbsolutePath) .setMaster("local[1]") val model = estimator.fit(noInterceptData) val dev: linalg.Vector[Double] = hiddenModel.asBreeze - model.getCoefficients.asBreeze val deviation: Double = dev dot dev deviation should be <= delta model.getIntercept should be(0.0) } finally { FileUtils.deleteDirectory(directory) } } "Fork " should " support two layers" in { val directory = new File(FileUtils.getTempDirectory, "forkedSpark") try { val estimator = new ForkedSparkEstimator[LinearRegressionModel,ForkedSparkEstimator[LinearRegressionModel,LinearRegressionSGD]]( new ForkedSparkEstimator[LinearRegressionModel,LinearRegressionSGD](new LinearRegressionSGD()) .setTempPath(directory.getAbsolutePath) .setMaster("local[1]")) .setTempPath(directory.getAbsolutePath) .setMaster("local[1]") val model = estimator.fit(noInterceptData) val dev: linalg.Vector[Double] = hiddenModel.asBreeze - model.getCoefficients.asBreeze val deviation: Double = dev dot dev deviation should be <= delta model.getIntercept should be(0.0) } finally { FileUtils.deleteDirectory(directory) } } }
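new File(FileUtils.getTempDirectory, "forkedSpark") builds a scratch location under the JVM's java.io.tmpdir without hard-coding the path. A brief sketch of that pattern together with its string-returning sibling; the sub-directory name is only an example:

import java.io.File
import org.apache.commons.io.FileUtils

object TempScratch {
  def main(args: Array[String]): Unit = {
    // Same value as System.getProperty("java.io.tmpdir"), already wrapped in a File.
    val tmp: File = FileUtils.getTempDirectory
    val tmpPath: String = FileUtils.getTempDirectoryPath
    val scratch = new File(tmp, "forked-demo") // hypothetical sub-directory
    FileUtils.forceMkdir(scratch)
    try println(s"working under $tmpPath in ${scratch.getName}")
    finally FileUtils.deleteDirectory(scratch)
  }
}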
Example 49
Source File: BetweennessEdmonds$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.{Graph, VertexRDD} class BetweennessEdmonds$Test(implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Edmonds betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = EdmondsBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } }
Example 50
Source File: BetweennessHua$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.EdmondsBC import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.scalatest.tagobjects.Slow class BetweennessHua$Test (implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Hua betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = HuaBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } "Hua betweenness centrality for random graph" should "take no longer then Edmonds" taggedAs(Slow) in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("computes betwenness centrality") val (_, edmondsTime) = time("Edmonds algorithm for betweenness centrality")(EdmondsBC.computeBC(graph)) val (_, huaTime) = time("Hua algorithm for betweenness centrality")(HuaBC.computeBC(graph)) Then("Hua algorithm should be faster") huaTime should be <= edmondsTime } }
Example 51
Source File: SparkTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators import java.nio.file.{Files, Path} import ml.sparkling.graph.operators.algorithms.aproximation.ApproximatedShortestPathsAlgorithm$Test import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening$Test import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN$Test import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor$Test import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm$Test import ml.sparkling.graph.operators.measures.edge.AdamicAdar$Test import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality$Test, Modularity$Test} import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.BetweennessEdmonds$Test import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.BetweennessHua$Test import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness$Test import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering$Test import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality$Test import ml.sparkling.graph.operators.measures.vertex.hits.Hits$Test import ml.sparkling.graph.operators.measures.{NeighborhoodConnectivity$Test, VertexEmbeddedness$Test} import ml.sparkling.graph.operators.partitioning.{CommunityBasedPartitioning$Test, PSCANBasedPartitioning$Test, PropagationBasedPartitioning$Test} import org.apache.commons.io.FileUtils import org.apache.spark.{SparkConf, SparkContext} import org.scalatest._ class SparkTest extends Spec with BeforeAndAfterAll { val file: Path = Files.createTempDirectory("tmpCheckpoint") override val invokeBeforeAllAndAfterAllEvenIfNoTestsAreExpected=true val master = "local[8]" def appName: String = "operators-tests" implicit val sc: SparkContext = { val conf = new SparkConf() .setMaster(master) .setAppName(appName) val out=new SparkContext(conf) out.setCheckpointDir(file.toString) out } override def afterAll() = { if(!sc.isStopped){ sc.stop() } FileUtils.deleteDirectory(file.toFile) } override def nestedSuites = { Vector( new PSCANBasedPartitioning$Test, new PropagationBasedPartitioning$Test, new ApproximatedShortestPathsAlgorithm$Test, new ShortestPathsAlgorithm$Test, new EigenvectorCentrality$Test, new VertexEmbeddedness$Test, new PSCAN$Test, new Modularity$Test, new CommunityBasedPartitioning$Test, new NeighborhoodConnectivity$Test, new Hits$Test, new LocalClustering$Test, new FreemanCentrality$Test, new AdamicAdar$Test, new BasicLinkPredictor$Test, new Closeness$Test, new BetweennessEdmonds$Test, new BetweennessHua$Test ) } }
Example 52
Source File: SortShuffleSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { super.beforeAll() // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared // before/after a test, it could return the same directory even if this property // is configured. Utils.clearLocalRootDirs() conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) Utils.clearLocalRootDirs() } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
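getAllFiles collects every regular file under the temp directory by passing TrueFileFilter.INSTANCE as both the file filter and the directory (recursion) filter. A small sketch of that recursive listing; the root directory and the suffix filter are illustrative:

import java.io.File
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.{SuffixFileFilter, TrueFileFilter}

object ListRecursively {
  def main(args: Array[String]): Unit = {
    val root = new File("./target") // illustrative root directory
    // Second argument filters files, third controls which sub-directories are descended into.
    val everything = FileUtils.listFiles(root, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala
    println(s"${everything.size} files in total")
    // Narrowing the file filter restricts the result without limiting recursion.
    val onlyIndexFiles = FileUtils.listFiles(root, new SuffixFileFilter(".index"), TrueFileFilter.INSTANCE).asScala
    onlyIndexFiles.foreach(f => println(f.getName))
  }
}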
Example 53
Source File: NsdbMiniCluster.scala From NSDb with Apache License 2.0 | 5 votes |
package io.radicalbit.nsdb.minicluster import java.io.File import java.time.Duration import java.util.UUID import com.typesafe.scalalogging.LazyLogging import org.apache.commons.io.FileUtils trait NsdbMiniCluster extends LazyLogging { protected[this] val instanceId = { UUID.randomUUID } protected[this] val startingHostname = "127.0.0." protected[this] def rootFolder: String protected[this] def nodesNumber: Int protected[this] def passivateAfter: Duration protected[this] def replicationFactor: Int lazy val nodes: Set[NSDbMiniClusterNode] = (for { i <- 0 until nodesNumber } yield new NSDbMiniClusterNode( hostname = s"$startingHostname${i + 1}", storageDir = s"$rootFolder/data$i", passivateAfter = passivateAfter, replicationFactor = replicationFactor )).toSet def start(cleanup: Boolean = false): Unit = { if (cleanup) FileUtils.deleteDirectory(new File(rootFolder)) nodes.foreach(_.start()) } def stop(): Unit = nodes.foreach(n => n.stop()) }
Example 54
Source File: GzipUtils.scala From odinson with Apache License 2.0 | 5 votes |
package ai.lum.odinson.extra import org.apache.commons.io.FileUtils import java.io._ import java.util.zip._ import java.nio.charset.StandardCharsets object GzipUtils { def compress(data: String): Array[Byte] = { val baos = new ByteArrayOutputStream(data.length) val gzip = new GZIPOutputStream(baos) val bytes = data.getBytes(StandardCharsets.UTF_8) gzip.write(bytes) gzip.close() val compressed = baos.toByteArray baos.close() compressed } def uncompress(file: File): String = { val inputStream = FileUtils.openInputStream(file) val res = uncompress(inputStream) inputStream.close() res } def uncompress(compressed: Array[Byte]): String = { uncompress(new ByteArrayInputStream(compressed)) } def uncompress(input: InputStream): String = { val gzip = new GZIPInputStream(input) val br = new BufferedReader(new InputStreamReader(gzip, StandardCharsets.UTF_8)) val sb = new StringBuilder() var line: String = br.readLine() while (line != null) { sb.append(line) line = br.readLine() } br.close() gzip.close() sb.toString() } }
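uncompress(file: File) opens the stream through FileUtils.openInputStream, which checks that the path exists, is a regular file and is readable, and so fails with a clearer IOException than a bare FileInputStream would. A short sketch of that opening step on a hypothetical gzip file:

import java.io.File
import java.nio.charset.StandardCharsets
import java.util.zip.GZIPInputStream
import org.apache.commons.io.{FileUtils, IOUtils}

object OpenGzip {
  def main(args: Array[String]): Unit = {
    val file = new File("/tmp/payload.gz") // hypothetical gzip archive
    // openInputStream validates the path before returning a FileInputStream.
    val in = new GZIPInputStream(FileUtils.openInputStream(file))
    try {
      val text = IOUtils.toString(in, StandardCharsets.UTF_8)
      println(text.take(200))
    } finally in.close()
  }
}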
Example 55
Source File: YarnShuffleIntegrationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registed exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, UTF_8) } } }
Example 56
Source File: SortShuffleSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
Example 57
Source File: YarnShuffleIntegrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 58
Source File: SortShuffleSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { super.beforeAll() conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
Example 59
Source File: ScenarioLoader.scala From mantis with Apache License 2.0 | 5 votes |
package io.iohk.ethereum.ets.common import java.io.File import io.iohk.ethereum.utils.Logger import org.apache.commons.io.FileUtils import scala.collection.JavaConverters._ import scala.io.Source trait ScenarioLoader[T] extends ScenarioParser[T] with Logger { def load(path: String, options: TestOptions, ignoredTestNames: Set[String] = Set.empty): List[ScenarioGroup[T]] = { val testDir = new File(getClass.getClassLoader.getResource(path).toURI) val files = FileUtils.listFiles(testDir, Array("json"), true).asScala.toList files.filterNot(file => ignoredTestNames.contains(file.getName)).flatMap { file => val name = file.getAbsolutePath.drop(testDir.getAbsolutePath.length + 1).dropRight(".json".length) if (!options.isGroupIncluded(name)) None else { log.info(s"Loading test scenarios from: $file") val text = Source.fromFile(file).getLines.mkString val scenarios = parse(text) Some(ScenarioGroup(name, scenarios)) } } } }
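load discovers test fixtures with the extension-based overload FileUtils.listFiles(dir, Array("json"), true), where the extensions are given without the leading dot and the boolean switches recursion on. A minimal sketch of that discovery step, with an illustrative resource directory and extension list:

import java.io.File
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils

object FindJsonFixtures {
  def main(args: Array[String]): Unit = {
    val dir = new File("src/test/resources/scenarios") // illustrative fixture directory
    // `true` means sub-directories are searched as well.
    val jsonFiles = FileUtils.listFiles(dir, Array("json", "jsonl"), true).asScala.toList
    jsonFiles.foreach(f => println(f.getAbsolutePath))
  }
}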
Example 60
Source File: InceptionFetcherTest.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.fetcher.tensorflow import java.io.File import org.apache.commons.io.FileUtils import org.apache.s2graph.core.fetcher.BaseFetcherTest import play.api.libs.json.Json class InceptionFetcherTest extends BaseFetcherTest { val runDownloadModel: Boolean = true val runCleanup: Boolean = true def cleanup(downloadPath: String, dir: String) = { synchronized { FileUtils.deleteQuietly(new File(downloadPath)) FileUtils.deleteDirectory(new File(dir)) } } def downloadModel(dir: String) = { import sys.process._ synchronized { FileUtils.forceMkdir(new File(dir)) val url = "https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip" val wget = s"wget $url" wget ! val unzip = s"unzip inception5h.zip -d $dir" unzip ! } } //TODO: make this test case to run smoothly ignore("test get bytes for image url") { val downloadPath = "inception5h.zip" val modelPath = "inception" try { if (runDownloadModel) downloadModel(modelPath) val serviceName = "s2graph" val columnName = "user" val labelName = "image_net" val options = s""" |{ | "fetcher": { | "className": "org.apache.s2graph.core.fetcher.tensorflow.InceptionFetcher", | "modelPath": "$modelPath" | } |} """.stripMargin val (service, column, label) = initEdgeFetcher(serviceName, columnName, labelName, Option(options)) val srcVertices = Seq( "http://www.gstatic.com/webp/gallery/1.jpg", "http://www.gstatic.com/webp/gallery/2.jpg", "http://www.gstatic.com/webp/gallery/3.jpg" ) val stepResult = queryEdgeFetcher(service, column, label, srcVertices) stepResult.edgeWithScores.groupBy(_.edge.srcVertex).foreach { case (srcVertex, ls) => val url = srcVertex.innerIdVal.toString val scores = ls.map { es => val edge = es.edge val label = edge.tgtVertex.innerIdVal.toString val score = edge.property[Double]("score").value() Json.obj("label" -> label, "score" -> score) } val jsArr = Json.toJson(scores) val json = Json.obj("url" -> url, "scores" -> jsArr) println(Json.prettyPrint(json)) } } finally { if (runCleanup) cleanup(downloadPath, modelPath) } } }
Example 61
Source File: PailDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.pail import java.util import com.backtype.hadoop.pail.{PailFormatFactory, PailSpec, PailStructure} import com.backtype.support.{Utils => PailUtils} import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FlatSpec} import org.scalatest.Matchers._ import scala.collection.JavaConverters._ import scala.util.Random case class User(name: String, age: Int) class UserPailStructure extends PailStructure[User] { override def isValidTarget(dirs: String*): Boolean = true override def getType: Class[_] = classOf[User] override def serialize(user: User): Array[Byte] = PailUtils.serialize(user) override def getTarget(user: User): util.List[String] = List(user.age % 10).map(_.toString).asJava override def deserialize(serialized: Array[Byte]): User = PailUtils.deserialize(serialized).asInstanceOf[User] } class PailDataSourceSpec extends FlatSpec with BeforeAndAfterAll with PailDataSource { private var spark: SparkSession = _ override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("PailDataSource").getOrCreate() } val userPailSpec = new PailSpec(PailFormatFactory.SEQUENCE_FILE, new UserPailStructure) "PailBasedReaderWriter" should "read/write user records from/into pail" in { val output = Files.createTempDir() val users = (1 to 100).map { index => User(s"foo$index", Random.nextInt(40))} spark.sparkContext.parallelize(users) .saveAsPail(output.getAbsolutePath, userPailSpec) val input = output.getAbsolutePath val total = spark.sparkContext.pailFile[User](input) .map(u => u.name) .count() total should be(100) FileUtils.deleteDirectory(output) } }
Example 62
Source File: ParquetAvroDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.parquet import java.io.File import com.google.common.io.Files import com.indix.utils.spark.parquet.avro.ParquetAvroDataSource import org.apache.commons.io.FileUtils import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.SparkSession import org.scalactic.Equality import org.scalatest.Matchers.{be, convertToAnyShouldWrapper, equal} import org.scalatest.{BeforeAndAfterAll, FlatSpec} import java.util.{Arrays => JArrays} case class SampleAvroRecord(a: Int, b: String, c: Seq[String], d: Boolean, e: Double, f: collection.Map[String, String], g: Array[Byte]) class ParquetAvroDataSourceSpec extends FlatSpec with BeforeAndAfterAll with ParquetAvroDataSource { private var spark: SparkSession = _ implicit val sampleAvroRecordEq = new Equality[SampleAvroRecord] { override def areEqual(left: SampleAvroRecord, b: Any): Boolean = b match { case right: SampleAvroRecord => left.a == right.a && left.b == right.b && Equality.default[Seq[String]].areEqual(left.c, right.c) && left.d == right.d && left.e == right.e && Equality.default[collection.Map[String, String]].areEqual(left.f, right.f) && JArrays.equals(left.g, right.g) case _ => false } } override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("ParquetAvroDataSource").getOrCreate() } override protected def afterAll(): Unit = { try { spark.sparkContext.stop() } finally { super.afterAll() } } "AvroBasedParquetDataSource" should "read/write avro records as ParquetData" in { val outputLocation = Files.createTempDir().getAbsolutePath + "/output" val sampleRecords: Seq[SampleAvroRecord] = Seq( SampleAvroRecord(1, "1", List("a1"), true, 1.0d, Map("a1" -> "b1"), "1".getBytes), SampleAvroRecord(2, "2", List("a2"), false, 2.0d, Map("a2" -> "b2"), "2".getBytes), SampleAvroRecord(3, "3", List("a3"), true, 3.0d, Map("a3" -> "b3"), "3".getBytes), SampleAvroRecord(4, "4", List("a4"), true, 4.0d, Map("a4" -> "b4"), "4".getBytes), SampleAvroRecord(5, "5", List("a5"), false, 5.0d, Map("a5" -> "b5"), "5".getBytes) ) val sampleDf = spark.createDataFrame(sampleRecords) sampleDf.rdd.saveAvroInParquet(outputLocation, sampleDf.schema, CompressionCodecName.GZIP) val sparkVal = spark import sparkVal.implicits._ val records: Array[SampleAvroRecord] = spark.read.parquet(outputLocation).as[SampleAvroRecord].collect() records.length should be(5) // We use === to use the custom Equality defined above for comparing Array[Byte] // Ref - https://github.com/scalatest/scalatest/issues/491 records.sortBy(_.a) === sampleRecords.sortBy(_.a) FileUtils.deleteDirectory(new File(outputLocation)) } }
Example 63
Source File: RocksMapTest.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.store import java.io.Serializable import java.nio.file.{Paths, Files} import org.apache.commons.io.FileUtils import org.scalatest.{Matchers, FlatSpec} case class TestObject(a: Int, b: String, c: Array[Int], d: Array[String]) extends Serializable { def equals(other: TestObject): Boolean = { this.a.equals(other.a) && this.b.equals(other.b) && this.c.sameElements(other.c) && this.d.sameElements(other.d) } } case class ComplexTestObject(a: Int, b: TestObject) extends Serializable { def equals(other: ComplexTestObject): Boolean = { this.a.equals(other.a) && this.b.equals(other.b) } } class RocksMapTest extends FlatSpec with Matchers { "RocksMap" should "serialize and deserialize the keys and values" in { val db = new RocksMap("test") val a: Int = 1 val b: String = "hello" val c: Array[Int] = Array(1, 2, 3) val d: Array[String] = Array("a", "b", "c") val serialized_a = db.serialize(a) val serialized_b = db.serialize(b) val serialized_c = db.serialize(c) val serialized_d = db.serialize(d) val serialized_TestObject = db.serialize(TestObject(a, b, c, d)) val serialized_ComplexObject = db.serialize(ComplexTestObject(a, TestObject(a, b, c, d))) db.deserialize[Int](serialized_a) should be(a) db.deserialize[String](serialized_b) should be(b) db.deserialize[Array[Int]](serialized_c) should be(c) db.deserialize[Array[String]](serialized_d) should be(d) db.deserialize[TestObject](serialized_TestObject).equals(TestObject(a, b, c, d)) should be(true) db.deserialize[ComplexTestObject](serialized_ComplexObject).equals(ComplexTestObject(a, TestObject(a, b, c, d))) should be(true) db.drop() db.close() } it should "put and get values" in { val db = new RocksMap("test") db.put(1, 1.0) db.get[Int, Double](1).getOrElse(0) should be(1.0) db.clear() db.drop() db.close() } it should "remove values" in { val db = new RocksMap("test") db.put(1, 1L) db.get[Int, Long](1).getOrElse(0) should be(1L) db.remove(1) db.get[Int, Long](1) should be(None) db.drop() db.close() } it should "clear all the values" in { val db = new RocksMap(name = "test") db.put(1, "hello") db.put(2, "yello") db.get(1) should not be (None) db.get(2) should not be (None) db.clear() db.get(1) should be(None) db.get(2) should be(None) db.drop() db.close() } it should "clear the data files when drop is called" in { val db = new RocksMap(name = "test") Files.exists(Paths.get(db.pathString)) should be (true) db.drop() Files.exists(Paths.get(db.pathString)) should be (false) db.close() } }
Example 64
Source File: TestSolrStreamWriter.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import java.io.File import java.util.UUID import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport} import org.apache.commons.io.FileUtils import org.apache.spark.solr.SparkInternalObjects class TestSolrStreamWriter extends TestSuiteBuilder { test("Stream data into Solr") { val collectionName = "testStreaming-" + UUID.randomUUID().toString SolrCloudUtil.buildCollection(zkHost, collectionName, null, 1, cloudClient, sc) sparkSession.conf.set("spark.sql.streaming.schemaInference", "true") sparkSession.sparkContext.setLogLevel("DEBUG") val offsetsDir = FileUtils.getTempDirectory + "/spark-stream-offsets-" + UUID.randomUUID().toString try { val datasetPath = "src/test/resources/test-data/oneusagov" val streamingJsonDF = sparkSession.readStream.json(datasetPath) val accName = "acc-" + UUID.randomUUID().toString assert(streamingJsonDF.isStreaming) val writeOptions = Map( "collection" -> collectionName, "zkhost" -> zkHost, "checkpointLocation" -> offsetsDir, ConfigurationConstants.GENERATE_UNIQUE_KEY -> "true", ConfigurationConstants.ACCUMULATOR_NAME -> accName) val streamingQuery = streamingJsonDF .drop("_id") .writeStream .outputMode("append") .format("solr") .options(writeOptions) .start() try { logger.info(s"Explain ${streamingQuery.explain()}") streamingQuery.processAllAvailable() logger.info(s"Status ${streamingQuery.status}") SolrSupport.getCachedCloudClient(zkHost).commit(collectionName) assert(SolrQuerySupport.getNumDocsFromSolr(collectionName, zkHost, None) === 13) val acc = SparkInternalObjects.getAccumulatorById(SparkSolrAccumulatorContext.getId(accName).get) assert(acc.isDefined) assert(acc.get.value == 13) } finally { streamingQuery.stop() } } finally { SolrCloudUtil.deleteCollection(collectionName, cluster) FileUtils.deleteDirectory(new File(offsetsDir)) } } }
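The offsets directory in this test is built from FileUtils.getTempDirectory, which returns java.io.tmpdir as a File, and is removed in the finally block with FileUtils.deleteDirectory. A small sketch of that checkpoint-directory lifecycle, with the directory suffix invented for illustration:

import java.io.File
import java.util.UUID

import org.apache.commons.io.FileUtils

object TempCheckpointDirSketch {
  def main(args: Array[String]): Unit = {
    // a unique scratch location under the system temp directory
    val offsetsDir = new File(FileUtils.getTempDirectory, "spark-stream-offsets-" + UUID.randomUUID())
    try {
      FileUtils.forceMkdir(offsetsDir)      // create it (and any parents) up front
      // ... run the streaming query with checkpointLocation = offsetsDir.getAbsolutePath ...
    } finally {
      FileUtils.deleteDirectory(offsetsDir) // always clean up, even on failure
    }
  }
}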
Example 65
Source File: ZookeeperLocalServer.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package it.teamdigitale.miniclusters import java.io.File import java.net.InetSocketAddress import org.apache.commons.io.FileUtils import org.apache.zookeeper.server.{ServerCnxnFactory, ZooKeeperServer} class ZookeeperLocalServer(port: Int) { var zkServer: Option[ServerCnxnFactory] = None def start(): Unit = { if (zkServer.isEmpty) { val dataDirectory = System.getProperty("java.io.tmpdir") val dir = new File(dataDirectory, "zookeeper") println(dir.toString) if (dir.exists()) FileUtils.deleteDirectory(dir) try { val tickTime = 5000 val server = new ZooKeeperServer(dir.getAbsoluteFile, dir.getAbsoluteFile, tickTime) val factory = ServerCnxnFactory.createFactory factory.configure(new InetSocketAddress("0.0.0.0", port), 1024) factory.startup(server) println("ZOOKEEPER server up!!") zkServer = Some(factory) } catch { case ex: Exception => System.err.println(s"Error in zookeeper server: ${ex.printStackTrace()}") } finally { dir.deleteOnExit() } } else println("ZOOKEEPER is already up") } def stop() = { if (zkServer.isDefined) { zkServer.get.shutdown() } println("ZOOKEEPER server stopped") } }
Example 66
Source File: CodeGeneratorEngineHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute.hook import java.io.File import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineHook} import com.webank.wedatasphere.linkis.scheduler.executer.{ExecuteRequest, RunTypeExecuteRequest} import com.webank.wedatasphere.linkis.server.JMap import org.apache.commons.io.FileUtils import org.apache.commons.lang.StringUtils import scala.collection.mutable.ArrayBuffer @Deprecated //changed to UdfLoadEngineHook abstract class CodeGeneratorEngineHook extends EngineHook with Logging{ self => val udfPathProp = "udf.paths" protected var creator: String = _ protected var user: String = _ protected var initSpecialCode: String = _ protected val runType: String protected def acceptCodeType(line: String): Boolean protected def generateCode(): Array[String] = { val codeBuffer = new ArrayBuffer[String] val statementBuffer = new ArrayBuffer[String] var accept = true initSpecialCode.split("\n").foreach{ case "" => case l if l.startsWith("%") => if(acceptCodeType(l)){ accept = true codeBuffer.append(statementBuffer.mkString("\n")) statementBuffer.clear() }else{ accept = false } case l if accept => statementBuffer.append(l) case _ => } if(statementBuffer.nonEmpty) codeBuffer.append(statementBuffer.mkString("\n")) codeBuffer.toArray } override def beforeCreateEngine(params: JMap[String, String]): JMap[String, String] = { creator = params.get("creator") user = params.get("user") initSpecialCode = StringUtils.split(params.get(udfPathProp), ",").map(readFile).mkString("\n") params } override def afterCreatedEngine(executor: EngineExecutor): Unit = { generateCode().foreach { case "" => case c: String => info("Submit udf registration to engine, code: " + c) executor.execute(new ExecuteRequest with RunTypeExecuteRequest{ override val code: String = c override val runType: String = self.runType }) info("executed code: " + c) } } protected def readFile(path: String): String = { info("read file: " + path) val file = new File(path) if(file.exists()){ FileUtils.readFileToString(file) } else { info("udf file: [" + path + "] doesn't exist, ignore it.") "" } } } @Deprecated class SqlCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "sql" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%sql") } } @Deprecated class PythonCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "python" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%python") } } @Deprecated class ScalaCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "scala" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%scala") } }
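The readFile helper above goes through FileUtils.readFileToString(file); the single-argument overload uses the platform default charset, so where encoding matters the (File, Charset) overload is the safer choice. A hedged sketch of that defensive read (the path is illustrative):

import java.io.File
import java.nio.charset.StandardCharsets

import org.apache.commons.io.FileUtils

object ReadUdfFileSketch {
  // returns "" for a missing file, mirroring the hook's behaviour above
  def readFile(path: String): String = {
    val file = new File(path)
    if (file.exists()) FileUtils.readFileToString(file, StandardCharsets.UTF_8)
    else ""
  }

  def main(args: Array[String]): Unit =
    println(readFile("/tmp/udf/example.sql"))
}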
Example 67
Source File: PythonCodeParserTest.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute

import java.io.File

import com.google.common.io.Resources
import org.apache.commons.io.FileUtils

object PythonCodeParserTest {

  def main(args: Array[String]): Unit = {
    val parser = new PythonCodeParser
    var code = FileUtils.readFileToString(new File(Resources.getResource("stack.py").getPath))
    parser.parse(code, null).foreach { statement =>
      println("---------------------------statement begin-----------------")
      println(statement)
      println("---------------------------statement end-----------------")
    }
  }
}
Example 68
Source File: SQLCodeParserTest.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute

import java.io.File

import com.google.common.io.Resources
import org.apache.commons.io.FileUtils

object SQLCodeParserTest {

  def main(args: Array[String]): Unit = {
    val parser = new SQLCodeParser
    var code = FileUtils.readFileToString(new File(Resources.getResource("very_complex.sql").getPath))
    parser.parse(code, null).foreach { statement =>
      println("---------------------------statement begin-----------------")
      println(statement)
      println("---------------------------statement end-----------------")
    }
  }
}
Example 69
Source File: CodeGeneratorEngineHookTest.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute.hook import java.io.File import java.util import com.google.common.io.Resources import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineExecutorContext} import com.webank.wedatasphere.linkis.protocol.engine.RequestEngine import com.webank.wedatasphere.linkis.resourcemanager.Resource import com.webank.wedatasphere.linkis.scheduler.executer.{ExecuteRequest, ExecuteResponse} import org.apache.commons.io.FileUtils object CodeGeneratorEngineHookTest { def main(args: Array[String]): Unit = { val requestEngine = new TestRequestEngine requestEngine.properties.put(RequestEngine.ENGINE_INIT_SPECIAL_CODE, FileUtils.readFileToString(new File(Resources.getResource("engine_special_code").getPath))) val engineExecutor = new TestEngineExecutor(1, true) var engineHook: CodeGeneratorEngineHook = new SqlCodeGeneratorEngineHook engineHook.beforeCreateEngine(new util.HashMap(requestEngine.properties)) engineHook.afterCreatedEngine(engineExecutor) engineHook = new PythonCodeGeneratorEngineHook engineHook.beforeCreateEngine(new util.HashMap(requestEngine.properties)) engineHook.afterCreatedEngine(engineExecutor) engineHook = new ScalaCodeGeneratorEngineHook engineHook.beforeCreateEngine(new util.HashMap(requestEngine.properties)) engineHook.afterCreatedEngine(engineExecutor) } } class TestRequestEngine extends RequestEngine { override val user: String = "" override val properties: util.Map[String, String] = new util.HashMap[String, String](){ } override val creator: String = "" } class TestEngineExecutor(outputPrintLimit: Int, isSupportParallelism: Boolean) extends EngineExecutor(outputPrintLimit, isSupportParallelism){ override def execute(executeRequest: ExecuteRequest): ExecuteResponse = { null } override def getName: String = "" override def getActualUsedResources: Resource = null override protected def executeLine(engineExecutorContext: EngineExecutorContext, code: String): ExecuteResponse = null override protected def executeCompletely(engineExecutorContext: EngineExecutorContext, code: String, completedLine: String): ExecuteResponse = null override def close(): Unit = null }
Example 70
Source File: ScalaCodeParserTest.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute

import java.io.File

import com.google.common.io.Resources
import org.apache.commons.io.FileUtils

object ScalaCodeParserTest {

  def main(args: Array[String]): Unit = {
    val parser = new ScalaCodeParser
    var code = FileUtils.readFileToString(new File(Resources.getResource("test.scala.txt").getPath))
    parser.parse(code, null).foreach { statement =>
      println("---------------------------statement begin-----------------")
      println(statement)
      println("---------------------------statement end-----------------")
    }
  }
}
Example 71
Source File: JarLoaderEngineHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager.hook import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.enginemanager.{Engine, EngineHook} import com.webank.wedatasphere.linkis.enginemanager.conf.EngineManagerConfiguration.ENGINE_UDF_APP_NAME import com.webank.wedatasphere.linkis.protocol.engine.RequestEngine import com.webank.wedatasphere.linkis.rpc.Sender import com.webank.wedatasphere.linkis.udf.api.rpc.{RequestUdfTree, ResponseUdfTree} import com.webank.wedatasphere.linkis.udf.entity.{UDFInfo, UDFTree} import org.apache.commons.collections.CollectionUtils import org.apache.commons.io.FileUtils import org.apache.commons.lang.StringUtils import org.codehaus.jackson.map.ObjectMapper import scala.collection.JavaConversions._ import scala.collection.mutable class JarLoaderEngineHook extends EngineHook with Logging{ override def beforeCreateSession(requestEngine: RequestEngine): RequestEngine = { info("start loading UDFs") val udfInfos = extractUdfInfos(requestEngine).filter{info => info.getUdfType == 0 && info.getExpire == false && StringUtils.isNotBlank(info.getPath) && isJarExists(info) && info.getLoad == true } // add to class path val jars = new mutable.HashSet[String]() udfInfos.foreach{udfInfo => jars.add("file://" + udfInfo.getPath)} val jarPaths = jars.mkString(",") if(StringUtils.isBlank(requestEngine.properties.get("jars"))){ requestEngine.properties.put("jars", jarPaths) } else { requestEngine.properties.put("jars", requestEngine.properties.get("jars") + "," + jarPaths) } info("added jars: " + jarPaths) //jars.foreach(fetchRemoteFile) //info("copied jars.") info("end loading UDFs") requestEngine } override def afterCreatedSession(engine: Engine, requestEngine: RequestEngine): Unit = { } protected def isJarExists(udfInfo: UDFInfo) : Boolean = { true // if(FileUtils.getFile(udfInfo.getPath).exists()){ // true // } else { // info(s"The jar file [${udfInfo.getPath}] of UDF [${udfInfo.getUdfName}] doesn't exist, ignore it.") // false // } } protected def extractUdfInfos(requestEngine: RequestEngine): mutable.ArrayBuffer[UDFInfo] = { val udfInfoBuilder = new mutable.ArrayBuffer[UDFInfo] val userName = requestEngine.user val udfTree = queryUdfRpc(userName) extractUdfInfos(udfInfoBuilder, udfTree, userName) udfInfoBuilder } protected def extractUdfInfos(udfInfoBuilder: mutable.ArrayBuffer[UDFInfo], udfTree: UDFTree, userName: String) : Unit = { if(CollectionUtils.isNotEmpty(udfTree.getUdfInfos)){ for(udfInfo <- udfTree.getUdfInfos){ udfInfoBuilder.append(udfInfo) } } if(CollectionUtils.isNotEmpty(udfTree.getChildrens)){ for(child <- udfTree.getChildrens){ var childInfo = child if(TreeType.specialTypes.contains(child.getUserName)){ childInfo = queryUdfRpc(userName, child.getId, child.getUserName) } else { childInfo = queryUdfRpc(userName, child.getId, TreeType.SELF) } extractUdfInfos(udfInfoBuilder, childInfo, userName) } } } private def queryUdfRpc(userName: String, treeId: Long = -1, treeType: String = "self"): UDFTree = { val udfTree = Sender.getSender(ENGINE_UDF_APP_NAME.getValue) .ask(RequestUdfTree(userName, treeType, treeId, "udf")) .asInstanceOf[ResponseUdfTree] .udfTree //info("got udf tree:" + new ObjectMapper().writer().withDefaultPrettyPrinter().writeValueAsString(udfTree)) udfTree } }
Example 72
Source File: TokenAuthentication.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.gateway.security.token import java.io.File import java.util.Properties import java.util.concurrent.TimeUnit import com.webank.wedatasphere.linkis.common.utils.{Logging, Utils} import com.webank.wedatasphere.linkis.gateway.config.GatewayConfiguration._ import com.webank.wedatasphere.linkis.gateway.http.GatewayContext import com.webank.wedatasphere.linkis.gateway.security.{GatewaySSOUtils, SecurityFilter} import com.webank.wedatasphere.linkis.server.Message import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.commons.lang.StringUtils object TokenAuthentication extends Logging { private val (props, file) = if(ENABLE_TOKEN_AUTHENTICATION.getValue) (new Properties, new File(this.getClass.getClassLoader.getResource(TOKEN_AUTHENTICATION_CONFIG.getValue).toURI.getPath)) else (null, null) private var lastModified = 0l if(ENABLE_TOKEN_AUTHENTICATION.getValue) { Utils.defaultScheduler.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryAndError(init()) }, TOKEN_AUTHENTICATION_SCAN_INTERVAL.getValue, TOKEN_AUTHENTICATION_SCAN_INTERVAL.getValue, TimeUnit.MILLISECONDS) init() } private def init(): Unit = if(file.lastModified() > lastModified) { lastModified = file.lastModified() info(s"loading token authentication file $file.") val newProps = new Properties val input = FileUtils.openInputStream(file) Utils.tryFinally(newProps.load(input))(IOUtils.closeQuietly(input)) props.putAll(newProps) } private def validateTokenUser(token: String, tokenUser: String): Boolean = { val tokenUsers = props.getProperty(token) if(tokenUsers == "*" || (StringUtils.isNotBlank(tokenUsers) && tokenUsers.contains(tokenUser))) true else false } def isTokenRequest(gatewayContext: GatewayContext) : Boolean = { (gatewayContext.getRequest.getHeaders.containsKey(TOKEN_KEY) && gatewayContext.getRequest.getHeaders.containsKey(TOKEN_USER_KEY)) || ( gatewayContext.getRequest.getCookies.containsKey(TOKEN_KEY) && gatewayContext.getRequest.getCookies.containsKey(TOKEN_USER_KEY)) } def tokenAuth(gatewayContext: GatewayContext): Boolean = { if(!ENABLE_TOKEN_AUTHENTICATION.getValue) { val message = Message.noLogin(s"Gateway未启用token认证,请采用其他认证方式!") << gatewayContext.getRequest.getRequestURI SecurityFilter.filterResponse(gatewayContext, message) return false } var token = gatewayContext.getRequest.getHeaders.get(TOKEN_KEY)(0) var tokenUser = gatewayContext.getRequest.getHeaders.get(TOKEN_USER_KEY)(0) if(StringUtils.isBlank(token) || StringUtils.isBlank(tokenUser)) { token = gatewayContext.getRequest.getCookies.get(TOKEN_KEY)(0).getValue tokenUser = gatewayContext.getRequest.getCookies.get(TOKEN_USER_KEY)(0).getValue if(StringUtils.isBlank(token) || StringUtils.isBlank(tokenUser)) { val message = Message.noLogin(s"请在Header或Cookie中同时指定$TOKEN_KEY 和 $TOKEN_USER_KEY,以便完成token认证!") << gatewayContext.getRequest.getRequestURI SecurityFilter.filterResponse(gatewayContext, message) return false } } if(validateTokenUser(token, tokenUser)){ info(s"Token authentication succeed, uri: ${gatewayContext.getRequest.getRequestURI}, token: $token, tokenUser: $tokenUser.") GatewaySSOUtils.setLoginUser(gatewayContext.getRequest, tokenUser) true } else { val message = Message.noLogin(s"未授权的token$token,无法将请求绑定给tokenUser$tokenUser!") << gatewayContext.getRequest.getRequestURI SecurityFilter.filterResponse(gatewayContext, message) false } } }
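The reload logic above pairs FileUtils.openInputStream with IOUtils.closeQuietly to re-read the token file whenever its lastModified timestamp advances. A minimal sketch of that Properties-reload pattern, with the file name invented for illustration:

import java.io.File
import java.util.Properties

import org.apache.commons.io.{FileUtils, IOUtils}

object TokenFileReloadSketch {
  private val props = new Properties
  private var lastModified = 0L

  // re-read the file only when its modification time has advanced
  def reloadIfChanged(file: File): Unit = if (file.lastModified() > lastModified) {
    lastModified = file.lastModified()
    val in = FileUtils.openInputStream(file) // fails fast if the file is missing or a directory
    try props.load(in) finally IOUtils.closeQuietly(in)
  }

  def main(args: Array[String]): Unit = {
    reloadIfChanged(new File("/tmp/token.properties"))
    println(props)
  }
}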
Example 73
Source File: RefreshUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.utils

import java.io.File
import java.util.concurrent.TimeUnit

import com.webank.wedatasphere.linkis.common.conf.Configuration
import org.apache.commons.io.FileUtils

object RefreshUtils {

  def registerFileRefresh(period: Long, file: String, deal: java.util.List[String] => Unit): Unit = {
    Utils.defaultScheduler.scheduleAtFixedRate(new Runnable {
      val f = new File(file)
      var fileModifiedTime = if(f.exists()) f.lastModified() else 0

      override def run(): Unit = {
        if(!f.exists()) return
        if(f.lastModified() > fileModifiedTime) {
          deal(FileUtils.readLines(f, Configuration.BDP_ENCODING.getValue))
          fileModifiedTime = f.lastModified()
        }
      }
    }, period, period, TimeUnit.MILLISECONDS)
  }

}

abstract class Deal {
  def deal(line: String): Unit
}
Example 74
Source File: TestUtil.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.solr

import java.io.File
import java.nio.file.Paths
import java.util.UUID

import org.apache.commons.io.FileUtils
import org.apache.solr.client.solrj.embedded.JettyConfig
import org.apache.solr.cloud.MiniSolrCloudCluster

object TestUtil {

  def miniSolrCloudCluster(): MiniSolrCloudCluster = {
    // clean up the solr files so we don't try to read collections from old runs
    FileUtils.deleteDirectory(new File("target/solr7"))

    // Set up a MiniSolrCloudCluster
    val clusterHome = s"${System.getProperty("user.dir")}/target/solr7/solrHome/${UUID.randomUUID()}"

    val jettyConfig = JettyConfig.builder().setContext("/solr").setPort(8983).stopAtShutdown(true).build()

    new MiniSolrCloudCluster(1, null, Paths.get(clusterHome), MiniSolrCloudCluster.DEFAULT_CLOUD_SOLR_XML, null, null)
  }

  def randomIdentifier() = UUID.randomUUID().toString.substring(0, 5)
}
Example 75
Source File: TestUtil.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.solr import java.io.File import java.nio.file.Paths import java.util.UUID import org.apache.commons.io.FileUtils import org.apache.solr.client.solrj.embedded.JettyConfig import org.apache.solr.cloud.MiniSolrCloudCluster object TestUtil { def miniSolrCloudCluster(): MiniSolrCloudCluster = { val DEFAULT_SOLR_CLOUD_XML = """<solr> | | <str name="shareSchema">${shareSchema:false}</str> | <str name="configSetBaseDir">${configSetBaseDir:configsets}</str> | <str name="coreRootDirectory">${coreRootDirectory:target/solr4/cores}</str> | | <shardHandlerFactory name="shardHandlerFactory" class="HttpShardHandlerFactory"> | <str name="urlScheme">${urlScheme:}</str> | <int name="socketTimeout">${socketTimeout:90000}</int> | <int name="connTimeout">${connTimeout:15000}</int> | </shardHandlerFactory> | | <solrcloud> | <str name="host">127.0.0.1</str> | <int name="hostPort">${hostPort:8983}</int> | <str name="hostContext">${hostContext:solr}</str> | <int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int> | <bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool> | <int name="leaderVoteWait">10000</int> | <int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int> | <int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int> | </solrcloud> | |</solr>""".stripMargin System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory") // clean up the solr files so we don't try to read collections from old runs FileUtils.deleteDirectory(new File("target/solr4")) // Set up a MiniSolrCloudCluster val clusterHome = s"${System.getProperty("user.dir")}/target/solr4/solrHome/${UUID.randomUUID()}" val jettyConfig = JettyConfig.builder().setContext("/solr").setPort(8983).stopAtShutdown(true).build() new MiniSolrCloudCluster(1, Paths.get(clusterHome), DEFAULT_SOLR_CLOUD_XML, jettyConfig) } def randomIdentifier() = UUID.randomUUID().toString.substring(0, 5) }
Example 76
Source File: S3PointCloudInputFormat.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.s3 import geotrellis.spark.store.s3._ import geotrellis.pointcloud.spark.store.hadoop.formats._ import geotrellis.pointcloud.util.Filesystem import io.pdal._ import io.circe.Json import io.circe.syntax._ import cats.syntax.either._ import org.apache.hadoop.mapreduce.{InputSplit, TaskAttemptContext} import org.apache.commons.io.FileUtils import java.io.{File, InputStream} import java.net.URI import scala.collection.JavaConverters._ mode match { case "s3" => new S3URIRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) { def read(key: String, uri: URI): (S3PointCloudHeader, List[PointCloud]) = { val s3Pipeline = pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => uri.toString.asJson) .top.fold(pipeline)(identity) executePipeline(context)(key, s3Pipeline) } } case _ => val tmpDir = { val dir = PointCloudInputFormat.getTmpDir(context) if (dir == null) Filesystem.createDirectory() else Filesystem.createDirectory(dir) } new S3StreamRecordReader[S3PointCloudHeader, List[PointCloud]](s3Client) { def read(key: String, is: InputStream): (S3PointCloudHeader, List[PointCloud]) = { // copy remote file into local tmp dir tmpDir.mkdirs() // to be sure that dirs created val localPath = new File(tmpDir, key.replace("/", "_")) FileUtils.copyInputStreamToFile(is, localPath) is.close() // use local filename path if it's present in json val localPipeline = pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => localPath.getAbsolutePath.asJson) .top.fold(pipeline)(identity) try executePipeline(context)(key, localPipeline) finally { localPath.delete() tmpDir.delete() } } } } } }
Example 77
Source File: KafkaServer.scala From akka_streams_tutorial with MIT License | 5 votes |
package alpakka.env import java.io.File import java.net.InetSocketAddress import java.nio.file.{Files, Paths} import java.util.Properties import kafka.server.{KafkaConfig, KafkaServerStartable} import org.apache.commons.io.FileUtils import org.apache.zookeeper.server.quorum.QuorumPeerConfig import org.apache.zookeeper.server.{ServerConfig, ZooKeeperServerMain} object KafkaServer extends App { val zookeeperPort = 2181 val kafkaLogs = "/tmp/kafka-logs" val kafkaLogsPath = Paths.get(kafkaLogs) // See: https://stackoverflow.com/questions/59592518/kafka-broker-doesnt-find-cluster-id-and-creates-new-one-after-docker-restart/60864763#comment108382967_60864763 def fix25Behaviour() = { val fileWithConflictingContent = kafkaLogsPath.resolve("meta.properties").toFile if (fileWithConflictingContent.exists()) FileUtils.forceDelete(fileWithConflictingContent) } def removeKafkaLogs(): Unit = { if (kafkaLogsPath.toFile.exists()) FileUtils.forceDelete(kafkaLogsPath.toFile) } // Keeps the persistent data fix25Behaviour() // If everything fails //removeKafkaLogs() val quorumConfiguration = new QuorumPeerConfig { // Since we do not run a cluster, we are not interested in zookeeper data override def getDataDir: File = Files.createTempDirectory("zookeeper").toFile override def getDataLogDir: File = Files.createTempDirectory("zookeeper-logs").toFile override def getClientPortAddress: InetSocketAddress = new InetSocketAddress(zookeeperPort) } class StoppableZooKeeperServerMain extends ZooKeeperServerMain { def stop(): Unit = shutdown() } val zooKeeperServer = new StoppableZooKeeperServerMain() val zooKeeperConfig = new ServerConfig() zooKeeperConfig.readFrom(quorumConfiguration) val zooKeeperThread = new Thread { override def run(): Unit = zooKeeperServer.runFromConfig(zooKeeperConfig) } zooKeeperThread.start() val kafkaProperties = new Properties() kafkaProperties.put("zookeeper.connect", s"localhost:$zookeeperPort") kafkaProperties.put("broker.id", "0") kafkaProperties.put("offsets.topic.replication.factor", "1") kafkaProperties.put("log.dirs", kafkaLogs) kafkaProperties.put("delete.topic.enable", "true") kafkaProperties.put("group.initial.rebalance.delay.ms", "0") kafkaProperties.put("transaction.state.log.min.isr", "1") kafkaProperties.put("transaction.state.log.replication.factor", "1") kafkaProperties.put("zookeeper.connection.timeout.ms", "6000") kafkaProperties.put("num.partitions", "10") val kafkaConfig = KafkaConfig.fromProps(kafkaProperties) val kafka = new KafkaServerStartable(kafkaConfig) println("About to start...") kafka.startup() scala.sys.addShutdownHook{ println("About to shutdown...") kafka.shutdown() kafka.awaitShutdown() zooKeeperServer.stop() } zooKeeperThread.join() }
Example 78
Source File: TestSetup.scala From incubator-retired-iota with Apache License 2.0 | 5 votes |
package org.apache.iota.fey import java.io.File import java.nio.file.Paths import org.apache.commons.io.FileUtils import org.scalatest.Tag object TestSetup { private var runSetup = true val configTest = getClass.getResource("/test-fey-configuration.conf") def setup(): Unit = { if(runSetup){ println("SETTING UP ...") createFeyTmpDirectoriesForTest() copyTestActorToTmp() copyJSONstoTmp() runSetup = false } } private def copyTestActorToTmp(): Unit = { copyResourceFileToLocal("/fey-test-actor.jar",s"${CONFIG.JAR_REPOSITORY}/fey-test-actor.jar") } private def copyJSONstoTmp(): Unit = { copyResourceFileToLocal("/json/valid-json.json",s"${CONFIG.JSON_REPOSITORY}/valid-json.json.not") copyResourceFileToLocal("/json/invalid-json.json",s"${CONFIG.JSON_REPOSITORY}/invalid-json.json.not") } private def copyResourceFileToLocal(resourcePath: String, destination: String): Unit = { val resourceFile = getClass.getResource(resourcePath) val dest = new File(destination) FileUtils.copyURLToFile(resourceFile, dest) } private def createFeyTmpDirectoriesForTest(): Unit = { var file = new File(s"/tmp/fey/test/checkpoint") file.mkdirs() file = new File(s"/tmp/fey/test/json") file.mkdirs() file = new File(s"/tmp/fey/test/json/watchtest") file.mkdirs() file = new File(s"/tmp/fey/test/jars") file.mkdirs() file = new File(s"/tmp/fey/test/jars/dynamic") file.mkdirs() } } object SlowTest extends Tag("org.apache.iota.fey.SlowTest")
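copyResourceFileToLocal above leans on FileUtils.copyURLToFile, which streams whatever a URL points at (here a classpath resource) into a destination file, creating parent directories as needed. A small sketch of that copy, reusing the resource name from the example; the target path is a placeholder:

import java.io.File

import org.apache.commons.io.FileUtils

object CopyResourceSketch {
  def main(args: Array[String]): Unit = {
    // any resource on the test classpath; the name here is only an example
    val resource = getClass.getResource("/test-fey-configuration.conf")
    if (resource != null) {
      val dest = new File("/tmp/fey/test/test-fey-configuration.conf")
      FileUtils.copyURLToFile(resource, dest) // parent directories are created automatically
      println(s"copied ${FileUtils.sizeOf(dest)} bytes")
    }
  }
}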
Example 79
Source File: Tryout.scala From spark-es with Apache License 2.0 | 5 votes |
import java.nio.file.Files import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.elasticsearch.common.settings.Settings import org.elasticsearch.node.NodeBuilder import org.apache.spark.elasticsearch._ object Tryout { def main(args: Array[String]): Unit = { val sparkContext = new SparkContext("local[2]", "SparkES") val dataDir = Files.createTempDirectory("elasticsearch").toFile dataDir.deleteOnExit() val settings = Settings.settingsBuilder() .put("path.home", dataDir.getAbsolutePath) .put("path.logs", s"${dataDir.getAbsolutePath}/logs") .put("path.data", s"${dataDir.getAbsolutePath}/data") .put("index.store.fs.memory.enabled", true) .put("index.number_of_shards", 1) .put("index.number_of_replicas", 0) .put("cluster.name", "SparkES") .build() val node = NodeBuilder.nodeBuilder().settings(settings).node() val client = node.client() sparkContext .parallelize(Seq( ESDocument(ESMetadata("2", "type1", "index1"), """{"name": "John Smith"}"""), ESDocument(ESMetadata("1", "type1", "index1"), """{"name": "Sergey Shumov"}""") ), 2) .saveToES(Seq("localhost"), "SparkES") client.admin().cluster().prepareHealth("index1").setWaitForGreenStatus().get() val documents = sparkContext.esRDD( Seq("localhost"), "SparkES", Seq("index1"), Seq("type1"), "name:sergey") println(documents.count()) documents.foreach(println) sparkContext.stop() client.close() node.close() FileUtils.deleteQuietly(dataDir) } }
Example 80
Source File: LocalElasticSearch.scala From spark-es with Apache License 2.0 | 5 votes |
package org.apache.spark.elasticsearch import java.nio.file.Files import java.util.UUID import org.apache.commons.io.FileUtils import org.elasticsearch.common.settings.Settings import org.elasticsearch.node.{NodeBuilder, Node} class LocalElasticSearch(val clusterName: String = UUID.randomUUID().toString) { lazy val node = buildNode() lazy val client = node.client() val dataDir = Files.createTempDirectory("elasticsearch").toFile private var started = false def buildNode(): Node = { val settings = Settings.settingsBuilder() .put("path.home", dataDir.getAbsolutePath) .put("path.logs", s"${dataDir.getAbsolutePath}/logs") .put("path.data", s"${dataDir.getAbsolutePath}/data") .put("index.store.fs.memory.enabled", true) .put("index.number_of_shards", 1) .put("index.number_of_replicas", 0) .put("cluster.name", clusterName) .build() val instance = NodeBuilder.nodeBuilder().settings(settings).node() started = true instance } def close(): Unit = { if (started) { client.close() node.close() } try { FileUtils.forceDelete(dataDir) } catch { case e: Exception => } } }
Example 81
Source File: MultiNodeSupportCassandra.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.eventuate import java.io.File import akka.actor.Props import akka.remote.testconductor.RoleName import akka.remote.testkit.MultiNodeSpec import com.rbmhtechnology.eventuate.log.cassandra._ import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfterAll trait MultiNodeSupportCassandra extends BeforeAndAfterAll { this: MultiNodeSpec with MultiNodeWordSpec => val coordinator = RoleName("nodeA") def cassandraDir: String = MultiNodeEmbeddedCassandra.DefaultCassandraDir def logProps(logId: String): Props = CassandraEventLog.props(logId) override def atStartup(): Unit = { if (isNode(coordinator)) { MultiNodeEmbeddedCassandra.start(cassandraDir) Cassandra(system) } enterBarrier("startup") } override def afterAll(): Unit = { // get all config data before shutting down node val snapshotRootDir = new File(system.settings.config.getString("eventuate.snapshot.filesystem.dir")) // shut down node super.afterAll() // clean database and delete snapshot files if (isNode(coordinator)) { FileUtils.deleteDirectory(snapshotRootDir) MultiNodeEmbeddedCassandra.clean() } } }
Example 82
Source File: MultiNodeSupportLeveldb.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.eventuate import java.io.File import akka.actor.Props import akka.remote.testconductor.RoleName import akka.remote.testkit.MultiNodeSpec import com.rbmhtechnology.eventuate.log.leveldb.LeveldbEventLog import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfterAll trait MultiNodeSupportLeveldb extends BeforeAndAfterAll { this: MultiNodeSpec with MultiNodeWordSpec => val coordinator = RoleName("nodeA") def logProps(logId: String): Props = LeveldbEventLog.props(logId) override def afterAll(): Unit = { // get all config data before shutting down node val snapshotRootDir = new File(system.settings.config.getString("eventuate.snapshot.filesystem.dir")) val logRootDir = new File(system.settings.config.getString("eventuate.log.leveldb.dir")) // shut down node super.afterAll() // delete log and snapshot files if (isNode(coordinator)) { FileUtils.deleteDirectory(snapshotRootDir) FileUtils.deleteDirectory(logRootDir) } } }
Example 83
Source File: PersistOnEventWithRecoverySpecLeveldb.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.eventuate import java.util.UUID import akka.actor.Actor import akka.actor.ActorRef import akka.actor.Props import akka.testkit.TestProbe import com.rbmhtechnology.eventuate.ReplicationIntegrationSpec.replicationConnection import com.rbmhtechnology.eventuate.utilities._ import org.apache.commons.io.FileUtils import org.scalatest.Matchers import org.scalatest.WordSpec import scala.concurrent.duration.DurationInt object PersistOnEventWithRecoverySpecLeveldb { class OnBEmitRandomActor(val eventLog: ActorRef, probe: TestProbe) extends EventsourcedActor with PersistOnEvent { override def id = getClass.getName override def onCommand = Actor.emptyBehavior override def onEvent = { case "A" => case "B" => persistOnEvent(UUID.randomUUID().toString) case uuid: String => probe.ref ! uuid } } def persistOnEventProbe(locationA1: Location, log: ActorRef) = { val probe = locationA1.probe locationA1.system.actorOf(Props(new OnBEmitRandomActor(log, probe))) probe } val noMsgTimeout = 100.millis } class PersistOnEventWithRecoverySpecLeveldb extends WordSpec with Matchers with MultiLocationSpecLeveldb { import RecoverySpecLeveldb._ import PersistOnEventWithRecoverySpecLeveldb._ override val logFactory: String => Props = id => SingleLocationSpecLeveldb.TestEventLog.props(id, batching = true) "An EventsourcedActor with PersistOnEvent" must { "not re-attempt persistence on successful write after reordering of events through disaster recovery" in { val locationB = location("B", customConfig = RecoverySpecLeveldb.config) def newLocationA = location("A", customConfig = RecoverySpecLeveldb.config) val locationA1 = newLocationA val endpointB = locationB.endpoint(Set("L1"), Set(replicationConnection(locationA1.port))) def newEndpointA(l: Location, activate: Boolean) = l.endpoint(Set("L1"), Set(replicationConnection(locationB.port)), activate = activate) val endpointA1 = newEndpointA(locationA1, activate = true) val targetA = endpointA1.target("L1") val logDirA = logDirectory(targetA) val targetB = endpointB.target("L1") val a1Probe = persistOnEventProbe(locationA1, targetA.log) write(targetA, List("A")) write(targetB, List("B")) val event = a1Probe.expectMsgClass(classOf[String]) assertConvergence(Set("A", "B", event), endpointA1, endpointB) locationA1.terminate().await FileUtils.deleteDirectory(logDirA) val locationA2 = newLocationA val endpointA2 = newEndpointA(locationA2, activate = false) endpointA2.recover().await val a2Probe = persistOnEventProbe(locationA2, endpointA2.logs("L1")) a2Probe.expectMsg(event) a2Probe.expectNoMsg(noMsgTimeout) assertConvergence(Set("A", "B", event), endpointA2, endpointB) } } }
Example 84
Source File: NeuralNetwork.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package Yelp.Trainer

import org.deeplearning4j.nn.conf.MultiLayerConfiguration
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.nd4j.linalg.factory.Nd4j
import java.io.File
import org.apache.commons.io.FileUtils
import java.io.{DataInputStream, DataOutputStream, FileInputStream}
import java.nio.file.{Files, Paths}

object NeuralNetwork {

  def loadNN(NNconfig: String, NNparams: String) = {
    // get neural network config
    val confFromJson: MultiLayerConfiguration = MultiLayerConfiguration.fromJson(FileUtils.readFileToString(new File(NNconfig)))
    // get neural network parameters
    val dis: DataInputStream = new DataInputStream(new FileInputStream(NNparams))
    val newParams = Nd4j.read(dis)
    // creating network object
    val savedNetwork: MultiLayerNetwork = new MultiLayerNetwork(confFromJson)
    savedNetwork.init()
    savedNetwork.setParameters(newParams)
    savedNetwork
  }

  def saveNN(model: MultiLayerNetwork, NNconfig: String, NNparams: String) = {
    // save neural network config
    FileUtils.write(new File(NNconfig), model.getLayerWiseConfigurations().toJson())
    // save neural network parms
    val dos: DataOutputStream = new DataOutputStream(Files.newOutputStream(Paths.get(NNparams)))
    Nd4j.write(model.params(), dos)
  }
}
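saveNN and loadNN above round-trip the network configuration as JSON text through FileUtils.write and FileUtils.readFileToString. A stripped-down sketch of that text round-trip without the DL4J types (the file name, charset and JSON payload are assumptions):

import java.io.File
import java.nio.charset.StandardCharsets

import org.apache.commons.io.FileUtils

object ConfigRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val confFile = new File("/tmp/nn-conf.json")
    // stand-in for model.getLayerWiseConfigurations().toJson()
    val json = """{"layers":[{"type":"dense","units":128}]}"""

    FileUtils.write(confFile, json, StandardCharsets.UTF_8)                      // save
    val reloaded = FileUtils.readFileToString(confFile, StandardCharsets.UTF_8)  // load

    println(reloaded == json) // true
  }
}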
Example 85
Source File: ExampleMahaService.scala From maha with Apache License 2.0 | 5 votes |
// Copyright 2017, Yahoo Holdings Inc. // Licensed under the terms of the Apache License 2.0. Please see LICENSE file in project root for terms. package com.yahoo.maha.api.jersey.example import java.io.File import java.util.UUID import com.yahoo.maha.core.ddl.OracleDDLGenerator import com.yahoo.maha.jdbc.{JdbcConnection, List, Seq} import com.yahoo.maha.service.{DefaultMahaService, MahaService, MahaServiceConfig} import com.zaxxer.hikari.{HikariConfig, HikariDataSource} import grizzled.slf4j.Logging import org.apache.commons.io.FileUtils import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat object ExampleMahaService extends Logging { val REGISTRY_NAME = "academic"; private var dataSource: Option[HikariDataSource] = None private var jdbcConnection: Option[JdbcConnection] = None val h2dbId = UUID.randomUUID().toString.replace("-","") val today: String = DateTimeFormat.forPattern("yyyy-MM-dd").print(DateTime.now()) val yesterday: String = DateTimeFormat.forPattern("yyyy-MM-dd").print(DateTime.now().minusDays(1)) def initJdbcToH2(): Unit = { val config = new HikariConfig() config.setJdbcUrl(s"jdbc:h2:mem:$h2dbId;MODE=Oracle;DB_CLOSE_DELAY=-1") config.setUsername("sa") config.setPassword("h2.test.database.password") config.setMaximumPoolSize(2) dataSource = Option(new HikariDataSource(config)) jdbcConnection = dataSource.map(new JdbcConnection(_)) assert(jdbcConnection.isDefined, "Failed to connect to h2 local server") } def getMahaService(scope: String = "main"): MahaService = { val jsonString = FileUtils.readFileToString(new File(s"src/$scope/resources/maha-service-config.json")) .replaceAll("h2dbId", s"$h2dbId") initJdbcToH2() val mahaServiceResult = MahaServiceConfig.fromJson(jsonString.getBytes("utf-8")) if (mahaServiceResult.isFailure) { mahaServiceResult.leftMap { res=> error(s"Failed to launch Example MahaService, MahaService Error list is: ${res.list.toList}") } } val mahaServiceConfig = mahaServiceResult.toOption.get val mahaService: MahaService = new DefaultMahaService(mahaServiceConfig) stageStudentData(mahaServiceConfig) mahaService } def stageStudentData(mahaServiceConfig: MahaServiceConfig) : Unit = { val ddlGenerator = new OracleDDLGenerator val erRegistryConfig = mahaServiceConfig.registry.get(ExampleMahaService.REGISTRY_NAME).get val erRegistry= erRegistryConfig.registry erRegistry.factMap.values.foreach { publicFact => publicFact.factList.foreach { fact=> val ddl = ddlGenerator.toDDL(fact) assert(jdbcConnection.get.executeUpdate(ddl).isSuccess) } } val insertSql = """ INSERT INTO student_grade_sheet (year, section_id, student_id, class_id, total_marks, date, comment) VALUES (?, ?, ?, ?, ?, ?, ?) """ val rows: List[Seq[Any]] = List( Seq(1, 100, 213, 200, 125, ExampleMahaService.today, "some comment") ) rows.foreach { row => val result = jdbcConnection.get.executeUpdate(insertSql, row) assert(result.isSuccess) } var count = 0 jdbcConnection.get.queryForObject("select * from student_grade_sheet") { rs => while (rs.next()) { count += 1 } } assert(rows.size == count) } }
Example 86
Source File: StandaloneKCFTests.scala From openwhisk with Apache License 2.0 | 5 votes |
package org.apache.openwhisk.standalone import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Files import common.WskProps import org.apache.commons.io.FileUtils import org.apache.openwhisk.core.containerpool.kubernetes.test.KubeClientSupport import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import system.basic.WskRestBasicTests @RunWith(classOf[JUnitRunner]) class StandaloneKCFTests extends WskRestBasicTests with StandaloneServerFixture with StandaloneSanityTestSupport with KubeClientSupport { override implicit val wskprops = WskProps().copy(apihost = serverUrl) //Turn on to debug locally easily override protected val dumpLogsAlways = false override protected val dumpStartupLogs = false override protected def useMockServer = false override protected def supportedTests = Set("Wsk Action REST should invoke a blocking action and get only the result") override protected def extraArgs: Seq[String] = Seq("--dev-mode", "--dev-kcf") private val podTemplate = """--- |apiVersion: "v1" |kind: "Pod" |metadata: | annotations: | allow-outbound : "true" | labels: | launcher: standalone""".stripMargin private val podTemplateFile = Files.createTempFile("whisk", null).toFile override val customConfig = { FileUtils.write(podTemplateFile, podTemplate, UTF_8) Some(s"""include classpath("standalone-kcf.conf") | |whisk { | kubernetes { | pod-template = "${podTemplateFile.toURI}" | } |}""".stripMargin) } override def afterAll(): Unit = { checkPodState() super.afterAll() podTemplateFile.delete() } def checkPodState(): Unit = { val podList = kubeClient.pods().withLabel("launcher").list() podList.getItems.isEmpty shouldBe false } }
Example 87
Source File: ConfigMapValueTests.scala From openwhisk with Apache License 2.0 | 5 votes |
package org.apache.openwhisk.common import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Files import com.typesafe.config.ConfigFactory import org.apache.commons.io.FileUtils import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FlatSpec, Matchers} import pureconfig._ import pureconfig.generic.auto._ @RunWith(classOf[JUnitRunner]) class ConfigMapValueTests extends FlatSpec with Matchers { behavior of "ConfigMapValue" case class ValueTest(template: ConfigMapValue, count: Int) it should "read from string" in { val config = ConfigFactory.parseString(""" |whisk { | value-test { | template = "test string" | count = 42 | } |}""".stripMargin) val valueTest = readValueTest(config) valueTest.template.value shouldBe "test string" } it should "read from file reference" in { val file = Files.createTempFile("whisk", null).toFile FileUtils.write(file, "test string", UTF_8) val config = ConfigFactory.parseString(s""" |whisk { | value-test { | template = "${file.toURI}" | count = 42 | } |}""".stripMargin) val valueTest = readValueTest(config) valueTest.template.value shouldBe "test string" file.delete() } private def readValueTest(config: com.typesafe.config.Config) = { loadConfigOrThrow[ValueTest](config.getConfig("whisk.value-test")) } }
Example 88
Source File: ConfigMapValue.scala From openwhisk with Apache License 2.0 | 5 votes |
package org.apache.openwhisk.common

import java.io.File
import java.net.URI
import java.nio.charset.StandardCharsets.UTF_8

import org.apache.commons.io.FileUtils
import pureconfig.ConfigReader
import pureconfig.ConvertHelpers.catchReadError

class ConfigMapValue private (val value: String)

object ConfigMapValue {

  def apply(config: String): ConfigMapValue = {
    val value = if (config.startsWith("file:")) {
      val uri = new URI(config)
      val file = new File(uri)
      FileUtils.readFileToString(file, UTF_8)
    } else config
    new ConfigMapValue(value)
  }

  implicit val reader: ConfigReader[ConfigMapValue] =
    ConfigReader.fromString[ConfigMapValue](catchReadError(apply))
}
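The apply method above treats any value starting with file: as a URI, converts it to a File and reads it with FileUtils.readFileToString(file, UTF_8); everything else is used verbatim. A usage-style sketch of both branches, where the temp file merely stands in for a mounted config file:

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8
import java.nio.file.Files

import org.apache.commons.io.FileUtils

object FileOrLiteralSketch {
  // same idea as ConfigMapValue.apply: literal value, or contents of a file: URI
  def resolve(config: String): String =
    if (config.startsWith("file:")) FileUtils.readFileToString(new File(new java.net.URI(config)), UTF_8)
    else config

  def main(args: Array[String]): Unit = {
    val f = Files.createTempFile("whisk", ".tpl").toFile
    FileUtils.write(f, "template from file", UTF_8)

    println(resolve("inline template")) // literal branch
    println(resolve(f.toURI.toString))  // file: branch
    f.delete()
  }
}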
Example 89
Source File: CollectionResourceUsage.scala From openwhisk with Apache License 2.0 | 5 votes |
package org.apache.openwhisk.core.database.cosmosdb import org.apache.commons.io.FileUtils import org.apache.openwhisk.core.entity.ByteSize import org.apache.openwhisk.core.entity.SizeUnits.KB case class CollectionResourceUsage(documentsSize: Option[ByteSize], collectionSize: Option[ByteSize], documentsCount: Option[Long], indexingProgress: Option[Int], documentsSizeQuota: Option[ByteSize]) { def indexSize: Option[ByteSize] = { for { ds <- documentsSize cs <- collectionSize } yield cs - ds } def asString: String = { List( documentsSize.map(ds => s"documentSize: ${displaySize(ds)}"), indexSize.map(is => s"indexSize: ${displaySize(is)}"), documentsCount.map(dc => s"documentsCount: $dc"), documentsSizeQuota.map(dq => s"collectionSizeQuota: ${displaySize(dq)}")).flatten.mkString(",") } private def displaySize(b: ByteSize) = FileUtils.byteCountToDisplaySize(b.toBytes) } object CollectionResourceUsage { val quotaHeader = "x-ms-resource-quota" val usageHeader = "x-ms-resource-usage" val indexHeader = "x-ms-documentdb-collection-index-transformation-progress" def apply(responseHeaders: Map[String, String]): Option[CollectionResourceUsage] = { for { quota <- responseHeaders.get(quotaHeader).map(headerValueToMap) usage <- responseHeaders.get(usageHeader).map(headerValueToMap) } yield { CollectionResourceUsage( usage.get("documentsSize").map(_.toLong).map(ByteSize(_, KB)), usage.get("collectionSize").map(_.toLong).map(ByteSize(_, KB)), usage.get("documentsCount").map(_.toLong), responseHeaders.get(indexHeader).map(_.toInt), quota.get("collectionSize").map(_.toLong).map(ByteSize(_, KB))) } } private def headerValueToMap(value: String): Map[String, String] = { //storedProcedures=100;triggers=25;functions=25;documentsCount=-1;documentsSize=xxx;collectionSize=xxx val pairs = value.split("=|;").grouped(2) pairs.map { case Array(k, v) => k -> v }.toMap } }
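displaySize above delegates to FileUtils.byteCountToDisplaySize, which rounds a byte count down to the nearest whole unit (bytes, KB, MB, GB, ...). A tiny sketch of what the reporting string ends up looking like, with made-up byte counts:

import org.apache.commons.io.FileUtils

object DisplaySizeSketch {
  def main(args: Array[String]): Unit = {
    // byteCountToDisplaySize truncates to the largest whole unit
    println(FileUtils.byteCountToDisplaySize(512L))                    // "512 bytes"
    println(FileUtils.byteCountToDisplaySize(10 * 1024L))              // "10 KB"
    println(FileUtils.byteCountToDisplaySize(3L * 1024 * 1024 * 1024)) // "3 GB"
  }
}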
Example 90
Source File: InstallRouteMgmt.scala From openwhisk with Apache License 2.0 | 5 votes |
package org.apache.openwhisk.standalone import java.io.File import akka.http.scaladsl.model.Uri import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.openwhisk.common.TransactionId.systemPrefix import org.apache.openwhisk.common.{Logging, TransactionId} import scala.sys.process.ProcessLogger import scala.util.Try import scala.sys.process._ case class InstallRouteMgmt(workDir: File, authKey: String, apiHost: Uri, namespace: String, gatewayUrl: Uri, wsk: String)(implicit log: Logging) { case class Action(name: String, desc: String) private val noopLogger = ProcessLogger(_ => ()) private implicit val tid: TransactionId = TransactionId(systemPrefix + "apiMgmt") val actionNames = Array( Action("createApi", "Create an API"), Action("deleteApi", "Delete the API"), Action("getApi", "Retrieve the specified API configuration (in JSON format)")) def run(): Unit = { require(wskExists, s"wsk command not found at $wsk. Route management actions cannot be installed") log.info(this, packageUpdateCmd.!!.trim) //TODO Optimize to ignore this if package already installed actionNames.foreach { action => val name = action.name val actionZip = new File(workDir, s"$name.zip") FileUtils.copyURLToFile(IOUtils.resourceToURL(s"/$name.zip"), actionZip) val cmd = createActionUpdateCmd(action, name, actionZip) val result = cmd.!!.trim log.info(this, s"Installed $name - $result") FileUtils.deleteQuietly(actionZip) } //This log message is used by tests to confirm that actions are installed log.info(this, "Installed Route Management Actions") } private def createActionUpdateCmd(action: Action, name: String, actionZip: File) = { Seq( wsk, "--apihost", apiHost.toString(), "--auth", authKey, "action", "update", s"$namespace/apimgmt/$name", actionZip.getAbsolutePath, "-a", "description", action.desc, "--kind", "nodejs:default", "-a", "web-export", "true", "-a", "final", "true") } private def packageUpdateCmd = { Seq( wsk, "--apihost", apiHost.toString(), "--auth", authKey, "package", "update", s"$namespace/apimgmt", "--shared", "no", "-a", "description", "This package manages the gateway API configuration.", "-p", "gwUrlV2", gatewayUrl.toString()) } def wskExists: Boolean = Try(s"$wsk property get --cliversion".!(noopLogger)).getOrElse(-1) == 0 }
Example 91
Source File: TestSpec.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata import java.io.ByteArrayInputStream import java.nio.file.Files import com.coxautodata.objects.SerializableFileStatus import com.coxautodata.utils.FileListing import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} trait TestSpec extends FunSpec with Matchers with BeforeAndAfterEach { var testingBaseDir: java.nio.file.Path = _ var testingBaseDirName: String = _ var testingBaseDirPath: Path = _ var localFileSystem: LocalFileSystem = _ override def beforeEach(): Unit = { super.beforeEach() testingBaseDir = Files.createTempDirectory("test_output") testingBaseDirName = testingBaseDir.toString localFileSystem = FileSystem.getLocal(new Configuration()) testingBaseDirPath = localFileSystem.makeQualified(new Path(testingBaseDirName)) } override def afterEach(): Unit = { super.afterEach() FileUtils.deleteDirectory(testingBaseDir.toFile) } def createFile(relativePath: Path, content: Array[Byte]): SerializableFileStatus = { val path = new Path(testingBaseDirPath, relativePath) localFileSystem.mkdirs(path.getParent) val in = new ByteArrayInputStream(content) val out = localFileSystem.create(path) IOUtils.copy(in, out) in.close() out.close() SerializableFileStatus(localFileSystem.getFileStatus(path)) } def fileStatusToResult(f: SerializableFileStatus): FileListing = { FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) } }
Example 92
Source File: TestFolder.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.scheduler.driver import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.Suite trait TestFolder extends Suite { self: Suite => var testFolder: File = _ var inputFolder: File = _ var outputFolder: File = _ def in = inputFolder.getAbsolutePath() def out = outputFolder.getAbsolutePath() private def deleteFile(file: File) { if (!file.exists) return if (file.isFile) { file.delete() } else { file.listFiles().foreach(deleteFile) file.delete() } } def /() = File.separator def createInputFile(path: String) { FileUtils.touch(new File(s"${inputFolder}${File.separator}${path}")) } def outputFile(path: String) = new File(outputPath(path)) def inputFile(path: String) = new File(inputPath(path)) def inputPath(path: String) = s"${in}${File.separator}${path}" def outputPath(path: String) = s"${out}${File.separator}${path}" abstract override def withFixture(test: NoArgTest) = { val tempFolder = System.getProperty("java.io.tmpdir") var folder: File = null do { folder = new File(tempFolder, "scalatest-" + System.nanoTime) } while (!folder.mkdir()) testFolder = folder inputFolder = new File(testFolder, "in"); inputFolder.mkdir() outputFolder = new File(testFolder, "out") outputFolder.mkdir() try { super.withFixture(test) } finally { deleteFile(testFolder) } } }
Example 93
Source File: JavaMetricsScreen.scala From Pi-Akka-Cluster with Apache License 2.0 | 5 votes |
package akka_oled

import java.lang.management.ManagementFactory
import java.text.DecimalFormat

import com.sun.management.OperatingSystemMXBean
import org.apache.commons.io.FileUtils

import scala.collection.mutable

trait JavaMetricsScreen {

  def getJavaMetrics(): Array[Array[String]] = {
    val bean = ManagementFactory.getPlatformMXBean(classOf[OperatingSystemMXBean])
    val formatter = new DecimalFormat("#0.00")
    val map = mutable.LinkedHashMap[String, String](
      "Max mem:" -> FileUtils.byteCountToDisplaySize(
        ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getMax),
      "Curr mem:" -> FileUtils.byteCountToDisplaySize(
        ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed),
      "CPU:" -> (formatter.format(bean.getSystemCpuLoad) + "%"),
      "Threads:" -> ManagementFactory.getThreadMXBean.getThreadCount.toString,
      "Classes:" -> ManagementFactory.getClassLoadingMXBean.getLoadedClassCount.toString)
    map.toArray.map(x => Array(x._1, x._2))
  }
}
Example 94
Source File: GraphFrameTestSparkContext.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes import java.io.File import java.nio.file.Files import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits} trait GraphFrameTestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var spark: SparkSession = _ @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ @transient var sparkMajorVersion: Int = _ @transient var sparkMinorVersion: Int = _ def isLaterVersion(minVersion: String): Boolean = { val (minMajorVersion, minMinorVersion) = TestUtils.majorMinorVersion(minVersion) if (sparkMajorVersion != minMajorVersion) { return sparkMajorVersion > minMajorVersion } else { return sparkMinorVersion >= minMinorVersion } } override def beforeAll() { super.beforeAll() spark = SparkSession.builder() .master("local[2]") .appName("GraphFramesUnitTest") .config("spark.sql.shuffle.partitions", 4) .getOrCreate() val checkpointDir = Files.createTempDirectory(this.getClass.getName).toString spark.sparkContext.setCheckpointDir(checkpointDir) sc = spark.sparkContext sqlContext = spark.sqlContext val (verMajor, verMinor) = TestUtils.majorMinorVersion(sc.version) sparkMajorVersion = verMajor sparkMinorVersion = verMinor } override def afterAll() { val checkpointDir = sc.getCheckpointDir if (spark != null) { spark.stop() } spark = null sqlContext = null sc = null checkpointDir.foreach { dir => FileUtils.deleteQuietly(new File(dir)) } super.afterAll() } }
Example 95
Source File: PluginsFilesUtils.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.serving.core.utils import java.io.File import java.net.URL import java.util.{Calendar, UUID} import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.serving.core.helpers.JarsHelper import org.apache.commons.io.FileUtils trait PluginsFilesUtils extends SLF4JLogging { def addPluginsToClassPath(pluginsFiles: Array[String]): Unit = { log.info(pluginsFiles.mkString(",")) pluginsFiles.foreach(filePath => { log.info(s"Adding to classpath plugin file: $filePath") if (filePath.startsWith("/") || filePath.startsWith("file://")) addFromLocal(filePath) if (filePath.startsWith("hdfs")) addFromHdfs(filePath) if (filePath.startsWith("http")) addFromHttp(filePath) }) } private def addFromLocal(filePath: String): Unit = { log.info(s"Getting file from local: $filePath") val file = new File(filePath.replace("file://", "")) JarsHelper.addToClasspath(file) } private def addFromHdfs(fileHdfsPath: String): Unit = { log.info(s"Getting file from HDFS: $fileHdfsPath") val inputStream = HdfsUtils().getFile(fileHdfsPath) val fileName = fileHdfsPath.split("/").last log.info(s"HDFS file name is $fileName") val file = new File(s"/tmp/sparta/userjars/${UUID.randomUUID().toString}/$fileName") log.info(s"Downloading HDFS file to local file system: ${file.getAbsoluteFile}") FileUtils.copyInputStreamToFile(inputStream, file) JarsHelper.addToClasspath(file) } private def addFromHttp(fileURI: String): Unit = { log.info(s"Getting file from HTTP: $fileURI") val tempFile = File.createTempFile(s"sparta-plugin-${Calendar.getInstance().getTimeInMillis}", ".jar") val url = new URL(fileURI) FileUtils.copyURLToFile(url, tempFile) JarsHelper.addToClasspath(tempFile) } }
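addFromHdfs and addFromHttp above both end in a Commons IO copy: copyInputStreamToFile drains an already-open stream into a local file (closing the stream when done), while copyURLToFile opens the connection itself. A compact sketch of the stream branch; the byte array stands in for the HDFS input stream, and the target path is illustrative:

import java.io.{ByteArrayInputStream, File}
import java.util.UUID

import org.apache.commons.io.FileUtils

object CopyStreamToFileSketch {
  def main(args: Array[String]): Unit = {
    // stands in for the stream returned by HdfsUtils().getFile(fileHdfsPath)
    val in = new ByteArrayInputStream("fake jar bytes".getBytes("UTF-8"))

    // copyInputStreamToFile creates missing parent directories, writes the
    // stream out and closes it when it is done
    val target = new File(s"/tmp/sparta/userjars/${UUID.randomUUID()}/plugin.jar")
    FileUtils.copyInputStreamToFile(in, target)

    println(s"wrote ${FileUtils.sizeOf(target)} bytes to $target")
  }
}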
Example 96
Source File: FileSystemOutputIT.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.output.filesystem import java.io.File import com.stratio.sparta.plugin.TemporalSparkContext import com.stratio.sparta.plugin.output.fileSystem.FileSystemOutput import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum} import org.apache.commons.io.FileUtils import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.junit.runner.RunWith import org.scalatest.Matchers import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class FileSystemOutputIT extends TemporalSparkContext with Matchers { val directory = getClass().getResource("/origin.txt") val parentFile = new File(directory.getPath).getParent val properties = Map(("path", parentFile + "/testRow"), ("outputFormat", "row")) val fields = StructType(StructField("name", StringType, false) :: StructField("age", IntegerType, false) :: StructField("year", IntegerType, true) :: Nil) val fsm = new FileSystemOutput("key", properties) "An object of type FileSystemOutput " should "have the same values as the properties Map" in { fsm.outputFormat should be(OutputFormatEnum.ROW) } private def dfGen(): DataFrame = { val sqlCtx = SparkSession.builder().config(sc.getConf).getOrCreate() val dataRDD = sc.parallelize(List(("user1", 23, 1993), ("user2", 26, 1990), ("user3", 21, 1995))) .map { case (name, age, year) => Row(name, age, year) } sqlCtx.createDataFrame(dataRDD, fields) } def fileExists(path: String): Boolean = new File(path).exists() "Given a DataFrame, a directory" should "be created with the data written inside" in { fsm.save(dfGen(), SaveModeEnum.Append, Map(Output.TableNameKey -> "test")) fileExists(fsm.path.get) should equal(true) } it should "exist with the given path and be deleted" in { if (fileExists(fsm.path.get)) FileUtils.deleteDirectory(new File(fsm.path.get)) fileExists(fsm.path.get) should equal(false) } val fsm2 = new FileSystemOutput("key", properties.updated("outputFormat", "json") .updated("path", parentFile + "/testJson")) "Given another DataFrame, a directory" should "be created with the data inside in JSON format" in { fsm2.outputFormat should be(OutputFormatEnum.JSON) fsm2.save(dfGen(), SaveModeEnum.Append, Map(Output.TableNameKey -> "test")) fileExists(fsm2.path.get) should equal(true) } it should "exist with the given path and be deleted" in { if (fileExists(s"${fsm2.path.get}/test")) FileUtils.deleteDirectory(new File(s"${fsm2.path.get}/test")) fileExists(s"${fsm2.path.get}/test") should equal(false) } }
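The directory cleanup asserted above is a single FileUtils.deleteDirectory call; a minimal REPL-style sketch with a placeholder path:

import java.io.File
import org.apache.commons.io.FileUtils

val out = new File("/tmp/testRow") // placeholder output directory
if (out.exists()) {
  // Recursively deletes the directory and everything under it; throws IOException on failure.
  FileUtils.deleteDirectory(out)
}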
Example 97
Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.types import java.io.File import org.apache.atlas.{AtlasClient, AtlasConstants} import org.apache.atlas.model.instance.AtlasEntity import org.apache.commons.io.FileUtils import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.{FunSuite, Matchers} import com.hortonworks.spark.atlas.TestUtils._ import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport} class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport { def getTableEntity(tableName: String): AtlasEntity = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) val tableEntities = internal.sparkTableToEntity( tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition)) val tableEntity = tableEntities.entity tableEntity } test("pipeline, pipeline model, fit and transform") { val uri = "/" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) pipelineDirEntity.entity.getAttribute("uri") should be (uri) pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir) pipelineDirEntity.dependencies.length should be (0) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) modelDirEntity.entity.getAttribute("uri") should be (uri) modelDirEntity.entity.getAttribute("directory") should be (modelDir) modelDirEntity.dependencies.length should be (0) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING) pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( pipeline.uid) pipelineEntity.entity.getAttribute("name") should be (pipeline.uid) pipelineEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false)) pipelineEntity.dependencies should be (Seq(pipelineDirEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) val modelUid = model.uid.replaceAll("pipeline", "model") modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING) modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid) modelEntity.entity.getAttribute("name") should be (modelUid) modelEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false)) modelEntity.dependencies should be (Seq(modelDirEntity)) FileUtils.deleteDirectory(new File("tmp")) } }
Example 98
Source File: WithRemoteHiveMetastoreServiceSupport.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas import java.io.File import java.nio.file.Files import com.hortonworks.spark.atlas.utils.SparkUtils import com.hotels.beeju.ThriftHiveMetaStoreTestUtil import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} trait WithRemoteHiveMetastoreServiceSupport extends BeforeAndAfterAll { self: Suite => protected val dbName = "sac_hive_metastore" protected var sparkSession: SparkSession = _ private var warehouseDir: String = _ private val hive = new ThriftHiveMetaStoreTestUtil(dbName) private def cleanupAnyExistingSession(): Unit = { val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) if (session.isDefined) { session.get.sessionState.catalog.reset() session.get.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } override protected def beforeAll(): Unit = { super.beforeAll() cleanupAnyExistingSession() hive.before() warehouseDir = Files.createTempDirectory("sac-warehouse-").toString sparkSession = SparkSession.builder() .master("local") .appName(this.getClass.getCanonicalName) .enableHiveSupport() .config("spark.ui.enabled", "false") .config("spark.sql.warehouse.dir", warehouseDir) .config("spark.hadoop.hive.metastore.uris", hive.getThriftConnectionUri) .getOrCreate() // reset hiveConf to make sure the configuration change takes effect SparkUtils.resetHiveConf } override protected def afterAll(): Unit = { try { hive.after() sparkSession.sessionState.catalog.reset() sparkSession.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } finally { // reset hiveConf again to prevent affecting other tests SparkUtils.resetHiveConf sparkSession = null FileUtils.deleteDirectory(new File(warehouseDir)) } System.clearProperty("spark.driver.port") super.afterAll() } }
Example 99
Source File: WithHiveSupport.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas import java.io.File import java.nio.file.Files import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} trait WithHiveSupport extends BeforeAndAfterAll { self: Suite => protected var sparkSession: SparkSession = _ private var metastoreDir: String = _ private var warehouseDir: String = _ private def cleanupAnyExistingSession(): Unit = { val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) if (session.isDefined) { session.get.sessionState.catalog.reset() session.get.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } override protected def beforeAll(): Unit = { super.beforeAll() cleanupAnyExistingSession() metastoreDir = Files.createTempDirectory("sac-metastore-").toString warehouseDir = Files.createTempDirectory("sac-warehouse-").toString System.setProperty("derby.system.home", metastoreDir) sparkSession = SparkSession.builder() .master("local") .appName(this.getClass.getCanonicalName) .enableHiveSupport() .config("spark.ui.enabled", "false") .config("spark.sql.warehouse.dir", warehouseDir) .getOrCreate() } override protected def afterAll(): Unit = { try { sparkSession.sessionState.catalog.reset() sparkSession.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } finally { sparkSession = null FileUtils.deleteDirectory(new File(warehouseDir)) } System.clearProperty("spark.driver.port") super.afterAll() } }
Example 100
Source File: DistServiceExecutor.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.experiments.distributeservice import java.io.{File, FileWriter} import java.net.InetAddress import scala.collection.JavaConverters._ import scala.io.Source import scala.sys.process._ import scala.util.{Failure, Success, Try} import akka.actor.Actor import org.apache.commons.io.FileUtils import org.apache.commons.lang.text.StrSubstitutor import org.slf4j.Logger import org.apache.gearpump.cluster.{ExecutorContext, UserConfig} import org.apache.gearpump.experiments.distributeservice.DistServiceAppMaster.InstallService import org.apache.gearpump.util.{ActorUtil, LogUtil} class DistServiceExecutor(executorContext: ExecutorContext, userConf: UserConfig) extends Actor { import executorContext._ private val LOG: Logger = LogUtil.getLogger(getClass, executor = executorId, app = appId) override def receive: Receive = { case InstallService(url, zipFileName, targetPath, scriptData, serviceName, serviceSettings) => LOG.info(s"Executor $executorId receive command to install " + s"service $serviceName to $targetPath") unzipFile(url, zipFileName, targetPath) installService(scriptData, serviceName, serviceSettings) } private def unzipFile(url: String, zipFileName: String, targetPath: String) = { val zipFile = File.createTempFile(System.currentTimeMillis().toString, zipFileName) val dir = new File(targetPath) if (dir.exists()) { FileUtils.forceDelete(dir) } val bytes = FileServer.newClient.get(url).get FileUtils.writeByteArrayToFile(zipFile, bytes) val result = Try(s"unzip ${zipFile.getAbsolutePath} -d $targetPath".!!) result match { case Success(msg) => LOG.info(s"Executor $executorId unzip file to $targetPath") case Failure(ex) => throw ex } } private def installService( scriptData: Array[Byte], serviceName: String, serviceSettings: Map[String, Any]) = { val tempFile = File.createTempFile("gearpump", serviceName) FileUtils.writeByteArrayToFile(tempFile, scriptData) val script = new File("/etc/init.d", serviceName) writeFileWithEnvVariables(tempFile, script, serviceSettings ++ getEnvSettings) val result = Try(s"chkconfig --add $serviceName".!!) result match { case Success(msg) => LOG.info(s"Executor install service $serviceName successfully!") case Failure(ex) => throw ex } } private def getEnvSettings: Map[String, Any] = { Map("workerId" -> worker, "localhost" -> ActorUtil.getSystemAddress(context.system).host.get, "hostname" -> InetAddress.getLocalHost.getHostName) } private def writeFileWithEnvVariables(source: File, target: File, envs: Map[String, Any]) = { val writer = new FileWriter(target) val sub = new StrSubstitutor(envs.asJava) sub.setEnableSubstitutionInVariables(true) Source.fromFile(source).getLines().foreach(line => writer.write(sub.replace(line) + "\r\n")) writer.close() } }
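The executor above leans on two FileUtils calls: forceDelete to clear an existing target directory and writeByteArrayToFile to persist downloaded bytes. A minimal REPL-style sketch, with placeholder paths and payload:

import java.io.File
import org.apache.commons.io.FileUtils

val target = new File("/tmp/service/previous-install") // placeholder
if (target.exists()) {
  // forceDelete handles both files and directories and throws IOException if deletion fails.
  FileUtils.forceDelete(target)
}

// Writes the byte array to the file, creating any missing parent directories.
FileUtils.writeByteArrayToFile(new File("/tmp/service/service.zip"), "payload".getBytes("UTF-8"))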
Example 101
Source File: DistributeServiceClient.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.experiments.distributeservice import java.io.File import scala.concurrent.Future import scala.util.{Failure, Success} import akka.pattern.ask import org.apache.commons.io.FileUtils import org.apache.gearpump.cluster.client.ClientContext import org.apache.gearpump.cluster.main.{ArgumentsParser, CLIOption} import org.apache.gearpump.experiments.distributeservice.DistServiceAppMaster.{FileContainer, GetFileContainer, InstallService} import org.apache.gearpump.util.{AkkaApp, Constants} object DistributeServiceClient extends AkkaApp with ArgumentsParser { implicit val timeout = Constants.FUTURE_TIMEOUT override val options: Array[(String, CLIOption[Any])] = Array( "appid" -> CLIOption[Int]("<the distributed shell appid>", required = true), "file" -> CLIOption[String]("<service zip file path>", required = true), "script" -> CLIOption[String]( "<file path of service script that will be installed to /etc/init.d>", required = true), "serviceName" -> CLIOption[String]("<service name>", required = true), "target" -> CLIOption[String]("<target path on each machine>", required = true) ) override def help(): Unit = { super.help() // scalastyle:off println Console.err.println(s"-D<name>=<value> set a property to the service") // scalastyle:on println } override def main(akkaConf: Config, args: Array[String]): Unit = { val config = parse(filterCustomOptions(args)) val context = ClientContext(akkaConf) implicit val system = context.system implicit val dispatcher = system.dispatcher val appid = config.getInt("appid") val zipFile = new File(config.getString("file")) val script = new File(config.getString("script")) val serviceName = config.getString("serviceName") val appMaster = context.resolveAppID(appid) (appMaster ? GetFileContainer).asInstanceOf[Future[FileContainer]].map { container => val bytes = FileUtils.readFileToByteArray(zipFile) val result = FileServer.newClient.save(container.url, bytes) result match { case Success(_) => appMaster ! InstallService(container.url, zipFile.getName, config.getString("target"), FileUtils.readFileToByteArray(script), serviceName, parseServiceConfig(args)) context.close() case Failure(ex) => throw ex } } } private def filterCustomOptions(args: Array[String]): Array[String] = { args.filter(!_.startsWith("-D")) } private def parseServiceConfig(args: Array[String]): Map[String, Any] = { val result = Map.empty[String, Any] args.foldLeft(result) { (result, argument) => if (argument.startsWith("-D") && argument.contains("=")) { val fixedKV = argument.substring(2).split("=") result + (fixedKV(0) -> fixedKV(1)) } else { result } } } }
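The upload above starts with FileUtils.readFileToByteArray; a minimal REPL-style sketch with a placeholder file:

import java.io.File
import org.apache.commons.io.FileUtils

// Reads the entire file into memory, so it is only appropriate for files that fit comfortably in the heap.
val bytes: Array[Byte] = FileUtils.readFileToByteArray(new File("/tmp/service.zip")) // placeholder path
println(s"read ${bytes.length} bytes")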
Example 102
Source File: PersistenceSpec.scala From 006877 with MIT License | 5 votes |
package akka.testkit

import java.io.File

import com.typesafe.config._

import scala.util._

import akka.actor._
import org.scalatest._

import org.apache.commons.io.FileUtils

abstract class PersistenceSpec(system: ActorSystem) extends TestKit(system)
    with ImplicitSender
    with WordSpecLike
    with Matchers
    with BeforeAndAfterAll
    with PersistenceCleanup {

  def this(name: String, config: Config) = this(ActorSystem(name, config))

  override protected def beforeAll() = deleteStorageLocations()

  override protected def afterAll() = {
    deleteStorageLocations()
    TestKit.shutdownActorSystem(system)
  }

  def killActors(actors: ActorRef*) = {
    actors.foreach { actor =>
      watch(actor)
      system.stop(actor)
      expectTerminated(actor)
      Thread.sleep(1000) // the actor name is not unique intermittently on travis when creating it again after killActors, this is ducktape.
    }
  }
}

trait PersistenceCleanup {
  def system: ActorSystem

  val storageLocations = List(
    "akka.persistence.journal.leveldb.dir",
    "akka.persistence.journal.leveldb-shared.store.dir",
    "akka.persistence.snapshot-store.local.dir").map { s =>
    new File(system.settings.config.getString(s))
  }

  def deleteStorageLocations(): Unit = {
    storageLocations.foreach(dir => Try(FileUtils.deleteDirectory(dir)))
  }
}
Example 104
Source File: ExternalCluster.scala From incubator-livy with Apache License 2.0 | 5 votes |
package org.apache.livy.test.framework import java.io._ import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.Path import org.apache.livy.{LivyConf, Logging} import org.apache.livy.client.common.TestUtils import org.apache.livy.server.LivyServer /** * Used to run tests on an real external cluster * In order to utilize test against an external cluster, you need to create * a configuration called cluster.spec and run the test suite with the option * -Dcluster.spec=<yourcluster.spec> * * Also, make sure to disable the following line InteractiveIT.scala * s.run("""sc.getConf.get("spark.executor.instances")""").verifyResult("res1: String = 1\n") * * This is because your external cluster may not have the same configuration as the MiniCluster * * See the cluster.spec.template file for an example cluster.spec */ class ExternalCluster(config: Map[String, String]) extends Cluster with Logging { private var _configDir: File = _ private var _livyEndpoint: String = _ private var _livyThriftJdbcUrl: Option[String] = _ private var _hdfsScrathDir: Path = _ private var _authScheme: String = _ private var _user: String = _ private var _password: String = _ private var _sslCertPath: String = _ private var _principal: String = _ private var _keytabPath: String = _ // Livy rest url endpoint override def livyEndpoint: String = _livyEndpoint // Livy jdbc url endpoint override def jdbcEndpoint: Option[String] = _livyThriftJdbcUrl // Temp directory in hdfs override def hdfsScratchDir(): Path = _hdfsScrathDir // Working directory that store core-site.xml, yarn-site.xml override def configDir(): File = _configDir // Security details override def authScheme: String = _authScheme override def user: String = _user override def password: String = _password override def sslCertPath: String = _sslCertPath override def principal: String = _principal override def keytabPath: String = _keytabPath override def doAsClusterUser[T](task: => T): T = task override def deploy(): Unit = { _configDir = new File(config.getOrElse("configDir", "hadoop-conf")) _livyEndpoint = config.getOrElse("livyEndpoint", "") _authScheme = config.getOrElse("authScheme", "") _user = config.getOrElse("user", "") _password = config.getOrElse("password", "") _sslCertPath = config.getOrElse("sslCertPath", "") _principal = config.getOrElse("principal", "") _keytabPath = config.getOrElse("keytabPath", "") // Needs to be set after all the other fields are filled in properly _hdfsScrathDir = fs.makeQualified(new Path(config.getOrElse("hdfsScratchDir", "/"))) } override def cleanUp(): Unit = { } def runLivy(): Unit = { } def stopLivy(): Unit = { } }
Example 105
Source File: BaseInteractiveServletSpec.scala From incubator-livy with Apache License 2.0 | 5 votes |
package org.apache.livy.server.interactive import java.io.File import java.nio.file.Files import org.apache.commons.io.FileUtils import org.apache.spark.launcher.SparkLauncher import org.apache.livy.LivyConf import org.apache.livy.rsc.RSCConf import org.apache.livy.server.BaseSessionServletSpec import org.apache.livy.sessions.{Kind, SessionKindModule, Spark} abstract class BaseInteractiveServletSpec extends BaseSessionServletSpec[InteractiveSession, InteractiveRecoveryMetadata] { mapper.registerModule(new SessionKindModule()) protected var tempDir: File = _ override def afterAll(): Unit = { super.afterAll() if (tempDir != null) { scala.util.Try(FileUtils.deleteDirectory(tempDir)) tempDir = null } } override protected def createConf(): LivyConf = synchronized { if (tempDir == null) { tempDir = Files.createTempDirectory("client-test").toFile() } super.createConf() .set(LivyConf.SESSION_STAGING_DIR, tempDir.toURI().toString()) .set(LivyConf.REPL_JARS, "dummy.jar") .set(LivyConf.LIVY_SPARK_VERSION, sys.env("LIVY_SPARK_VERSION")) .set(LivyConf.LIVY_SPARK_SCALA_VERSION, sys.env("LIVY_SCALA_VERSION")) } protected def createRequest( inProcess: Boolean = true, extraConf: Map[String, String] = Map(), kind: Kind = Spark): CreateInteractiveRequest = { val classpath = sys.props("java.class.path") val request = new CreateInteractiveRequest() request.kind = kind request.name = None request.conf = extraConf ++ Map( RSCConf.Entry.LIVY_JARS.key() -> "", RSCConf.Entry.CLIENT_IN_PROCESS.key() -> inProcess.toString, SparkLauncher.SPARK_MASTER -> "local", SparkLauncher.DRIVER_EXTRA_CLASSPATH -> classpath, SparkLauncher.EXECUTOR_EXTRA_CLASSPATH -> classpath ) request } }
Example 106
Source File: YarnShuffleIntegrationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
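The copy-then-inspect-then-delete pattern around the registered-executor LevelDB directory boils down to FileUtils.copyDirectory and deleteDirectory; a stripped-down REPL-style sketch with placeholder directory names:

import java.io.File
import org.apache.commons.io.FileUtils

val original = new File("/tmp/registeredExecutors.ldb") // placeholder
val snapshot = new File(original.getAbsolutePath + "_dup")

// copyDirectory replicates the whole tree (preserving file dates by default),
// which lets another process open the copy while the original stays in use.
FileUtils.copyDirectory(original, snapshot)
// ... inspect the snapshot ...
FileUtils.deleteDirectory(snapshot)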
Example 107
Source File: SortShuffleSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { super.beforeAll() conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
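The getAllFiles helper above uses the filter-based overload of FileUtils.listFiles; both common overloads are sketched here against a placeholder directory:

import java.io.File
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter

val dir = new File("/tmp/spark-local") // placeholder

// Filter-based overload: TrueFileFilter.INSTANCE accepts every file and recurses into every subdirectory.
val allFiles: Set[File] =
  FileUtils.listFiles(dir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet

// Extension-based overload: only *.data and *.index files, searched recursively.
val shuffleFiles = FileUtils.listFiles(dir, Array("data", "index"), true).asScala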
Example 108
Source File: EmbeddedIO.scala From embedded-kafka with Apache License 2.0 | 5 votes |
package com.tuplejump.embedded.kafka

import java.io.{ File => JFile }

import scala.util.Try
import org.apache.commons.io.FileUtils

object EmbeddedIO extends Logging {

  private val shutdownDeletePaths = new scala.collection.mutable.HashSet[String]()

  val logsDir = new JFile(".", "logs")

  dirSetup(new JFile(logsDir.getAbsolutePath))

  def createTempDir(tmpName: String): JFile =
    dirSetup(new JFile(logsDir, tmpName))

  private def dirSetup(dir: JFile): JFile = {
    if (logsDir.exists()) deleteRecursively(logsDir)
    dir.mkdir
    logger.info(s"Created dir ${dir.getAbsolutePath.replace("./", "")}")
    registerShutdownDeleteDir(dir)

    sys.runtime.addShutdownHook(new Thread("delete temp dir " + dir) {
      override def run(): Unit = {
        if (!hasRootAsShutdownDeleteDir(dir)) deleteRecursively(dir)
      }
    })
    dir
  }

  protected def registerShutdownDeleteDir(file: JFile) {
    shutdownDeletePaths.synchronized {
      shutdownDeletePaths += file.getAbsolutePath
    }
  }

  private def hasRootAsShutdownDeleteDir(file: JFile): Boolean = {
    val absolutePath = file.getAbsolutePath
    shutdownDeletePaths.synchronized {
      shutdownDeletePaths.exists { path =>
        !absolutePath.equals(path) && absolutePath.startsWith(path)
      }
    }
  }

  protected def deleteRecursively(delete: JFile): Unit =
    for { file <- Option(delete) } Try(FileUtils.deleteDirectory(file))
}
Example 109
Source File: SharedSparkSessionSuite.scala From spark-tensorflow-connector with Apache License 2.0 | 5 votes |
package org.trustedanalytics.spark.datasources.tensorflow

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.SharedSparkSession
import org.junit.{After, Before}
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike}

trait BaseSuite extends WordSpecLike with Matchers with BeforeAndAfterAll

class SharedSparkSessionSuite extends SharedSparkSession with BaseSuite {
  val TF_SANDBOX_DIR = "tf-sandbox"
  val file = new File(TF_SANDBOX_DIR)

  @Before
  override def beforeAll() = {
    super.setUp()
    FileUtils.deleteQuietly(file)
    file.mkdirs()
  }

  @After
  override def afterAll() = {
    FileUtils.deleteQuietly(file)
    super.tearDown()
  }
}
Example 110
Source File: TestCreateTableWithBlockletSize.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.createTable import scala.util.Random import org.apache.commons.io.FileUtils import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.reader.CarbonFooterReaderV3 import org.apache.carbondata.core.util.path.CarbonTablePath class TestCreateTableWithBlockletSize extends QueryTest with BeforeAndAfterAll { override def beforeAll { sql("use default") sql("drop table if exists source") } test("test create table with blocklet size") { val rdd = sqlContext.sparkContext.parallelize(1 to 1000000) .map(x => (Random.nextInt(), Random.nextInt().toString)) sqlContext.createDataFrame(rdd) .write .format("carbondata") .option("table_blocksize", "8") .option("table_blocklet_size", "3") .option("tableName", "source") .save() // read footer and verify number of blocklets val table = CarbonEnv.getCarbonTable(None, "source")(sqlContext.sparkSession) val folder = FileFactory.getCarbonFile(table.getTablePath) val files = folder.listFiles(true) import scala.collection.JavaConverters._ val dataFiles = files.asScala.filter(_.getName.endsWith(CarbonTablePath.CARBON_DATA_EXT)) dataFiles.foreach { dataFile => val fileReader = FileFactory .getFileHolder(FileFactory.getFileType(dataFile.getPath)) val buffer = fileReader .readByteBuffer(FileFactory.getUpdatedFilePath(dataFile.getPath), dataFile.getSize - 8, 8) val footerReader = new CarbonFooterReaderV3(dataFile.getAbsolutePath, buffer.getLong) val footer = footerReader.readFooterVersion3 assertResult(2)(footer.blocklet_index_list.size) assertResult(2)(footer.blocklet_info_list3.size) } sql("drop table source") } test("test create table with invalid blocklet size") { val ex = intercept[MalformedCarbonCommandException] { sql("CREATE TABLE T1(name String) STORED AS CARBONDATA TBLPROPERTIES('TABLE_BLOCKLET_SIZE'='3X')") } assert(ex.getMessage.toLowerCase.contains("invalid table_blocklet_size")) } override def afterAll { sql("use default") sql("drop table if exists source") } }
Example 111
Source File: DirectSQLExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import java.io.File import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.apache.carbondata.core.metadata.datatype.{DataTypes, Field} import org.apache.carbondata.examples.util.ExampleUtils import org.apache.carbondata.sdk.file.{CarbonWriter, Schema} // scalastyle:off println object DirectSQLExample { def main(args: Array[String]) { val carbonSession = ExampleUtils.createSparkSession("DirectSQLExample") exampleBody(carbonSession) carbonSession.close() } def exampleBody(carbonSession : SparkSession): Unit = { val rootPath = new File(this.getClass.getResource("/").getPath + "../../../..").getCanonicalPath val path = s"$rootPath/examples/spark/target/carbonFile/" import carbonSession._ // 1. generate data file cleanTestData(path) val rows = 20 buildTestData(path, rows) val readPath = path println("Running SQL on carbon files directly") try { // 2. run queries directly, no need to create table first sql(s"""select * FROM carbon.`$readPath` limit 10""".stripMargin).show() // 3. check rows count val counts = sql(s"""select * FROM carbon.`$readPath`""".stripMargin).count() assert(rows == counts) } catch { case e: Exception => throw e } finally { // 3.delete data files cleanTestData(path) } } // prepare SDK writer output def buildTestData( path: String, num: Int = 3): Unit = { // getCanonicalPath gives path with \, but the code expects /. val writerPath = path.replace("\\", "/") val fields = new Array[Field](3) fields(0) = new Field("name", DataTypes.STRING) fields(1) = new Field("age", DataTypes.INT) fields(2) = new Field("height", DataTypes.DOUBLE) try { val builder = CarbonWriter .builder() .outputPath(writerPath) .uniqueIdentifier(System.currentTimeMillis) .withBlockSize(2) .withCsvInput(new Schema(fields)) .writtenBy("DirectSQLExample") val writer = builder.build() var i = 0 while (i < num) { writer.write(Array[String]("robot" + i, String.valueOf(i), String.valueOf(i.toDouble / 2))) i += 1 } writer.close() } catch { case e: Exception => throw e } } def cleanTestData(path: String): Unit = { FileUtils.deleteDirectory(new File(path)) } } // scalastyle:on println
Example 112
Source File: TestRegisterIndexCarbonTable.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.secondaryindex import java.io.{File, IOException} import org.apache.commons.io.FileUtils import org.apache.spark.sql.Row import org.apache.spark.sql.test.TestQueryExecutor import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants class TestRegisterIndexCarbonTable extends QueryTest with BeforeAndAfterAll { override def beforeAll { sql("drop database if exists carbon cascade") } def restoreData(dblocation: String, tableName: String) = { val destination = dblocation + CarbonCommonConstants.FILE_SEPARATOR + tableName val source = dblocation+ "_back" + CarbonCommonConstants.FILE_SEPARATOR + tableName try { FileUtils.copyDirectory(new File(source), new File(destination)) FileUtils.deleteDirectory(new File(source)) } catch { case e : Exception => throw new IOException("carbon table data restore failed.") } finally { } } def backUpData(dblocation: String, tableName: String) = { val source = dblocation + CarbonCommonConstants.FILE_SEPARATOR + tableName val destination = dblocation+ "_back" + CarbonCommonConstants.FILE_SEPARATOR + tableName try { FileUtils.copyDirectory(new File(source), new File(destination)) } catch { case e : Exception => throw new IOException("carbon table data backup failed.") } } test("register tables test") { val location = TestQueryExecutor.warehouse + CarbonCommonConstants.FILE_SEPARATOR + "dbName" sql("drop database if exists carbon cascade") sql(s"create database carbon location '${location}'") sql("use carbon") sql("""create table carbon.carbontable (c1 string,c2 int,c3 string,c5 string) STORED AS carbondata""") sql("insert into carbontable select 'a',1,'aa','aaa'") sql("create index index_on_c3 on table carbontable (c3, c5) AS 'carbondata'") backUpData(location, "carbontable") backUpData(location, "index_on_c3") sql("drop table carbontable") restoreData(location, "carbontable") restoreData(location, "index_on_c3") sql("refresh table carbontable") sql("refresh table index_on_c3") checkAnswer(sql("select count(*) from carbontable"), Row(1)) checkAnswer(sql("select c1 from carbontable"), Seq(Row("a"))) sql("REGISTER INDEX TABLE index_on_c3 ON carbontable") assert(sql("show indexes on carbontable").collect().nonEmpty) } override def afterAll { sql("drop database if exists carbon cascade") sql("use default") } }
Example 113
Source File: services.scala From InteractiveGraph-neo4j with BSD 2-Clause "Simplified" License | 5 votes |
package org.grapheco.server.pidb

import java.io.{File, FileInputStream}

import org.apache.commons.io.{FileUtils, IOUtils}
import org.grapheco.server.util.{JsonUtils, Logging, ServletContextUtils}
import org.neo4j.driver.v1._
import org.neo4j.graphdb.factory.{GraphDatabaseFactory, GraphDatabaseSettings}
import org.neo4j.graphdb.{GraphDatabaseService, Label, RelationshipType}
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.beans.factory.{DisposableBean, InitializingBean}
import cn.pidb.engine.{BoltService, CypherService, PidbConnector}

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.reflect.ClassTag

class PidbService(boltUrl: String, boltUser: String, boltPassword: String)
  extends BoltService(boltUrl, boltUser, boltPassword) {

  def getRelativeOrAbsoluteFile(path: String) = {
    Some(new File(path)).map { file =>
      if (file.isAbsolute) {
        file
      } else {
        new File(ServletContextUtils.getServletContext.getRealPath(s"/${path}"))
      }
    }.get
  }
}
Example 114
Source File: IntegrationTests.scala From scala-typed-holes with Apache License 2.0 | 5 votes |
package holes import java.nio.charset.StandardCharsets import java.nio.file.{Files, Path, Paths} import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterAll, FunSpec} import scala.sys.process._ class IntegrationTests extends FunSpec with BeforeAndAfterAll { private val pluginJar = sys.props("plugin.jar") private val scalacClasspath = sys.props("scalac.classpath") private val targetDir = Paths.get("target/integration-tests") private def runScalac(args: String*): String = { val buf = new StringBuffer val logger = new ProcessLogger { override def out(s: => String): Unit = { buf.append(s); buf.append('\n') } override def err(s: => String): Unit = { buf.append(s); buf.append('\n') } override def buffer[T](f: => T): T = f } Process( "java" :: "-Dscala.usejavacp=true" :: "-cp" :: scalacClasspath :: "scala.tools.nsc.Main" :: args.toList ).!(logger) buf.toString } private def compileFile(path: Path): String = runScalac( s"-Xplugin:$pluginJar", "-P:typed-holes:log-level:info", "-d", targetDir.toString, path.toString ) override def beforeAll(): Unit = { println(runScalac("-version")) FileUtils.deleteQuietly(targetDir.toFile) Files.createDirectories(targetDir) } describe("produces the expected output") { for (scenario <- Paths.get("src/test/resources").toFile.listFiles().toList.map(_.toPath)) { it(scenario.getFileName.toString) { val expected = new String(Files.readAllBytes(scenario.resolve("expected.txt")), StandardCharsets.UTF_8).trim val actual = compileFile(scenario.resolve("input.scala")).trim if (actual != expected) { println("Compiler output:") println("=====") println(actual) println("=====") } assert(actual === expected) } } } }
Example 115
Source File: Template.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler import java.io.File import org.apache.commons.io.FileUtils import org.fusesource.scalate.TemplateEngine import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.io.Source class Template { val elements = mutable.HashMap[String, ListBuffer[Map[String, Any]]]() def getPageSource(url:String): Unit ={ val page=Source.fromURL(s"${url}/source/xml").mkString val xml=DataObject.fromJson[Map[String, String]](page).getOrElse("value", "") .asInstanceOf[Map[String, String]].getOrElse("tree", "") val doc=XPathUtil.toDocument(xml) elements("Demo")=ListBuffer[Map[String, Any]]() elements("Demo")++=XPathUtil.getListFromXPath("//*[]", doc) } def read(path:String): Unit = { //val path = "/Users/seveniruby/projects/AppCrawlerSuite/AppCrawler/android_20170109145102/elements.yml" val store = (DataObject.fromYaml[URIElementStore](Source.fromFile(path).mkString)).elementStore store.foreach(s => { val reqDom = s._2.reqDom val url = s._2.element.url if (reqDom.size != 0) { val doc = XPathUtil.toDocument(reqDom) if (elements.contains(url) == false) { elements.put(url, ListBuffer[Map[String, Any]]()) } elements(url) ++= XPathUtil.getListFromXPath("//*", doc) val tagsLimit=List("Image", "Button", "Text") elements(url) = elements(url) .filter(_.getOrElse("visible", "true")=="true") .filter(_.getOrElse("tag", "").toString.contains("StatusBar")==false) .filter(e=>tagsLimit.exists(t=>e.getOrElse("tag", "").toString.contains(t))) .distinct } }) } def write(template:String, dir:String) { val engine = new TemplateEngine elements.foreach(e => { val file:String = e._1 println(s"file=${file}") e._2.foreach(m => { val name = m("name") val value = m("value") val label = m("label") val xpath = m("xpath") println(s"name=${name} label=${label} value=${value} xpath=${xpath}") }) val output = engine.layout(template, Map( "file" -> s"Template_${file.split('-').takeRight(1).head.toString}", "elements" -> elements(file)) ) println(output) val directory=new File(dir) if(directory.exists()==false){ FileUtils.forceMkdir(directory) } println(s"template source directory = ${dir}") val appdex=template.split('.').takeRight(2).head scala.reflect.io.File(s"${dir}/${file}.${appdex}").writeAll(output) }) } }
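The directory creation above is FileUtils.forceMkdir; a minimal REPL-style sketch with a placeholder path:

import java.io.File
import org.apache.commons.io.FileUtils

// Creates the directory together with any missing parents; throws if a regular
// file with that name already exists or the directory cannot be created.
FileUtils.forceMkdir(new File("/tmp/appcrawler/templates")) // placeholder path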
Example 116
Source File: Report.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler import org.apache.commons.io.FileUtils import org.scalatest.tools.Runner import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.io.{Source, Codec} import scala.reflect.io.File import collection.JavaConversions._ log.info(s"run ${cmdArgs.mkString(" ")}") Runner.run(cmdArgs) changeTitle() } def changeTitle(title:String=Report.title): Unit ={ val originTitle="ScalaTest Results" val indexFile=reportPath+"/index.html" val newContent=Source.fromFile(indexFile).mkString.replace(originTitle, title) scala.reflect.io.File(indexFile).writeAll(newContent) } } object Report extends Report{ var showCancel=false var title="AppCrawler" var master="" var candidate="" var reportDir="" var store=new URIElementStore def loadResult(elementsFile: String): URIElementStore ={ DataObject.fromYaml[URIElementStore](Source.fromFile(elementsFile).mkString) } }
Example 117
Source File: TestGetClassFile.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.ut

import com.testerhome.appcrawler.plugin.FlowDiff
import com.testerhome.appcrawler.{DiffSuite, Report}
import org.apache.commons.io.FileUtils
import org.scalatest.Checkpoints.Checkpoint
import org.scalatest.{FunSuite, Matchers}

class TestGetClassFile extends FunSuite with Matchers {

  test("test checkpoints") {
    markup {
      """
        |dddddddd
      """.stripMargin
    }
    markup("xxxx")
    val cp = new Checkpoint()
    val (x, y) = (1, 2)
    cp { x should be < 0 }
    cp { y should be > 9 }
    cp.reportAll()
  }

  test("test markup") {
    markup {
      """
        |dddddddd
      """.stripMargin
    }
    markup("xxxx")
  }

  test("get class file") {
    val location = classOf[DiffSuite].getProtectionDomain.getCodeSource.getLocation
    println(location)

    val f = getClass.getResource("/com/xueqiu/qa/appcrawler/ut/TestDiffReport.class").getFile
    println(f)
    FileUtils.copyFile(new java.io.File(f), new java.io.File("/tmp/1.class"))
    println(getClass.getClassLoader.getResources("com/xueqiu/qa/appcrawler/ut/TestDiffReport.class"))
  }
}
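The last test above exercises FileUtils.copyFile; a minimal REPL-style sketch with placeholder source and destination paths:

import java.io.File
import org.apache.commons.io.FileUtils

// Copies the contents to the destination, creating parent directories as needed
// and preserving the source's last-modified date by default.
FileUtils.copyFile(new File("/tmp/TestDiffReport.class"), new File("/tmp/backup/TestDiffReport.class")) // placeholder paths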
Example 118
Source File: Generator.scala From play-soap with Apache License 2.0 | 5 votes |
package play.soap.docs import java.io.File import java.util.Collections import org.apache.commons.io.FileUtils import org.pegdown.ast.WikiLinkNode import org.pegdown.VerbatimSerializer import org.pegdown.LinkRenderer import org.pegdown.Extensions import org.pegdown.PegDownProcessor import play.doc.PrettifyVerbatimSerializer import play.twirl.api.Html object Generator extends App { val outDir = new File(args(0)) val inDir = new File(args(1)) val inPages = args.drop(2) val parser = new PegDownProcessor(Extensions.ALL) val linkRenderer = new LinkRenderer { import LinkRenderer.Rendering override def render(node: WikiLinkNode) = { node.getText.split("\\|", 2) match { case Array(name) => new Rendering(name + ".html", name) case Array(title, name) => new Rendering(name + ".html", title) case _ => new Rendering(node.getText + ".html", node.getText) } } } val verbatimSerializer = Collections.singletonMap[String, VerbatimSerializer](VerbatimSerializer.DEFAULT, PrettifyVerbatimSerializer) val nav = Seq( "Home" -> "Home", "Using sbt WSDL" -> "SbtWsdl", "Using the Play SOAP client" -> "PlaySoapClient", "Using JAX WS Handlers" -> "Handlers", "Security" -> "Security" ) val titleMap = nav.map(t => t._2 -> t._1).toMap // Ensure target directory exists outDir.mkdirs() inPages.foreach { name => val inFile = new File(inDir, name + ".md") val markdown = FileUtils.readFileToString(inFile) val htmlSnippet = parser.markdownToHtml(markdown, linkRenderer, verbatimSerializer) val title = titleMap.get(name) val htmlPage = html.template(title, nav)(Html(htmlSnippet)) FileUtils.writeStringToFile(new File(outDir, name + ".html"), htmlPage.body) } }
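The generator above is a read-transform-write loop over FileUtils string helpers; a REPL-style sketch using the charset-aware overloads (file names and the transformation are placeholders):

import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.commons.io.FileUtils

val markdown = FileUtils.readFileToString(new File("docs/Home.md"), StandardCharsets.UTF_8)
val html = s"<html><body><pre>${markdown.take(100)}</pre></body></html>" // stand-in for real rendering
FileUtils.writeStringToFile(new File("target/docs/Home.html"), html, StandardCharsets.UTF_8)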
Example 119
Source File: UsesMasterSlaveServers.scala From scala-commons with MIT License | 5 votes |
package com.avsystem.commons package redis import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterAll, Suite} import scala.concurrent.Await import scala.concurrent.duration._ trait UsesMasterSlaveServers extends BeforeAndAfterAll with RedisProcessUtils { this: Suite => val masterSlavePath: String = "masterSlave/" + System.currentTimeMillis() val masterSlaveDir: File = new File(masterSlavePath.replaceAllLiterally("/", File.separator)) def masterName: String def ports: Seq[Int] def sentinelPorts: Seq[Int] lazy val addresses: Seq[NodeAddress] = ports.map(port => NodeAddress(port = port)) lazy val sentinelAddresses: Seq[NodeAddress] = sentinelPorts.map(port => NodeAddress(port = port)) var redisProcesses: Seq[RedisProcess] = _ var sentinelProcesses: Seq[RedisProcess] = _ protected def prepareDirectory(): Unit override protected def beforeAll(): Unit = { super.beforeAll() prepareDirectory() val processesFut = Future.traverse(ports)(port => launchRedis( "--port", port.toString, "--daemonize", "no", "--pidfile", "redis.pid", "--dbfilename", "dump.rdb", "--dir", s"$masterSlavePath/$port" )) val sentinelsFut = Future.traverse(sentinelPorts)(port => launchSentinel( s"$masterSlavePath/$port/sentinel.conf", "--port", port.toString, "--daemonize", "no", "--pidfile", "redis.pid", "--dir", s"$masterSlavePath/$port" )) redisProcesses = Await.result(processesFut, 10.seconds) sentinelProcesses = Await.result(sentinelsFut, 10.seconds) } override protected def afterAll(): Unit = { Await.result(Future.traverse(redisProcesses ++ sentinelProcesses)(shutdownRedis), 10.seconds) FileUtils.deleteDirectory(masterSlaveDir) super.afterAll() } }
Example 120
Source File: UsesClusterServers.scala From scala-commons with MIT License | 5 votes |
package com.avsystem.commons
package redis

import java.io.File

import org.apache.commons.io.FileUtils
import org.scalatest.{BeforeAndAfterAll, Suite}

import scala.concurrent.Await
import scala.concurrent.duration._

trait UsesClusterServers extends BeforeAndAfterAll with RedisProcessUtils { this: Suite =>
  val clusterPath: String = "cluster/" + System.currentTimeMillis()
  val clusterDir: File = new File(clusterPath.replaceAllLiterally("/", File.separator))

  def ports: Seq[Int]

  lazy val addresses: Seq[NodeAddress] = ports.map(port => NodeAddress(port = port))

  var redisProcesses: Seq[RedisProcess] = _

  protected def prepareDirectory(): Unit

  protected def slotKey(slot: Int): String = ClusterUtils.SlotKeys(slot)

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    prepareDirectory()
    redisProcesses = Await.result(Future.traverse(ports)(port => launchRedis(
      "--port", port.toString,
      "--daemonize", "no",
      "--pidfile", "redis.pid",
      "--dbfilename", "dump.rdb",
      "--dir", s"$clusterPath/$port",
      "--appendonly", "yes",
      "--appendfilename", "appendonly.aof",
      "--cluster-enabled", "yes",
      "--cluster-config-file", "nodes.conf"
    )), 10.seconds)
  }

  override protected def afterAll(): Unit = {
    Await.result(Future.traverse(redisProcesses)(shutdownRedis), 10.seconds)
    FileUtils.deleteDirectory(clusterDir)
    super.afterAll()
  }
}
Example 121
Source File: MQTTSinkWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.examples.sql.streaming.mqtt

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.SparkSession

object MQTTSinkWordCount {
  def main(args: Array[String]) {
    // Three arguments are required: the socket port, the MQTT broker URL and the topic.
    if (args.length < 3) {
      // scalastyle:off
      System.err.println("Usage: MQTTSinkWordCount <port> <brokerUrl> <topic>")
      // scalastyle:on
      System.exit(1)
    }

    val checkpointDir = System.getProperty("java.io.tmpdir") + "/mqtt-example/"
    // Remove checkpoint directory.
    FileUtils.deleteDirectory(new File(checkpointDir))

    val port = args(0)
    val brokerUrl = args(1)
    val topic = args(2)

    val spark = SparkSession.builder
      .appName("MQTTSinkWordCount").master("local[4]")
      .getOrCreate()

    import spark.implicits._

    // Create DataFrame representing the stream of input lines from local network socket.
    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost").option("port", port)
      .load().select("value").as[String]

    // Split the lines into words.
    val words = lines.flatMap(_.split(" "))

    // Generate running word count.
    val wordCounts = words.groupBy("value").count()

    // Start publishing the counts to MQTT server.
    val query = wordCounts.writeStream
      .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider")
      .option("checkpointLocation", checkpointDir)
      .outputMode("complete")
      .option("topic", topic)
      .option("localStorage", checkpointDir)
      .start(brokerUrl)

    query.awaitTermination()
  }
}
Example 122
Source File: ElasticServer.scala From nexus with Apache License 2.0 | 5 votes |
package ch.epfl.bluebrain.nexus.commons.es.server.embed import java.nio.file.Files import java.util.Arrays._ import akka.http.scaladsl.model.Uri import ch.epfl.bluebrain.nexus.commons.es.server.embed.ElasticServer.MyNode import ch.epfl.bluebrain.nexus.util.{ActorSystemFixture, Randomness} import org.apache.commons.io.FileUtils import org.elasticsearch.common.settings.Settings import org.elasticsearch.index.reindex.ReindexPlugin import org.elasticsearch.node.Node import org.elasticsearch.painless.PainlessPlugin import org.elasticsearch.plugins.Plugin import org.elasticsearch.transport.Netty4Plugin import org.scalatest.wordspec.AnyWordSpecLike import org.scalatest.BeforeAndAfterAll import scala.jdk.CollectionConverters._ import scala.util.Try // $COVERAGE-OFF$ abstract class ElasticServer extends ActorSystemFixture("ElasticServer") with AnyWordSpecLike with BeforeAndAfterAll with Randomness { override protected def beforeAll(): Unit = { super.beforeAll() startElastic() } override protected def afterAll(): Unit = { stopElastic() super.afterAll() } val startPort = freePort() val endPort = startPort + 100 val esUri = Uri(s"http://localhost:$startPort") implicit val ec = system.dispatcher private val clusterName = "elasticsearch" private val dataDir = Files.createTempDirectory("elasticsearch_data_").toFile private val settings = Settings .builder() .put("path.home", dataDir.toString) .put("http.port", s"$startPort-$endPort") .put("http.cors.enabled", true) .put("cluster.name", clusterName) .put("http.type", "netty4") .build private lazy val node = new MyNode(settings, asList(classOf[Netty4Plugin], classOf[PainlessPlugin], classOf[ReindexPlugin])) def startElastic(): Unit = { node.start() () } def stopElastic(): Unit = { node.close() Try(FileUtils.forceDelete(dataDir)) () } } object ElasticServer extends Randomness { import java.util import org.elasticsearch.node.InternalSettingsPreparer private class MyNode(preparedSettings: Settings, classpathPlugins: util.Collection[Class[_ <: Plugin]]) extends Node( InternalSettingsPreparer .prepareEnvironment(preparedSettings, Map.empty[String, String].asJava, null, () => "elasticsearch"), classpathPlugins, true ) {} } // $COVERAGE-ON$
Example 123
Source File: TarFlowSpec.scala From nexus with Apache License 2.0 | 5 votes |
package ch.epfl.bluebrain.nexus.storage import java.io.ByteArrayInputStream import java.nio.file.{Files, Path, Paths} import akka.actor.ActorSystem import akka.stream.alpakka.file.scaladsl.Directory import akka.stream.scaladsl.{FileIO, Source} import akka.testkit.TestKit import akka.util.ByteString import ch.epfl.bluebrain.nexus.storage.utils.{EitherValues, IOEitherValues, Randomness} import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.commons.io.FileUtils import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpecLike import org.scalatest.{BeforeAndAfterAll, Inspectors, OptionValues} import scala.annotation.tailrec class TarFlowSpec extends TestKit(ActorSystem("TarFlowSpec")) with AnyWordSpecLike with Matchers with IOEitherValues with Randomness with EitherValues with OptionValues with Inspectors with BeforeAndAfterAll { val basePath = Files.createTempDirectory("tarflow") val dir1 = basePath.resolve("one") val dir2 = basePath.resolve("two") override def afterAll(): Unit = { super.afterAll() FileUtils.cleanDirectory(basePath.toFile) () } type PathAndContent = (Path, String) "A TarFlow" should { Files.createDirectories(dir1) Files.createDirectories(dir2) def relativize(path: Path): String = basePath.getParent().relativize(path).toString "generate the byteString for a tar file correctly" in { val file1 = dir1.resolve("file1.txt") val file1Content = genString() val file2 = dir1.resolve("file3.txt") val file2Content = genString() val file3 = dir2.resolve("file3.txt") val file3Content = genString() val files = List(file1 -> file1Content, file2 -> file2Content, file3 -> file3Content) forAll(files) { case (file, content) => Source.single(ByteString(content)).runWith(FileIO.toPath(file)).futureValue } val byteString = Directory.walk(basePath).via(TarFlow.writer(basePath)).runReduce(_ ++ _).futureValue val bytes = new ByteArrayInputStream(byteString.toArray) val tar = new TarArchiveInputStream(bytes) @tailrec def readEntries( tar: TarArchiveInputStream, entries: List[PathAndContent] = Nil ): List[PathAndContent] = { val entry = tar.getNextTarEntry if (entry == null) entries else { val data = Array.ofDim[Byte](entry.getSize.toInt) tar.read(data) readEntries(tar, (Paths.get(entry.getName) -> ByteString(data).utf8String) :: entries) } } val directories = List(relativize(basePath) -> "", relativize(dir1) -> "", relativize(dir2) -> "") val untarred = readEntries(tar).map { case (path, content) => path.toString -> content } val expected = files.map { case (path, content) => relativize(path) -> content } ++ directories untarred should contain theSameElementsAs expected } } }
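The afterAll above calls FileUtils.cleanDirectory, which, unlike deleteDirectory, keeps the directory itself; a minimal REPL-style sketch with a placeholder path:

import java.io.File
import org.apache.commons.io.FileUtils

val scratch = new File("/tmp/tarflow-scratch") // placeholder
// Removes everything inside the directory but leaves the (now empty) directory in place;
// throws if the directory does not exist or one of its children cannot be deleted.
FileUtils.cleanDirectory(scratch)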
Example 124
Source File: format_flow.scala From scalabpe with Apache License 2.0 | 5 votes |
package scalabpe import java.io._ import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import scala.io.Source import org.apache.commons.io.FileUtils import scala.xml._ import scala.collection.mutable._ import scalabpe.core._ import org.apache.commons.lang.StringUtils import Tools._ object FormatFlowTool { def help() { println( """ usage: scalabpe.FormatFlowTool [options] dirname options: -h|--help 帮助信息 """) } def parseArgs(args:Array[String]):HashMapStringAny = { val map = HashMapStringAny() var i = 0 val files = ArrayBufferString() while(i < args.size) { args(i) match { case "-h" | "--help" => return null case s if s.startsWith("-") => println("invalid option "+s) return null case _ => files += args(i) i += 1 } } map.put("files",files) map } def main(args:Array[String]) { var params = parseArgs(args) if( params == null ) { help() return } var files = params.nls("files") if( files.size == 0 ) { help() return } var dir = files(0) if( !new File(dir).exists() ) { val p1 = "compose_conf"+File.separator+dir if( new File(p1).exists ) { dir = p1 } else { println("not a valid dir, dir="+dir) return } } processDir(dir,params) } def processDir(dir:String,params:HashMapStringAny) { val files = new File(dir).listFiles.filter(_.getName.endsWith(".flow")) for(f <- files ) { processFile(dir,f.getName,params) } } def processFile(dir:String,f:String,params:HashMapStringAny) { val lines = readAllLines(dir+File.separator+f) // TODO } }
Example 125
Source File: NodeActor.scala From ForestFlow with Apache License 2.0 | 5 votes |
package ai.forestflow.serving.cluster import java.io.File import akka.actor.{Actor, ActorLogging, ActorRef, Props, Timers} import akka.cluster.Cluster import akka.cluster.pubsub.DistributedPubSub import akka.cluster.pubsub.DistributedPubSubMediator.Subscribe import ai.forestflow.domain.CleanupLocalStorage import org.apache.commons.io.FileUtils import com.typesafe.scalalogging.LazyLogging import ai.forestflow.utils.ThrowableImplicits._ import scala.util.{Failure, Success, Try} /*** * This actor is responsible for node-level (host-level) stuff that should be done on a per-node basis. * A good example of this is file system cleanup tasks. */ object NodeActor extends LazyLogging { def props(): Props = Props(new NodeActor) .withDispatcher("blocking-io-dispatcher") def cleanupLocalStorage(path: String): Unit = { val localDir = new File(path) val localDirExists = localDir.exists() logger.info(s"Cleaning up local storage: Local Directory: $localDir , exists? $localDirExists") if (localDirExists) Try(FileUtils.deleteDirectory(localDir)) match { case Success(_) => logger.info(s"Local Directory $localDir cleaned up successfully") case Failure(ex) => logger.error(s"Local Directory $localDir cleanup failed! Reason: ${ex.printableStackTrace}") } } } class NodeActor extends Actor with ActorLogging with Timers { implicit val cluster: Cluster = Cluster(context.system) val mediator: ActorRef = DistributedPubSub(context.system).mediator mediator ! Subscribe(classOf[CleanupLocalStorage].getSimpleName, self) override def receive: Receive = { case CleanupLocalStorage(path) => NodeActor.cleanupLocalStorage(path) } }
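NodeActor wraps FileUtils.deleteDirectory in a Try so a failed cleanup is logged instead of crashing the actor. A stripped-down sketch of that pattern, with a hypothetical path and println standing in for the actor's logger:

import java.io.File
import scala.util.{Failure, Success, Try}
import org.apache.commons.io.FileUtils

def cleanupLocalStorage(path: String): Unit =
  Try(FileUtils.deleteDirectory(new File(path))) match {
    case Success(_)  => println(s"$path cleaned up successfully")
    case Failure(ex) => println(s"$path cleanup failed: ${ex.getMessage}")
  }

cleanupLocalStorage("/tmp/forestflow-local-storage")  // hypothetical directory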
Example 126
Source File: SentencePieceWrapper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow.sentencepiece import java.io.File import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession class SentencePieceWrapper( var sppModel: Array[Byte] ) extends Serializable { @transient private var mspp: SentencePieceProcessor = _ def getSppModel: SentencePieceProcessor = { if (mspp == null){ val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(sppModel) mspp = spp } mspp } } object SentencePieceWrapper { def read( path: String ): SentencePieceWrapper = { val byteArray = Files.readAllBytes(Paths.get(path)) val sppWrapper = new SentencePieceWrapper(byteArray) val spp = new SentencePieceProcessor() spp.loadFromSerializedProto(byteArray) sppWrapper.mspp = spp sppWrapper } } trait WriteSentencePieceModel { def writeSentencePieceModel( path: String, spark: SparkSession, spp: SentencePieceWrapper, suffix: String, filename:String ): Unit = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp folder val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + suffix) .toAbsolutePath.toString val sppFile = Paths.get(tmpFolder, filename).toString // 2. Save Tensorflow state FileUtils.writeByteArrayToFile(new File(sppFile), spp.sppModel) // 3. Copy to dest folder fs.copyFromLocalFile(new Path(sppFile), new Path(path)) // 4. Remove tmp folder FileUtils.deleteDirectory(new File(tmpFolder)) } } trait ReadSentencePieceModel { val sppFile: String def readSentencePieceModel( path: String, spark: SparkSession, suffix: String ): SentencePieceWrapper = { val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) // 1. Create tmp directory val tmpFolder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12)+ suffix) .toAbsolutePath.toString // 2. Copy to local dir fs.copyToLocalFile(new Path(path, sppFile), new Path(tmpFolder)) val sppModelFilePath = new Path(tmpFolder, sppFile) val byteArray = Files.readAllBytes(Paths.get(sppModelFilePath.toString)) val sppWrapper = new SentencePieceWrapper(byteArray) sppWrapper } }
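writeSentencePieceModel stages the serialized model bytes in a temp folder with FileUtils.writeByteArrayToFile and removes the folder afterwards with FileUtils.deleteDirectory. The same round trip in isolation (the payload is a stand-in for a real model):

import java.io.File
import java.nio.file.Files
import org.apache.commons.io.FileUtils

val tmpFolder = Files.createTempDirectory("spp_stage").toFile
val payload   = Array[Byte](1, 2, 3)                       // stand-in for a serialized model
val target    = new File(tmpFolder, "model.spp")

FileUtils.writeByteArrayToFile(target, payload)
assert(FileUtils.readFileToByteArray(target).sameElements(payload))

FileUtils.deleteDirectory(tmpFolder)                       // drop the staging area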
Example 127
Source File: TrainingHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.util

import java.io.File
import java.nio.file.{Files, Paths, StandardCopyOption}
import java.sql.Timestamp
import java.util.Date

import com.johnsnowlabs.nlp.pretrained.ResourceType.ResourceType
import com.johnsnowlabs.nlp.pretrained.{ResourceMetadata, ResourceType}
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.util.MLWriter

object TrainingHelper {

  def saveModel(name: String,
                language: Option[String],
                libVersion: Option[Version],
                sparkVersion: Option[Version],
                modelWriter: MLWriter,
                folder: String,
                category: Option[ResourceType] = Some(ResourceType.NOT_DEFINED)
               ): Unit = {

    // 1. Get current timestamp
    val timestamp = new Timestamp(new Date().getTime)

    // 2. Save model to file
    val file = Paths.get(folder, timestamp.toString).toString.replaceAllLiterally("\\", "/")
    modelWriter.save(file)

    // 3. Zip file
    val tempzipFile = Paths.get(folder, timestamp + ".zip")
    ZipArchiveUtil.zip(file, tempzipFile.toString)

    // 4. Set checksum
    val checksum = FileHelper.generateChecksum(tempzipFile.toString)

    // 5. Create resource metadata
    val meta = new ResourceMetadata(name, language, libVersion, sparkVersion, true, timestamp, true, category = category, checksum)

    val zipfile = Paths.get(meta.fileName)

    // 6. Move the zip
    Files.move(tempzipFile, zipfile, StandardCopyOption.REPLACE_EXISTING)

    // 7. Remove original file
    try {
      FileUtils.deleteDirectory(new File(file))
    } catch {
      case _: java.io.IOException => // file lock may prevent deletion, ignore and continue
    }

    // 8. Add to metadata.json info about resource
    val metadataFile = Paths.get(folder, "metadata.json").toString
    ResourceMetadata.addMetadataToFile(metadataFile, meta)
  }
}
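The try/catch around deleteDirectory in step 7 swallows an IOException from a possible file lock. FileUtils.deleteQuietly expresses the same swallow-and-continue intent in one call; it returns false instead of throwing when deletion fails. A sketch with a hypothetical path:

import java.io.File
import org.apache.commons.io.FileUtils

val deleted = FileUtils.deleteQuietly(new File("/tmp/model-staging"))  // hypothetical path
if (!deleted) println("staging directory could not be removed; continuing anyway")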
Example 128
Source File: FileHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.util import java.io.{File, IOException} import java.nio.charset.Charset import java.nio.file.{Files, Paths} import java.security.MessageDigest import java.text.DecimalFormat import org.apache.commons.io.FileUtils object FileHelper { def writeLines(file: String, lines: Seq[String], encoding: String = "UTF-8"): Unit = { val writer = Files.newBufferedWriter(Paths.get(file), Charset.forName(encoding)) try { var cnt = 0 for (line <- lines) { writer.write(line) if (cnt > 0) writer.write(System.lineSeparator()) cnt += 1 } } catch { case ex: IOException => ex.printStackTrace() } finally if (writer != null) writer.close() } def delete(file: String, throwOnError: Boolean = false): Unit = { val f = new File(file) if (f.exists()) { try { if (f.isDirectory) FileUtils.deleteDirectory(f) else FileUtils.deleteQuietly(f) } catch { case e: Exception => if (throwOnError) throw e else FileUtils.forceDeleteOnExit(f) } } } def generateChecksum(path: String): String = { val arr = Files readAllBytes (Paths get path) val checksum = MessageDigest.getInstance("MD5") digest arr checksum.map("%02X" format _).mkString } def getHumanReadableFileSize(size: Long): String = { if (size <= 0) return "0" val units = Array[String]("B", "KB", "MB", "GB", "TB", "PB", "EB") val digitGroups = (Math.log10(size) / Math.log10(1024)).toInt new DecimalFormat("#,##0.#").format(size / Math.pow(1024, digitGroups)) + " " + units(digitGroups) } }
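FileHelper rolls its own getHumanReadableFileSize; commons-io ships comparable helpers, FileUtils.sizeOfDirectory for a recursive byte count and FileUtils.byteCountToDisplaySize for display (the latter rounds down to whole units rather than formatting with a DecimalFormat). A small sketch against the system temp directory:

import java.io.File
import org.apache.commons.io.FileUtils

val dir   = new File(System.getProperty("java.io.tmpdir"))
val bytes = FileUtils.sizeOfDirectory(dir)          // recursive size in bytes
println(FileUtils.byteCountToDisplaySize(bytes))    // e.g. "117 MB"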
Example 129
Source File: Quickstart.scala From delta with Apache License 2.0 | 5 votes |
package example import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SparkSession, SQLContext} import io.delta.tables._ import org.apache.spark.sql.functions._ import org.apache.commons.io.FileUtils import java.io.File object Quickstart { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("Quickstart") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .getOrCreate() val file = new File("/tmp/delta-table") if (file.exists()) FileUtils.deleteDirectory(file) // Create a table println("Creating a table") val path = file.getCanonicalPath var data = spark.range(0, 5) data.write.format("delta").save(path) // Read table println("Reading the table") val df = spark.read.format("delta").load(path) df.show() // Upsert (merge) new data println("Upsert new data") val newData = spark.range(0, 20).toDF val deltaTable = DeltaTable.forPath(path) deltaTable.as("oldData") .merge( newData.as("newData"), "oldData.id = newData.id") .whenMatched .update(Map("id" -> col("newData.id"))) .whenNotMatched .insert(Map("id" -> col("newData.id"))) .execute() deltaTable.toDF.show() // Update table data println("Overwrite the table") data = spark.range(5, 10) data.write.format("delta").mode("overwrite").save(path) deltaTable.toDF.show() // Update every even value by adding 100 to it println("Update to the table (add 100 to every even value)") deltaTable.update( condition = expr("id % 2 == 0"), set = Map("id" -> expr("id + 100"))) deltaTable.toDF.show() // Delete every even value deltaTable.delete(condition = expr("id % 2 == 0")) deltaTable.toDF.show() // Read old version of the data using time travel print("Read old data using time travel") val df2 = spark.read.format("delta").option("versionAsOf", 0).load(path) df2.show() // Cleanup FileUtils.deleteDirectory(file) spark.stop() } }
Example 130
Source File: QuickstartSQL.scala From delta with Apache License 2.0 | 5 votes |
package example import org.apache.spark.sql.SparkSession import io.delta.tables._ import org.apache.spark.sql.functions._ import org.apache.commons.io.FileUtils import java.io.File object QuickstartSQL { def main(args: Array[String]): Unit = { // Create Spark Conf val spark = SparkSession .builder() .appName("QuickstartSQL") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .getOrCreate() val tableName = "tblname" // Clear up old session spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql(s"DROP TABLE IF EXISTS newData") try { // Create a table println("Creating a table") spark.sql(s"CREATE TABLE $tableName(id LONG) USING delta") spark.sql(s"INSERT INTO $tableName VALUES 0, 1, 2, 3, 4") // Read table println("Reading the table") spark.sql(s"SELECT * FROM $tableName").show() // Upsert (merge) new data println("Upsert new data") spark.sql("CREATE TABLE newData(id LONG) USING parquet") spark.sql("INSERT INTO newData VALUES 3, 4, 5, 6") spark.sql(s"""MERGE INTO $tableName USING newData ON ${tableName}.id = newData.id WHEN MATCHED THEN UPDATE SET ${tableName}.id = newData.id WHEN NOT MATCHED THEN INSERT * """) spark.sql(s"SELECT * FROM $tableName").show() // Update table data println("Overwrite the table") spark.sql(s"INSERT OVERWRITE $tableName VALUES 5, 6, 7, 8, 9") spark.sql(s"SELECT * FROM $tableName").show() // Update every even value by adding 100 to it println("Update to the table (add 100 to every even value)") spark.sql(s"UPDATE $tableName SET id = (id + 100) WHERE (id % 2 == 0)") spark.sql(s"SELECT * FROM $tableName").show() // Delete every even value spark.sql(s"DELETE FROM $tableName WHERE (id % 2 == 0)") spark.sql(s"SELECT * FROM $tableName").show() // Read old version of the data using time travel print("Read old data using time travel") val df2 = spark.read.format("delta").option("versionAsOf", 0).table(tableName) df2.show() } finally { // Cleanup spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql(s"DROP TABLE IF EXISTS newData") spark.stop() } } }
Example 131
Source File: QuickstartSQLOnPaths.scala From delta with Apache License 2.0 | 5 votes |
package example import org.apache.spark.sql.SparkSession import io.delta.tables._ import org.apache.spark.sql.functions._ import org.apache.commons.io.FileUtils import java.io.File object QuickstartSQLOnPaths { def main(args: Array[String]): Unit = { // Create Spark Conf val spark = SparkSession .builder() .appName("QuickstartSQLOnPaths") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .getOrCreate() val tablePath = new File("/tmp/delta-table") if (tablePath.exists()) FileUtils.deleteDirectory(tablePath) // Clear up old session spark.sql(s"DROP TABLE IF EXISTS newData") try { // Create a table println("Creating a table") spark.sql(s"CREATE TABLE delta.`$tablePath`(id LONG) USING delta") spark.sql(s"INSERT INTO delta.`$tablePath` VALUES 0, 1, 2, 3, 4") // Read table println("Reading the table") spark.sql(s"SELECT * FROM delta.`$tablePath`").show() // Upsert (merge) new data println("Upsert new data") spark.sql("CREATE TABLE newData(id LONG) USING parquet") spark.sql("INSERT INTO newData VALUES 3, 4, 5, 6") spark.sql(s"""MERGE INTO delta.`$tablePath` data USING newData ON data.id = newData.id WHEN MATCHED THEN UPDATE SET data.id = newData.id WHEN NOT MATCHED THEN INSERT * """) spark.sql(s"SELECT * FROM delta.`$tablePath`").show() // Update table data println("Overwrite the table") spark.sql(s"INSERT OVERWRITE delta.`$tablePath` VALUES 5, 6, 7, 8, 9") spark.sql(s"SELECT * FROM delta.`$tablePath`").show() // Update every even value by adding 100 to it println("Update to the table (add 100 to every even value)") spark.sql(s"UPDATE delta.`$tablePath` SET id = (id + 100) WHERE (id % 2 == 0)") spark.sql(s"SELECT * FROM delta.`$tablePath`").show() // Delete every even value spark.sql(s"DELETE FROM delta.`$tablePath` WHERE (id % 2 == 0)") spark.sql(s"SELECT * FROM delta.`$tablePath`").show() } finally { // Cleanup spark.sql(s"DROP TABLE IF EXISTS newData") spark.stop() } } }
Example 132
Source File: Utilities.scala From delta with Apache License 2.0 | 5 votes |
package example import java.io.File import io.delta.tables.DeltaTable import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession object Utilities { def main(args: Array[String]): Unit = { // Create a Spark Session with SQL enabled val spark = SparkSession .builder() .appName("Utilities") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") // control the parallelism for vacuum .config("spark.sql.sources.parallelPartitionDiscovery.parallelism", "4") .getOrCreate() // Create a table println("Create a parquet table") val data = spark.range(0, 5) val file = new File("/tmp/parquet-table") val path = file.getAbsolutePath data.write.format("parquet").save(path) // Convert to delta println("Convert to Delta") DeltaTable.convertToDelta(spark, s"parquet.`$path`") // Read table as delta var df = spark.read.format("delta").load(path) // Read old version of data using time travel df = spark.read.format("delta").option("versionAsOf", 0).load(path) df.show() val deltaTable = DeltaTable.forPath(path) // Utility commands println("Vacuum the table") deltaTable.vacuum() println("Describe History for the table") deltaTable.history().show() // Generate manifest println("Generate Manifest files") deltaTable.generate("SYMLINK_FORMAT_MANIFEST") // SQL utility commands println("SQL Vacuum") spark.sql(s"VACUUM '$path' RETAIN 169 HOURS") println("SQL Describe History") println(spark.sql(s"DESCRIBE HISTORY '$path'").collect()) // Cleanup FileUtils.deleteDirectory(new File(path)) spark.stop() } }
Example 133
Source File: CodeGen.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.codegen import java.io.File import com.microsoft.ml.spark.codegen.Config._ import com.microsoft.ml.spark.core.env.FileUtilities._ import org.apache.commons.io.FileUtils import org.apache.commons.io.FilenameUtils._ object CodeGen { def generateArtifacts(): Unit = { println( s"""|Running code generation with config: | topDir: $TopDir | packageDir: $PackageDir | pySrcDir: $PySrcDir | pyTestDir: $PyTestDir | rsrcDir: $RSrcDir""".stripMargin) println("Creating temp folders") if (GeneratedDir.exists()) FileUtils.forceDelete(GeneratedDir) println("Generating python APIs") PySparkWrapperGenerator() println("Generating R APIs") SparklyRWrapperGenerator(Version) def toDir(f: File): File = new File(f, File.separator) //writeFile(new File(pySrcDir, "__init__.py"), packageHelp("")) FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) makeInitFiles() // build init file // package python+r zip files // zipFolder(pyDir, pyZipFile) RPackageDir.mkdirs() zipFolder(RSrcDir, new File(RPackageDir, s"mmlspark-$Version.zip")) //FileUtils.forceDelete(rDir) // leave the python source files, so they will be included in the super-jar // FileUtils.forceDelete(pyDir) } private def makeInitFiles(packageFolder: String = ""): Unit = { val dir = new File(new File(PySrcDir,"mmlspark"), packageFolder) val packageString = if (packageFolder != "") packageFolder.replace("/",".") else "" val importStrings = dir.listFiles.filter(_.isFile).sorted .map(_.getName) .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test")) .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("") writeFile(new File(dir, "__init__.py"), packageHelp(importStrings)) dir.listFiles().filter(_.isDirectory).foreach(f => makeInitFiles(packageFolder +"/" + f.getName) ) } def main(args: Array[String]): Unit = { generateArtifacts() } }
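CodeGen relies on FileUtils.forceDelete to clear the generated output and FileUtils.copyDirectoryToDirectory to nest the override sources inside the target. The difference from plain copyDirectory is where the source directory ends up; a REPL-style sketch with throwaway temp directories:

import java.io.File
import java.nio.file.Files
import org.apache.commons.io.FileUtils

val src  = Files.createTempDirectory("py-src").toFile
Files.createFile(src.toPath.resolve("module.py"))
val dest = Files.createTempDirectory("py-package").toFile

FileUtils.copyDirectory(src, dest)              // contents of src land directly under dest
FileUtils.copyDirectoryToDirectory(src, dest)   // src itself lands under dest as dest/<srcName>
println(dest.listFiles().map(_.getName).toList) // module.py plus the nested py-src* directory

FileUtils.forceDelete(src)
FileUtils.forceDelete(dest)                     // works on files and directories, throws if missing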
Example 134
Source File: DownloaderSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.downloader import java.io.File import java.nio.file.Files import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.commons.io.FileUtils import scala.collection.JavaConversions._ import scala.concurrent.duration.Duration import scala.util.Random class DownloaderSuite extends TestBase { lazy val saveDir = Files.createTempDirectory("Models-").toFile lazy val d = new ModelDownloader(session, saveDir.toURI) test("retry utility should catch flakiness"){ (1 to 20).foreach { i => val result = FaultToleranceUtils.retryWithTimeout(20, Duration.apply(2, "seconds")) { val r = Random.nextDouble() if (r > .5) { println(s"$r failed") throw new IllegalArgumentException("Flakiness") } else if (r < .1){ //Getting stuck val m = 3* 1e3.toLong println(s"$r Stuck for $m") Thread.sleep(m) } println(s"$r Success") 5 } assert(result === 5) } } test("A downloader should be able to download a model", TestBase.Extended) { val m = d.remoteModels.filter(_.name == "CNN").next() val schema = d.downloadModel(m) println(schema) assert(m.size == new File(schema.uri).length()) assert(d.localModels.toList.length == 1) } ignore("A downloader should be able to get all Models " + "and maybeDownload should be fast if models are downloaded", TestBase.Extended) { val (modTimes, modTimes2) = FaultToleranceUtils.retryWithTimeout(10, Duration.apply(500, "seconds")) { d.downloadModels() val modTimes = d.localModels.map(s => new File(s.uri).lastModified()) d.downloadModels() val modTimes2 = d.localModels.map(s => new File(s.uri).lastModified()) (modTimes, modTimes2) } // No modification on second call because models are cached assert(modTimes.toList === modTimes2.toList) // the downloader's local models will reflect the change assert(d.localModels.toList.length == d.remoteModels.toList.length) // there will be a metadata file for every model assert(saveDir.list().count(_.endsWith(".meta")) == d.localModels.toList.length) } override def afterAll(): Unit = { if (saveDir.exists()) { FileUtils.forceDelete(saveDir) } super.afterAll() } }
Example 135
Source File: S3ObjectUploader.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.tools.neptune.export import java.io._ import java.util import java.util.concurrent.{Executors, TimeoutException} import java.util.stream.Collectors import java.util.{Collections, Vector} import com.amazonaws.auth.profile.ProfileCredentialsProvider import com.amazonaws.services.s3.AmazonS3ClientBuilder import com.amazonaws.services.s3.model.{ObjectMetadata, PutObjectRequest} import com.amazonaws.{AmazonServiceException, ClientConfiguration, Protocol, SdkClientException} import org.apache.commons.io.{FileUtils, IOUtils} import org.slf4j.LoggerFactory import scala.concurrent.{Await, ExecutionContext, Future} import scala.concurrent.duration.{FiniteDuration, _} object S3ObjectUploader{ val executor = Executors.newFixedThreadPool(1) implicit val ec: ExecutionContext = scala.concurrent.ExecutionContext.fromExecutor(executor) protected lazy val logger = LoggerFactory.getLogger("s3_uploader") def init(proxyHost:Option[String], proxyPort:Option[Int]) = { val clientRegion = "us-east-1" val config = new ClientConfiguration config.setProtocol(Protocol.HTTPS) proxyHost.foreach(host => config.setProxyHost(host)) proxyPort.foreach(port => config.setProxyPort(port)) val s3Client = AmazonS3ClientBuilder.standard() .withRegion(clientRegion) .withClientConfiguration(config) .withCredentials(new ProfileCredentialsProvider()) .build() s3Client } def persistChunkToS3Bucket(chunkData:String, fileName:String, proxyHost:Option[String], proxyPort:Option[Int], s3Directory:String) = { try{ init(proxyHost, proxyPort).putObject(s3Directory, fileName, chunkData) } catch { case e: AmazonServiceException => e.printStackTrace() throw e case e: SdkClientException => e.printStackTrace() throw e } } def persistChunkToS3Bucket(tmpFile:File, proxyHost:Option[String], proxyPort:Option[Int], s3Directory:String, retryCount:Int = 3):Unit = { try{ val s3UploadTask = Future{init(proxyHost, proxyPort).putObject(s3Directory, tmpFile.getName, tmpFile)}(ec) Await.result(s3UploadTask, 5.minutes) tmpFile.delete() } catch { case e:TimeoutException => if(retryCount > 0) { logger.error("S3 upload task run more than 5 minutes..Going to retry") persistChunkToS3Bucket(tmpFile, proxyHost, proxyPort, s3Directory, retryCount-1) } else{ throw new Exception( "S3 upload task duration was more than 5 minutes") } case e: AmazonServiceException => e.printStackTrace() throw e case e: SdkClientException => e.printStackTrace() throw e } } }
Example 136
Source File: AnalyzeInconsistenciesResult.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import cmwell.analytics.data.InfotonAndIndexWithSystemFields import cmwell.analytics.util.Connector import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.apache.spark.sql.{Column, DataFrame, Row} import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.collection.breakOut object AnalyzeInconsistenciesResult { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass) try { object Opts extends ScallopConf(args) { val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) verify() } Connector( appName = "Analyze InfotonAndIndexWithSystemFields Output", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds: DataFrame = spark.read.parquet(Opts.in()) import org.apache.spark.sql.functions._ // A column expression that counts the number of failures for each constraint. // This will also include null counts, needed to interpret the results. val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) => name -> sum(when(predicate, 0L).otherwise(1L)).as(name) }(breakOut) // Compute the failure counts val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head val results = for { i <- constraints.indices constraintName = constraints(i)._1 failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i) } yield s"$constraintName,$failureCount" FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
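The report above is written with FileUtils.write(File, CharSequence, Charset), which creates any missing parent directories before writing. A small standalone version of the CSV dump (the file name and rows are made up):

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8
import org.apache.commons.io.FileUtils

val results = Seq("uuid-non-null,0", "path-non-null,3")              // made-up failure counts
val report  = new File("target/reports/inconsistencies.csv")         // parent dirs created on demand

FileUtils.write(report, "constraint,failures\n" + results.mkString("\n"), UTF_8)
println(FileUtils.readFileToString(report, UTF_8))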
Example 137
Source File: ExtractFromParquet.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import cmwell.analytics.util.Connector import cmwell.analytics.util.StringUtil._ import org.apache.commons.io.FileUtils import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.rogach.scallop.{ScallopConf, ScallopOption} object ExtractFromParquet { def main(args: Array[String]): Unit = { object Opts extends ScallopConf(args) { val pathsToFind: ScallopOption[String] = opt[String]("paths-to-find", short = 'f', descr = "A file containing the list of paths to look for", required = true) val parquetData: ScallopOption[String] = opt[String]("parquet-file", short = 'p', descr = "A Parquet file containing the data; single string column rdfStatement", required = true) val extractedData: ScallopOption[String] = opt[String]("extracted-data", short = 'd', descr = "The file that extracted data will be written to (in nquads format)", required = true) val pathsNotFound: ScallopOption[String] = opt[String]("paths-not-found", short = 'n', descr = "The output file that any paths that were not found are written to", required = true) val pathsFound: ScallopOption[String] = opt[String]("paths-found", short = 'a', descr = "The output file containing the paths that we found are written to", required = true) verify() } Connector(sparkShell = true, appName = "Extract from parquet").withSparkSessionDo { spark: SparkSession => val pathsToFind = Set(splitLines(FileUtils.readFileToString(new File(Opts.pathsToFind()), UTF_8)): _*) val ds: DataFrame = spark.read.parquet(Opts.parquetData()) // Cheesy parsing of path from an RDF nquad, but sufficient for this purpose def extractPath(rdfStatement: String): String = rdfStatement.substring(7, rdfStatement.indexOf(">")) val statementsFound = ds.rdd.filter { row: Row => val statement = row.getAs[String]("rdfStatement") val path = extractPath(statement) pathsToFind.contains(path) }.collect() // expect the result to be small, so collect is OK // Save all the paths that were not found to file - look for them in other files. val pathsFound: Set[String] = Set(statementsFound.map(row => extractPath(row.getString(0))): _*) println(s"There were ${pathsFound.size} paths found (out of ${pathsToFind.size}).") FileUtils.writeStringToFile(new File(Opts.pathsFound()), pathsFound.mkString("\n"), UTF_8, false) val pathsNotFound = pathsToFind.diff(pathsFound) println(s"There were ${pathsNotFound.size} paths not found.") FileUtils.writeStringToFile(new File(Opts.pathsNotFound()), pathsNotFound.mkString("\n"), UTF_8, false) // Save the RDF statements for the paths that were found val x = statementsFound.map(row => row.getString(0)).mkString("\n") FileUtils.writeStringToFile(new File(Opts.extractedData()), x, UTF_8, false) } } }
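ExtractFromParquet reads whole files with FileUtils.readFileToString and writes results with FileUtils.writeStringToFile, whose last argument is the append flag (false overwrites). A self-contained round trip using a temp file:

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8
import org.apache.commons.io.FileUtils

val f = File.createTempFile("paths", ".txt")
FileUtils.writeStringToFile(f, "/a/b/c\n", UTF_8, false)   // false = overwrite
FileUtils.writeStringToFile(f, "/d/e/f\n", UTF_8, true)    // true  = append
assert(FileUtils.readFileToString(f, UTF_8) == "/a/b/c\n/d/e/f\n")
f.delete()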
Example 138
Source File: DumpCompleteDocumentFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument} import cmwell.analytics.downloader.PartitionedDownloader import cmwell.analytics.util.TimestampConversion.timestampConverter import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints} import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.concurrent.ExecutionContextExecutor object DumpCompleteDocumentFromEs { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpCompleteDocumentFromEs.getClass) // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) implicit val system: ActorSystem = ActorSystem("dump-complete-document-from-es") implicit val executionContext: ExecutionContextExecutor = system.dispatcher implicit val actorMaterializer: ActorMaterializer = ActorMaterializer() try { object Opts extends ScallopConf(args) { val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false) val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-filter", short = 'c', descr = "Filter on current status", default = None) val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet")) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) verify() } val esContactPoint = FindContactPoints.es(Opts.url()) val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_)) val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead) // Calling script should clear output directory as necessary. val objectExtractor = IndexWithCompleteDocument val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out()) PartitionedDownloader.runDownload( esTopology = esTopology, parallelism = Opts.parallelism(), currentOnly = Opts.currentOnly(), lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption, pathPrefixFilter = Opts.pathPrefixFilter.toOption, objectExtractor = objectExtractor, dataWriterFactory = dataWriterFactory, sourceFilter = false) // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion. 
FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile) } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } finally { system.terminate() } } }
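FileUtils.touch backs the _SUCCESS marker used here and in the following dump tools: it creates an empty file (including missing parent directories) or simply updates the timestamp if the file already exists. In isolation, with a temp directory standing in for Opts.out():

import java.nio.file.{Files, Paths}
import org.apache.commons.io.FileUtils

val outDir = Files.createTempDirectory("es-dump").toString   // stands in for Opts.out()
val marker = Paths.get(outDir, "_SUCCESS").toFile

FileUtils.touch(marker)                       // creates the empty marker file
assert(marker.exists() && marker.length() == 0)
FileUtils.touch(marker)                       // idempotent: only bumps last-modified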
Example 139
Source File: DumpKeyFieldsFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.data.{DataWriterFactory, IndexWithKeyFields} import cmwell.analytics.downloader.PartitionedDownloader import cmwell.analytics.util.TimestampConversion.timestampConverter import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints} import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.concurrent.ExecutionContextExecutor object DumpKeyFieldsFromEs { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpKeyFieldsFromEs.getClass) implicit val system: ActorSystem = ActorSystem("dump-key-fields-from-es") implicit val executionContext: ExecutionContextExecutor = system.dispatcher implicit val actorMaterializer: ActorMaterializer = ActorMaterializer() try { // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) object Opts extends ScallopConf(args) { val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false) val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids") val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet")) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-", descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic") verify() } val esContactPoint = FindContactPoints.es(Opts.url()) val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_)) val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead) // Calling script should clear output directory as necessary. 
val objectExtractor = IndexWithKeyFields val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out()) PartitionedDownloader.runDownload( esTopology = esTopology, parallelism = Opts.parallelism(), currentOnly = Opts.currentOnly(), lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption, pathPrefixFilter = Opts.pathPrefixFilter.toOption, objectExtractor = objectExtractor, dataWriterFactory = dataWriterFactory, sourceFilter = Opts.sourceFilter()) // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion. FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile) } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } finally { system.terminate() } } }
Example 140
Source File: DumpUuidOnlyFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.data.{DataWriterFactory, IndexWithUuidOnly} import cmwell.analytics.downloader.PartitionedDownloader import cmwell.analytics.util.TimestampConversion.timestampConverter import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints} import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.concurrent.ExecutionContextExecutor object DumpUuidOnlyFromEs { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpUuidOnlyFromEs.getClass) // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) implicit val system: ActorSystem = ActorSystem("dump-uuid-only-from-es") implicit val executionContext: ExecutionContextExecutor = system.dispatcher implicit val actorMaterializer: ActorMaterializer = ActorMaterializer() try { object Opts extends ScallopConf(args) { val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false) val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids") val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet")) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-", descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic") verify() } val esContactPoint = FindContactPoints.es(Opts.url()) val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_)) val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead) // Calling script should clear output directory as necessary. 
val objectExtractor = IndexWithUuidOnly val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out()) PartitionedDownloader.runDownload( esTopology = esTopology, parallelism = Opts.parallelism(), currentOnly = Opts.currentOnly(), lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption, pathPrefixFilter = Opts.pathPrefixFilter.toOption, objectExtractor = objectExtractor, dataWriterFactory = dataWriterFactory, sourceFilter = Opts.sourceFilter()) // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion. FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile) } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } finally { system.terminate() } } }
Example 141
Source File: DumpSystemFieldsFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.data.{DataWriterFactory, IndexWithSystemFields} import cmwell.analytics.downloader.PartitionedDownloader import cmwell.analytics.util.TimestampConversion.timestampConverter import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints} import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.concurrent.ExecutionContextExecutor object DumpSystemFieldsFromEs { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpSystemFieldsFromEs.getClass) implicit val system: ActorSystem = ActorSystem("dump-system-fields-from-es") implicit val executionContext: ExecutionContextExecutor = system.dispatcher implicit val actorMaterializer: ActorMaterializer = ActorMaterializer() try { // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) object Opts extends ScallopConf(args) { val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false) val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids") val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet")) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-", descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic") verify() } val esContactPoint = FindContactPoints.es(Opts.url()) val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_)) val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead) // Calling script should clear output directory as necessary. 
val objectExtractor = IndexWithSystemFields val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out()) PartitionedDownloader.runDownload( esTopology = esTopology, parallelism = Opts.parallelism(), currentOnly = Opts.currentOnly(), lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption, pathPrefixFilter = Opts.pathPrefixFilter.toOption, objectExtractor = objectExtractor, dataWriterFactory = dataWriterFactory, sourceFilter = Opts.sourceFilter()) // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion. FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile) } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } finally { system.terminate() } } }
Example 142
Source File: DataWriterFactory.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import java.io.File import java.nio.file.Paths import akka.actor.ActorSystem import akka.stream.ActorMaterializer import cmwell.analytics.util.Shard import org.apache.avro.generic.GenericRecord import org.apache.commons.io.FileUtils import org.apache.parquet.hadoop.metadata.CompressionCodecName import scala.concurrent.ExecutionContextExecutor trait DataWriterFactory[T <: GenericRecord] { def apply(shard: Shard): DataWriter[T] } object DataWriterFactory { private val compressionCodec = CompressionCodecName.SNAPPY def file[T <: GenericRecord with CsvGenerator](format: String, objectExtractor: ObjectExtractor[T], outDirectory: String): Shard => DataWriter[T] = { val extension = s".$format" + (if (format == "parquet") s"${compressionCodec.getExtension}" else "") // Generate a meaningful file name for the target file name based on the source shard index name and shard number. (sourceShard: Shard) => { val outFile: File = Paths.get(outDirectory, s"part-r-${sourceShard.indexName}.${sourceShard.shard}$extension").toFile if (outFile.exists) FileUtils.forceDelete(outFile) new File(outFile.getParent).mkdirs() FileDataWriter[T](format, objectExtractor.schema, outFile.toString, compressionCodec) } } def index[T <: GenericRecord](indexMap: Map[String, String], // source-index -> target-index esEndpoint: String) (implicit system: ActorSystem, executionContext: ExecutionContextExecutor, actorMaterializer: ActorMaterializer ): Shard => DataWriter[T] = { (sourceShard: Shard) => { val targetIndex = indexMap(sourceShard.indexName) new IndexDataWriter[T](indexName = targetIndex, esEndpoint = esEndpoint) } } }
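DataWriterFactory prepares each output file by force-deleting a stale copy and then creating the parent directory. FileUtils.forceMkdir is an alternative to File.mkdirs that throws a descriptive IOException when the directory cannot be created; the target path below is hypothetical:

import java.nio.file.Paths
import org.apache.commons.io.FileUtils

val outFile = Paths.get("target/es-dump", "part-r-cm_well_p0.0.parquet").toFile  // hypothetical target
if (outFile.exists()) FileUtils.forceDelete(outFile)   // drop output from a previous run
FileUtils.forceMkdir(outFile.getParentFile)            // ensure target/es-dump exists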
Example 143
Source File: MetadataTest.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{ BeforeAndAfter, FunSuite, Matchers } class MetadataTest extends FunSuite with BeforeAndAfter with Matchers with SparkTesting { val path = "target/test/MetadataTest" val metadata = Metadata(numVertices=1) before { FileUtils.deleteDirectory(new File(path)) } test("save and load") { Metadata.save(spark, metadata, path) Metadata.load(spark, path) shouldBe (metadata) } }
Example 144
Source File: GraphBuilderAppTest.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{ BeforeAndAfter, FunSuite, Matchers } class GraphBuilderAppTest extends FunSuite with BeforeAndAfter with Matchers with GraphTesting with SparkTesting { val path = "target/test/GraphBuilderAppTest" before { FileUtils.deleteDirectory(new File(path)) } // TODO(jd): design a better integration test as this just runs the app without assertions test("integration test") { val options = new GraphBuilderApp.Options() options.output = path options.numPartitions = 1 val input = spark.sparkContext.parallelize(Seq( (1, 5, 1.0), (2, 1, 1.0), (3, 1, 1.0), (4, 2, 1.0), (4, 3, 1.0), (5, 3, 1.0), (5, 4, 1.0) ).map(_.productIterator.toSeq.mkString("\t"))) GraphBuilderApp.runFromInputs(options, spark, input) } }
Example 145
Source File: PageRankAppTest.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import java.io.File import org.apache.commons.io.FileUtils import org.apache.spark.storage.StorageLevel import org.scalatest.{ BeforeAndAfter, Matchers, FunSuite } class PageRankAppTest extends FunSuite with BeforeAndAfter with Matchers with GraphTesting with SparkTesting { val path = "target/test/PageRankAppTest" before { FileUtils.deleteDirectory(new File(path)) } // TODO(jd): design a better integration test as this just runs the app without assertions test("integration test") { val options = new PageRankApp.Options() options.output = path val numVertices = 5 val prior = 1.0 / numVertices val stats = Seq(s"numVertices,$numVertices") val edges = spark.sparkContext.parallelize(Seq[OutEdgePair]( // node 1 is dangling (2, OutEdge(1, 1.0)), (3, OutEdge(1, 1.0)), (4, OutEdge(2, 0.5)), (4, OutEdge(3, 0.5)), (5, OutEdge(3, 0.5)), (5, OutEdge(4, 0.5)) )) val vertices = spark.sparkContext.parallelize(Seq[RichVertexPair]( (1, VertexMetadata(prior, true)), (2, VertexMetadata(prior, false)), (3, VertexMetadata(prior, false)), (4, VertexMetadata(prior, false)), (5, VertexMetadata(prior, false)) )) val graph = PageRankGraph( numVertices, edges.persist(StorageLevel.MEMORY_ONLY), vertices.persist(StorageLevel.MEMORY_ONLY) ) PageRankApp.runFromInputs( spark, options, graph, priorsOpt = None ) } }
Example 146
Source File: YarnShuffleIntegrationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
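The shuffle test copies the registered-executors directory with FileUtils.copyDirectory because only one process can open the LevelDB files at a time, then deletes the duplicate. The copy/delete pair on its own, using a throwaway directory:

import java.io.File
import java.nio.file.Files
import org.apache.commons.io.FileUtils

val original = Files.createTempDirectory("registeredExecutors.ldb").toFile
Files.createFile(original.toPath.resolve("CURRENT"))       // stand-in for LevelDB contents

val copy = new File(original.getAbsolutePath + "_dup")
FileUtils.copyDirectory(original, copy)                    // snapshot the whole tree
assert(new File(copy, "CURRENT").exists())

FileUtils.deleteDirectory(copy)
FileUtils.deleteDirectory(original)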
Example 147
Source File: SortShuffleSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { super.beforeAll() conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
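getAllFiles above works because FileUtils.listFiles recurses into subdirectories whenever the second filter (the directory filter) is TrueFileFilter.INSTANCE; passing null there would list only the top level. A compact demonstration:

import java.io.File
import java.nio.file.Files
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter

val root = Files.createTempDirectory("shuffle-files").toFile
val sub  = new File(root, "0"); sub.mkdirs()
FileUtils.touch(new File(sub, "shuffle_0_0_0.data"))

val all = FileUtils.listFiles(root, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala
assert(all.map(_.getName).toSet == Set("shuffle_0_0_0.data"))
FileUtils.deleteDirectory(root)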
Example 148
Source File: TransformerSerialization.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import java.nio.file.{Files, Path} import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfter, Suite} import ai.deepsense.deeplang.doperables.Transformer import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.{DeeplangIntegTestSupport, ExecutionContext} trait TransformerSerialization extends Suite with BeforeAndAfter { var tempDir: Path = _ before { tempDir = Files.createTempDirectory("writeReadTransformer") } after { FileUtils.deleteDirectory(tempDir.toFile) } } object TransformerSerialization { implicit class TransformerSerializationOps(private val transformer: Transformer) { def applyTransformationAndSerialization( path: Path, df: DataFrame)(implicit executionContext: ExecutionContext): DataFrame = { val result = transformer._transform(executionContext, df) val deserialized = loadSerializedTransformer(path) val resultFromSerializedTransformer = deserialized._transform(executionContext, df) DeeplangIntegTestSupport.assertDataFramesEqual(result, resultFromSerializedTransformer) result } def loadSerializedTransformer( path: Path)( implicit executionContext: ExecutionContext): Transformer = { val outputPath: Path = path.resolve(this.getClass.getName) transformer.save(executionContext, outputPath.toString) Transformer.load(executionContext, outputPath.toString) } } }
Example 149
Source File: MavenAddManagedDependenciesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import com.ebay.rtran.maven.util.MavenModelUtil import com.ebay.rtran.maven.util.MavenModelUtil.SimpleDependency import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} class MavenAddManagedDependenciesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenAddManagedDependenciesRule" should "be able to add dependencies to dependency management" in { val ruleConfig = MavenAddManagedDependenciesRuleConfig( Set( SimpleDependency("org.slf4j", "slf4j-api", Some("1.7.12")), SimpleDependency("com.typesafe.akka", "akka-actor_2.11", Some("2.3.9")) ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenAddManagedDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx val parent = transformed.parents.head val dm1 = parent.managedDependencies.values.find(_.getArtifactId == "slf4j-api") dm1 should not be None dm1.get.getVersion should be ("1.7.12") val dm2 = parent.managedDependencies.values.find(_.getArtifactId == "akka-actor_2.11") dm2 should not be None dm2.get.getVersion should be ("2.4.17") } }
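The beforeEach here is a common fixture pattern: FileUtils.deleteQuietly removes the previous scratch copy without complaining if it is missing, and FileUtils.copyDirectory recreates it from the pristine resource. The same reset in miniature (the pristine directory is fabricated for the sketch):

import java.io.File
import java.nio.file.Files
import org.apache.commons.io.FileUtils

val pristine = Files.createTempDirectory("mvnproject").toFile      // stands in for the test resource
Files.createFile(pristine.toPath.resolve("pom.xml"))
val scratch  = new File(pristine.getParentFile, pristine.getName + "-bak")

FileUtils.deleteQuietly(scratch)             // no exception if the previous copy is missing
FileUtils.copyDirectory(pristine, scratch)   // fresh copy for the test to mutate
assert(new File(scratch, "pom.xml").exists())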
Example 150
Source File: MultiModuleMavenModelProviderTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.{File, FileReader} import org.apache.commons.io.FileUtils import org.apache.maven.model.io.xpp3.MavenXpp3Reader import org.codehaus.plexus.util.xml.Xpp3Dom import org.scalatest.{FlatSpecLike, Matchers} import collection.JavaConversions._ class MultiModuleMavenModelProviderTest extends FlatSpecLike with Matchers { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) "MavenModelProvider" should "resolve all the pom files in the project" in { val projectCtx = new MavenProjectCtx(projectRoot) val provider = new MultiModuleMavenModelProvider val model = provider.create(projectCtx) model.modules foreach { m => m.resolvedDependencies foreach {dep => Option(dep.getVersion) should not be None } } } "MavenModelProvider" should "resolve all the pom files recursively in the project" in { val dir = new File(getClass.getClassLoader.getResource("recursive").getFile) val projectCtx = new MavenProjectCtx(dir) val provider = new MultiModuleMavenModelProvider val model = provider.create(projectCtx) model.modules.size should be (5) } "MavenModelProvider" should "not remove empty property nodes" in { val dir = new File(projectRoot.getParent, projectRoot.getName + "-bak") FileUtils.deleteQuietly(dir) FileUtils.copyDirectory(projectRoot, dir) val projectCtx = new MavenProjectCtx(dir) val provider = new MultiModuleMavenModelProvider val model = provider.create(projectCtx) provider save model val pom = new MavenXpp3Reader().read(new FileReader(new File(dir, "pom.xml"))) pom.getProperties.getProperty("empty.property1") should be ("") pom.getProperties.getProperty("empty.property2") should be ("") pom.getProperties.getProperty("empty.property3") should be ("") } "MavenModelProvider" should "not break on xlint element" in { val dir = new File(projectRoot.getParent, projectRoot.getName + "-bak") FileUtils.deleteQuietly(dir) FileUtils.copyDirectory(projectRoot, dir) val projectCtx = new MavenProjectCtx(dir) val provider = new MultiModuleMavenModelProvider val model = provider.create(projectCtx) for { root <- model.parents.headOption build <- Option(root.pomModel.getBuild) sourcePlugin <- build.getPlugins.find(_.getArtifactId == "some-maven-plugin") } { build.removePlugin(sourcePlugin) } provider save model val pom = new MavenXpp3Reader().read(new FileReader(new File(dir, "pom.xml"))) pom.getBuild.getPlugins.size() should be(1) val plugin = pom.getBuild.getPlugins.find(_.getArtifactId == "maven-source-plugin") plugin shouldNot be(None) plugin.map(_.getConfiguration.asInstanceOf[Xpp3Dom].getChild("compilerArguments").getChildCount) should be(Some(3)) } }
Example 151
Source File: MavenDependenciesMappingRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import com.ebay.rtran.maven.util.MavenModelUtil import com.ebay.rtran.maven.util.MavenModelUtil.SimpleDependency import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenDependenciesMappingRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenDependenciesMappingRule" should "be able to alter dependencies according to mapping" in { val ruleConfig = MavenDependenciesMappingRuleConfig( Set(SimpleDependency("junit", "junit")), Set(SimpleDependency("org.slf4j", "slf4j-api"), SimpleDependency("org.slf4j", "slf4j-log4j12")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenDependenciesMappingRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (false) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-api") should be (true) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-log4j12") should be (true) } } "MavenDependenciesMappingRule" should "not alter dependencies that don't exist" in { val ruleConfig = MavenDependenciesMappingRuleConfig( Set(SimpleDependency("org.slf4j", "slf4j-api")), Set(SimpleDependency("org.slf4j", "slf4j-log4j12")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenDependenciesMappingRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-api") should be (false) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-log4j12") should be (false) } } "MavenDependenciesMappingRule" should "alter dependencies matches that match other condition" in { val ruleConfig = MavenDependenciesMappingRuleConfig( Set(SimpleDependency("junit", "junit", Some("4.9"))), Set(SimpleDependency("org.slf4j", "slf4j-api"), SimpleDependency("org.slf4j", "slf4j-log4j12")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenDependenciesMappingRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => if (module.pomModel.getPackaging == "pom") { module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (true) } else { module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (false) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-api") should be (true) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-log4j12") should be (true) } } } "MavenDependenciesMappingRule" should "not alter dependencies if other condition doesn't match" in { val ruleConfig = MavenDependenciesMappingRuleConfig( Set(SimpleDependency("junit", "junit", 
scope = Some("compile"))), Set(SimpleDependency("org.slf4j", "slf4j-api"), SimpleDependency("org.slf4j", "slf4j-log4j12")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenDependenciesMappingRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (true) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-api") should be (false) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-log4j12") should be (false) } } }
Example 152
Source File: MavenRemoveDependenciesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import com.ebay.rtran.maven.util.MavenModelUtil import com.ebay.rtran.maven.util.MavenModelUtil.SimpleDependency import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenRemoveDependenciesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenRemoveDependencies" should "be able to remove dependencies" in { val ruleConfig = MavenRemoveDependenciesRuleConfig( Set(SimpleDependency("junit", "junit")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (false) } } "MavenAddDependenciesRule" should "not remove dependencies that don't exist" in { val ruleConfig = MavenRemoveDependenciesRuleConfig( Set(SimpleDependency("org.slf4j", "slf4j-api")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveDependenciesRule(ruleConfig) val originalSizes = model.modules map (_.pomModel.getDependencies.size) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules map (_.pomModel.getDependencies.size) should be (originalSizes) } "MavenRemoveDependencies" should "remove dependencies matches that match other condition" in { val ruleConfig = MavenRemoveDependenciesRuleConfig( Set(SimpleDependency("junit", "junit", version = Some("4.9"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => if (module.pomModel.getPackaging == "pom") { module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (true) } else { module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (false) } } } "MavenRemoveDependencies" should "not remove dependencies if other condition doesn't match" in { val ruleConfig = MavenRemoveDependenciesRuleConfig( Set(SimpleDependency("junit", "junit", scope = Some("compile"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (true) } } }
Example 153
Source File: MavenPluginsMappingRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenPluginsMappingRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenPluginsMappingRule" should "be able to alter both plugins and managed plugins" in { val ruleConfig = MavenPluginsMappingRuleConfig( List( PluginMapping( SimplePlugin(Some("com.ebay.rtran.old"), "some-maven-plugin"), SimplePlugin(Some("com.ebay.rtran.new"), "some-maven-plugin") ) ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenPluginsMappingRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins .exists(_.getGroupId == "com.ebay.rtran.old") should be (false) transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins .exists(_.getGroupId == "com.ebay.rtran.new") should be (true) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getGroupId == "com.ebay.rtran.old") should be (false) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getGroupId == "com.ebay.rtran.new") should be (true) } "MavenPluginsMappingRule" should "not alter plugins or managed plugins that don't exist" in { val ruleConfig = MavenPluginsMappingRuleConfig( List( PluginMapping( SimplePlugin(Some("com.ebay.rtran.old"), "non-exist"), SimplePlugin(Some("com.ebay.rtran.new"), "non-exist") ) ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenPluginsMappingRule(ruleConfig) val mpSize = model.parents.head .pomModel.getBuild.getPluginManagement.getPlugins.size val pluginSize = model.parents.head .pomModel.getBuild.getPlugins.size provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins.size should be (mpSize) transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins .exists(_.getGroupId == "com.ebay.rtran.old") should be (true) transformed.parents.head .pomModel.getBuild.getPlugins.size should be (pluginSize) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getGroupId == "com.ebay.rtran.old") should be (true) } }
Example 154
Source File: MavenRemoveManagedDependenciesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import com.ebay.rtran.maven.util.MavenModelUtil import MavenModelUtil.SimpleDependency import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, Matchers, FlatSpecLike} import scala.collection.JavaConversions._ class MavenRemoveManagedDependenciesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenRemoveManagedDependenciesRule" should "be able to remove managed dependencies" in { val ruleConfig = MavenRemoveManagedDependenciesRuleConfig( Set(SimpleDependency("org.eclipse.aether", "aether-spi")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveManagedDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getDependencyManagement.getDependencies.exists(_.getArtifactId == "aether-spi") should be (false) } "MavenRemoveManagedDependenciesRule" should "not remove managed dependencies that don't exist" in { val ruleConfig = MavenRemoveManagedDependenciesRuleConfig( Set(SimpleDependency("org.slf4j", "slf4j-api")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveManagedDependenciesRule(ruleConfig) val originalSize = model.parents.head .pomModel.getDependencyManagement.getDependencies.size provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getDependencyManagement.getDependencies.size should be (originalSize) } "MavenRemoveManagedDependenciesRule" should "remove managed dependencies matches that match other condition" in { val ruleConfig = MavenRemoveManagedDependenciesRuleConfig( Set(SimpleDependency("org.eclipse.aether", "aether-spi", version = Some("1.0.2.v20150114"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveManagedDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getDependencyManagement.getDependencies.exists(_.getArtifactId == "aether-spi") should be (false) } "MavenRemoveManagedDependenciesRule" should "not remove managed dependencies if other condition doesn't match" in { val ruleConfig = MavenRemoveManagedDependenciesRuleConfig( Set(SimpleDependency("org.eclipse.aether", "aether-spi", version = Some("1.0.3.v20150114"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemoveManagedDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getDependencyManagement.getDependencies.exists(_.getArtifactId == "aether-spi") should be (true) } }
Example 155
Source File: MavenRemoveRepositoriesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} class MavenRemoveRepositoriesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenRemoveRepositoriesRule" should "remove repository that matches given patterns" in { val ruleConfig = MavenRemoveRepositoriesRuleConfig( Set( ".*/content/repositories/releases[/]?", ".*/content/repositories/snapshots[/]?" ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val rule = new MavenRemoveRepositoriesRule(ruleConfig) val model = provider create projectCtx provider save (rule transform model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getRepositories.size should be (0) } } }
Example 156
Source File: MavenAddDependenciesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import com.ebay.rtran.maven.util.MavenModelUtil import com.ebay.rtran.maven.util.MavenModelUtil.SimpleDependency import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenAddDependenciesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenAddDependenciesRule" should "be able to add dependencies" in { val ruleConfig = MavenAddDependenciesRuleConfig( Set( SimpleDependency("org.slf4j", "slf4j-api"), SimpleDependency("org.slf4j", "slf4j-log4j12") ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenAddDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-api") should be (true) module.pomModel.getDependencies.exists(_.getArtifactId == "slf4j-log4j12") should be (true) } } "MavenAddDependenciesRule" should "not add dependencies that already exist" in { val ruleConfig = MavenAddDependenciesRuleConfig( Set( SimpleDependency("junit", "junit") ) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenAddDependenciesRule(ruleConfig) val originalSize = model.modules .find(_.pomModel.getPackaging == "pom") .map(_.pomModel.getDependencies.size) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules .find(_.pomModel.getPackaging == "pom") .map(_.pomModel.getDependencies.size) should be (originalSize) transformed.modules foreach { module => module.pomModel.getDependencies.exists(_.getArtifactId == "junit") should be (true) } } }
Example 157
Source File: MavenRemovePluginsRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenRemovePluginsRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenRemovePluginsRule" should "be able to remove both plugins and managed plugins" in { val ruleConfig = MavenRemoveManagedPluginsRuleConfig( Set(SimplePlugin(artifactId = "maven-source-plugin")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemovePluginsRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins .exists(_.getArtifactId == "maven-source-plugin") should be (false) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getArtifactId == "maven-source-plugin") should be (false) } "MavenRemovePluginsRule" should "not remove plugins or managed plugins that don't exist" in { val ruleConfig = MavenRemoveManagedPluginsRuleConfig( Set(SimplePlugin(artifactId = "maven-surefire-plugin")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemovePluginsRule(ruleConfig) val mpSize = model.parents.head.pomModel.getBuild.getPluginManagement.getPlugins.size val pluginSize = model.parents.head.pomModel.getBuild.getPlugins.size provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins.size should be (mpSize) transformed.parents.head .pomModel.getBuild.getPlugins.size should be (pluginSize) } "MavenRemovePluginsRule" should "remove both plugins and managed plugins matches that match other condition" in { val ruleConfig = MavenRemoveManagedPluginsRuleConfig( Set(SimplePlugin(artifactId = "maven-source-plugin", version = Some("2.2.1"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemovePluginsRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins .exists(_.getArtifactId == "maven-source-plugin") should be (false) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getArtifactId == "maven-source-plugin") should be (false) } "MavenRemoveManagedPluginsRule" should "not remove plugins or managed plugins if other condition doesn't match" in { val ruleConfig = MavenRemoveManagedPluginsRuleConfig( Set(SimplePlugin(artifactId = "maven-source-plugin", version = Some("2.2.0"))) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenRemovePluginsRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.parents.head .pomModel.getBuild.getPluginManagement.getPlugins 
.exists(_.getArtifactId == "maven-source-plugin") should be (true) transformed.parents.head .pomModel.getBuild.getPlugins .exists(_.getArtifactId == "maven-source-plugin") should be (true) } }
Example 158
Source File: MavenExcludeDependenciesRuleTest.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.maven import java.io.File import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfterEach, FlatSpecLike, Matchers} import scala.collection.JavaConversions._ class MavenExcludeDependenciesRuleTest extends FlatSpecLike with Matchers with BeforeAndAfterEach { val projectRoot = new File(getClass.getClassLoader.getResource("mvnproject").getFile) val destProjectRoot = new File(projectRoot.getParentFile, projectRoot.getName + "-bak") override def beforeEach = { FileUtils.deleteQuietly(destProjectRoot) FileUtils.copyDirectory(projectRoot, destProjectRoot) } "MavenExcludeDependenciesRule" should "exclude the dependencies if they are used transitively" in { val ruleConfig = MavenExcludeDependenciesRuleConfig( Set(SimpleExclusion("org.springframework", "spring-aop")) ) val projectCtx = new MavenProjectCtx(destProjectRoot) val provider = new MultiModuleMavenModelProvider val model = provider create projectCtx val rule = new MavenExcludeDependenciesRule(ruleConfig) provider save rule.transform(model) val transformed = provider create projectCtx transformed.modules foreach { module => if (module.pomModel.getPackaging != "war") { module.pomModel.getDependencies.forall(_.getExclusions.size == 0) should be (true) }else { module.pomModel.getDependencies.exists(_.getExclusions.size > 0) should be (true) } } } }
Example 159
Source File: ModifyFilesRule.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.generic import org.apache.commons.io.FileUtils import com.ebay.rtran._ import com.ebay.rtran.api.{IRule, IRuleConfig} import com.ebay.rtran.generic.util.{EncodingDetector, FilePathMatcher} class ModifyFilesRule(ruleConfig: ModifyFilesRuleConfig) extends IRule[AllFilesModel] { override def transform(model: AllFilesModel): AllFilesModel = { val modified = model.files filter {file => FilePathMatcher(model.projectRoot, ruleConfig.pathPattern).map(_ matches file) getOrElse false } map {file => val content = ruleConfig.encoding map (encoding => FileUtils.readFileToString(file, encoding)) getOrElse { val (encoding, bytes) = EncodingDetector.guessEncoding(file) new String(bytes, encoding) } val newContent = ruleConfig.contentMappings.foldLeft(content) {(c, contentMapping) => contentMapping match { case ContentMapping(regex, replacement, false) => c.replaceAll(regex, replacement) case ContentMapping(regex, replacement, true) => c.replaceFirst(regex, replacement) } } if (content != newContent) { FileUtils.write(file, newContent, false) Some(file) } else None } collect { case Some(f) => f } model.copy(modified = modified) } } case class ModifyFilesRuleConfig(pathPattern: String, encoding: Option[String], contentMappings: List[ContentMapping]) extends IRuleConfig case class ContentMapping(regex: String, replacement: String, firstOnly: Boolean = false)
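The rule above reads each matched file into memory with FileUtils.readFileToString, applies its regex mappings, and writes the result back with FileUtils.write(file, content, false), where false means overwrite rather than append. A reduced sketch of that read-transform-write round trip, assuming an illustrative file and mapping:

import java.io.File
import org.apache.commons.io.FileUtils

object RewriteInPlace {
  def replaceAllInFile(file: File, regex: String, replacement: String, encoding: String = "UTF-8"): Boolean = {
    val content = FileUtils.readFileToString(file, encoding)
    val updated = content.replaceAll(regex, replacement)
    if (updated != content) {
      // append = false truncates and overwrites, mirroring the rule's FileUtils.write call
      FileUtils.write(file, updated, false)
      true
    } else false
  }

  def main(args: Array[String]): Unit = {
    val f = new File("target/demo.properties") // illustrative file
    FileUtils.writeStringToFile(f, "endpoint=http://old.example.com", "UTF-8")
    val changed = replaceAllInFile(f, "old\\.example\\.com", "new.example.com")
    println(s"changed=$changed content=${FileUtils.readFileToString(f, "UTF-8")}")
  }
}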
Example 160
Source File: AllFilesModel.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.generic import java.io.File import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import com.ebay.rtran.api.{IModel, IModelProvider} import scala.collection.JavaConversions._ case class AllFilesModel(projectRoot: File, files: List[File], modified: List[File] = List.empty) extends IModel class AllFilesModelProvider extends IModelProvider[AllFilesModel, GenericProjectCtx] { override def id(): String = getClass.getName override def save(model: AllFilesModel): Unit = { // all files operations are taken in place // simply validate the model if (!model.files.forall(_.exists)) { throw new IllegalStateException(s"${model.files.filterNot(_.exists)} does not exist") } } override def create(project: GenericProjectCtx): AllFilesModel = AllFilesModel( project.rootDir, FileUtils.listFiles(project.rootDir, TrueFileFilter.TRUE, TrueFileFilter.TRUE).toList ) }
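Here listFiles receives TrueFileFilter.TRUE for both the file filter and the directory filter, so the entire project tree is collected (TRUE and INSTANCE name the same accept-everything filter). Passing null as the directory filter instead keeps the listing to the top level only, which the short sketch below contrasts; the root directory is illustrative:

import java.io.File
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter

object ListingDepth {
  def main(args: Array[String]): Unit = {
    val root = new File(".") // illustrative root
    // Recursive: accept every file and descend into every subdirectory.
    val everything = FileUtils.listFiles(root, TrueFileFilter.TRUE, TrueFileFilter.TRUE).asScala
    // Top level only: a null directory filter stops the walk from descending.
    val topLevel = FileUtils.listFiles(root, TrueFileFilter.TRUE, null).asScala
    println(s"recursive=${everything.size} topLevel=${topLevel.size}")
  }
}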
Example 161
Source File: FilePathMatcher.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.generic.util import java.io.File import java.nio.file.{FileSystems, PathMatcher} import com.typesafe.scalalogging.LazyLogging import org.apache.commons.io.FileUtils import org.mozilla.universalchardet.CharsetListener import scala.util.Try object FilePathMatcher { def apply(rootDir: File, pathPattern: String): Try[PathMatcher] = Try { val trimmedPattern = new String(pathPattern.trim.toCharArray.dropWhile(_ == '/')).trim val path=rootDir.getAbsolutePath.replaceAll("\\\\","/") FileSystems.getDefault.getPathMatcher(s"glob:${path}/$trimmedPattern") //FileSystems.getDefault.getPathMatcher(s"glob:${rootDir.getAbsolutePath}/$trimmedPattern") } } object EncodingDetector extends LazyLogging { val DEFAULT_ENCODING = "UTF-8" def guessEncoding(file: File) = { val bytes = FileUtils.readFileToByteArray(file) val dummyListener = new CharsetListener { override def report(charset: String): Unit = {} } val detector = new org.mozilla.universalchardet.UniversalDetector(dummyListener) detector.handleData(bytes, 0, bytes.length) detector.dataEnd() val encoding = Option(detector.getDetectedCharset) getOrElse DEFAULT_ENCODING logger.debug("Detected encoding {} for {}", detector.getDetectedCharset, file) detector.reset() (encoding, bytes) } }
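EncodingDetector loads the whole file with FileUtils.readFileToByteArray and hands the bytes to juniversalchardet, falling back to UTF-8 when no charset is detected. A trimmed sketch of the same read-bytes-then-decode step, with the detector replaced by an explicit Option so the snippet depends only on commons-io; the sample path is illustrative:

import java.io.File
import org.apache.commons.io.FileUtils

object ReadBytesThenDecode {
  val DefaultEncoding = "UTF-8"

  def readWithFallback(file: File, detected: Option[String]): String = {
    val bytes = FileUtils.readFileToByteArray(file) // whole file into memory as a byte array
    new String(bytes, detected.getOrElse(DefaultEncoding))
  }

  def main(args: Array[String]): Unit = {
    val f = new File("target/sample.txt") // illustrative path
    FileUtils.writeStringToFile(f, "héllo", DefaultEncoding)
    println(readWithFallback(f, detected = None))
  }
}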
Example 162
Source File: MoveFilesRule.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.generic import java.io.File import com.ebay.rtran._ import org.apache.commons.io.FileUtils import com.ebay.rtran.api.{IRule, IRuleConfig} import com.ebay.rtran.generic.util.FilePathMatcher class MoveFilesRule(ruleConfig: MoveFilesRuleConfig) extends IRule[AllFilesModel] { override def transform(model: AllFilesModel): AllFilesModel = { val result = ruleConfig.moves.foldLeft(model.files) {(files, move) => val removes = files filter { file => FilePathMatcher(model.projectRoot, move.pathPattern).map(_ matches file).getOrElse(false) } val dest = new File(model.projectRoot, move.destDir) val creates = removes map {f => FileUtils.moveFileToDirectory(f, dest, true) new File(dest, f.getName) } files diff removes ++ creates } model.copy(files = result) } } case class MoveFilesRuleConfig(moves: List[Move]) extends IRuleConfig case class Move(pathPattern: String, destDir: String)
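The rule relies on FileUtils.moveFileToDirectory(f, dest, true), whose last argument creates the destination directory if it is missing. One detail worth noting in the expression files diff removes ++ creates: in Scala, ++ binds tighter than the alphanumeric diff, so it parses as files diff (removes ++ creates); the sketch below writes the grouping explicitly to show the intended "drop the moved originals, add the new locations" order. All paths and names here are illustrative:

import java.io.File
import org.apache.commons.io.FileUtils

object MoveIntoDirectory {
  def main(args: Array[String]): Unit = {
    val projectRoot = new File("target/move-demo") // illustrative root
    val source = new File(projectRoot, "notes/todo.txt")
    FileUtils.writeStringToFile(source, "remember the milk", "UTF-8")

    val dest = new File(projectRoot, "archive")
    // createDestDir = true creates target/move-demo/archive if it does not exist yet
    FileUtils.moveFileToDirectory(source, dest, true)

    val files = List(source)
    val removes = List(source)
    val creates = List(new File(dest, source.getName))
    val updated = (files diff removes) ++ creates // explicit grouping
    println(updated.map(_.getPath))
  }
}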
Example 163
Source File: XMLFilesModel.scala From RTran with Apache License 2.0 | 5 votes |
package com.ebay.rtran.xml import java.io.{File, FileInputStream} import org.apache.axiom.om.{OMElement, OMXMLBuilderFactory} import org.apache.commons.io.FileUtils import com.ebay.rtran.api.{IModel, IModelProvider} import com.ebay.rtran.generic.GenericProjectCtx import com.ebay.rtran.xml.util.XmlUtil import scala.collection.JavaConversions._ import scala.language.postfixOps import scala.util.{Success, Try} case class XMLFilesModel(projectRoot: File, xmlRoots: Map[File, OMElement], modified: Map[File, Option[OMElement]] = Map.empty) extends IModel class XMLFilesModelProvider extends IModelProvider[XMLFilesModel, GenericProjectCtx] { override def id(): String = getClass.getName override def save(model: XMLFilesModel): Unit = { model.modified foreach { case (file, root) => root.map(r => XmlUtil.writeOMElement2File(file, r)) } } override def create(projectCtx: GenericProjectCtx): XMLFilesModel = XMLFilesModel( projectCtx.rootDir, FileUtils.listFiles(projectCtx.rootDir, Array("xml"), true) map {file => file -> Try(OMXMLBuilderFactory.createOMBuilder(new FileInputStream(file)).getDocumentElement) } collect { case (f, Success(r)) => f -> r } toMap ) }
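This provider uses the extension-based overload FileUtils.listFiles(dir, Array("xml"), true): the array holds extensions without the leading dot, and the boolean turns on recursion into subdirectories. A short sketch of that overload with made-up file names:

import java.io.File
import scala.collection.JavaConverters._
import org.apache.commons.io.FileUtils

object FindXmlFiles {
  def main(args: Array[String]): Unit = {
    val root = new File("target/xml-demo") // illustrative directory
    FileUtils.writeStringToFile(new File(root, "pom.xml"), "<project/>", "UTF-8")
    FileUtils.writeStringToFile(new File(root, "sub/web.xml"), "<web-app/>", "UTF-8")
    FileUtils.writeStringToFile(new File(root, "README.md"), "not xml", "UTF-8")

    // Extensions are given without the dot; true means search subdirectories too.
    val xmlFiles = FileUtils.listFiles(root, Array("xml"), true).asScala
    println(xmlFiles.map(_.getName).toList.sorted) // List(pom.xml, web.xml)
  }
}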
Example 164
Source File: BigQueryClientSpecs.scala From spark-bigquery with Apache License 2.0 | 4 votes |
package com.samelamin.spark.bigquery import java.io.File import com.google.api.services.bigquery.Bigquery import com.google.api.services.bigquery.model._ import com.google.cloud.hadoop.io.bigquery._ import com.holdenkarau.spark.testing.DataFrameSuiteBase import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters} import org.apache.commons.io.FileUtils import org.apache.spark.sql._ import org.mockito.Matchers.{any, eq => mockitoEq} import org.mockito.Mockito._ import org.scalatest.FeatureSpec import org.scalatest.mock.MockitoSugar class BigQueryClientSpecs extends FeatureSpec with DataFrameSuiteBase with MockitoSugar { val BQProjectId = "google.com:foo-project" def setupBigQueryClient(sqlCtx: SQLContext, bigQueryMock: Bigquery): BigQueryClient = { val fakeJobReference = new JobReference() fakeJobReference.setProjectId(BQProjectId) fakeJobReference.setJobId("bigquery-job-1234") val dataProjectId = "publicdata" // Create the job result. val jobStatus = new JobStatus() jobStatus.setState("DONE") jobStatus.setErrorResult(null) val jobHandle = new Job() jobHandle.setStatus(jobStatus) jobHandle.setJobReference(fakeJobReference) // Create table reference. val tableRef = new TableReference() tableRef.setProjectId(dataProjectId) tableRef.setDatasetId("test_dataset") tableRef.setTableId("test_table") // Mock getting Bigquery jobs when(bigQueryMock.jobs().get(any[String], any[String]).execute()) .thenReturn(jobHandle) when(bigQueryMock.jobs().insert(any[String], any[Job]).execute()) .thenReturn(jobHandle) val bigQueryClient = new BigQueryClient(sqlCtx, bigQueryMock) bigQueryClient } scenario("When writing to BQ") { val sqlCtx = sqlContext import sqlCtx.implicits._ val gcsPath = "/tmp/testfile2.json" FileUtils.deleteQuietly(new File(gcsPath)) val adaptedDf = BigQueryAdapter(sc.parallelize(List(1, 2, 3)).toDF) val bigQueryMock = mock[Bigquery](RETURNS_DEEP_STUBS) val fullyQualifiedOutputTableId = "testProjectID:test_dataset.test" val targetTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) val bigQueryClient = setupBigQueryClient(sqlCtx, bigQueryMock) val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf) bigQueryClient.load(targetTable,bigQuerySchema,gcsPath) verify(bigQueryMock.jobs().insert(mockitoEq(BQProjectId),any[Job]), times(1)).execute() } scenario("When reading from BQ") { val sqlCtx = sqlContext val fullyQualifiedOutputTableId = "testProjectID:test_dataset.test" val sqlQuery = s"select * from $fullyQualifiedOutputTableId" val bqQueryContext = new BigQuerySQLContext(sqlCtx) bqQueryContext.setBigQueryProjectId(BQProjectId) val bigQueryMock = mock[Bigquery](RETURNS_DEEP_STUBS) val bigQueryClient = setupBigQueryClient(sqlCtx, bigQueryMock) bigQueryClient.selectQuery(sqlQuery) verify(bigQueryMock.jobs().insert(mockitoEq(BQProjectId),any[Job]), times(1)).execute() } scenario("When running a DML Queries") { val sqlCtx = sqlContext val fullyQualifiedOutputTableId = "testProjectID:test_dataset.test" val dmlQuery = s"UPDATE $fullyQualifiedOutputTableId SET test_col = new_value WHERE test_col = old_value" val bqQueryContext = new BigQuerySQLContext(sqlCtx) bqQueryContext.setBigQueryProjectId(BQProjectId) val bigQueryMock = mock[Bigquery](RETURNS_DEEP_STUBS) val bigQueryClient = setupBigQueryClient(sqlCtx, bigQueryMock) bigQueryClient.runDMLQuery(dmlQuery) verify(bigQueryMock.jobs().insert(mockitoEq(BQProjectId),any[Job]), times(1)).execute() } }
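The spec clears its staging file with FileUtils.deleteQuietly, which accepts a file or directory, returns false instead of throwing when the path is missing or cannot be removed, and tolerates null. A tiny sketch; the path is illustrative:

import java.io.File
import org.apache.commons.io.FileUtils

object QuietCleanup {
  def main(args: Array[String]): Unit = {
    val staging = new File("/tmp/testfile2.json")  // illustrative staging path
    val removed = FileUtils.deleteQuietly(staging) // false if nothing was there; never throws
    println(s"removed existing staging file: $removed")
  }
}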