com.google.common.io.Files Scala Examples
The following examples show how to use com.google.common.io.Files.
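Most of the examples below lean on the same handful of com.google.common.io.Files calls: Files.createTempDir() for scratch directories, Files.write and Files.toString for small text files, and Files.copy for copying file contents. The short, self-contained sketch below pulls those calls together; the object name, file names, and contents are illustrative only and are not taken from any of the projects listed here.

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import com.google.common.io.Files

object GuavaFilesTour {
  def main(args: Array[String]): Unit = {
    val tmpDir: File = Files.createTempDir()           // unique temporary directory
    val source = new File(tmpDir, "source.txt")
    Files.write("hello guava\n", source, UTF_8)        // write a string to a file
    val text = Files.toString(source, UTF_8)           // read the file back as a string
    Files.copy(source, new File(tmpDir, "copy.txt"))   // copy the file's contents
    println(text.trim)
  }
}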
Example 1
Source File: QueryPartitionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.util.Utils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils

class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
  import hiveContext.implicits._

  test("SPARK-5068: query data when path doesn't exist") {
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      val tmpDir = Files.createTempDir()
      // create the table for test
      sql(s"CREATE TABLE table_with_partition(key int,value string) " +
        s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
        "SELECT key,value FROM testData")

      // test for the exist path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect
          ++ testData.toDF.collect ++ testData.toDF.collect)

      // delete the path of one partition
      tmpDir.listFiles
        .find { f => f.isDirectory && f.getName().startsWith("ds=") }
        .foreach { f => Utils.deleteRecursively(f) }

      // test for after delete the path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect)

      sql("DROP TABLE table_with_partition")
      sql("DROP TABLE createAndInsertTest")
    }
  }
}
Example 2
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

object DatasetExample {

  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
      opt[String]("input")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ // for implicit conversions

    // Load input data
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    }
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame = df.select("label")
    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
    val numLabels = labels.count()
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame = df.select("features")
    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset = sqlContext.read.parquet(outputDir)

    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
    val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")

    sc.stop()
  }
}
Example 3
Source File: QueryPartitionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.util.Utils

class QueryPartitionSuite extends QueryTest {
  import org.apache.spark.sql.hive.test.TestHive.implicits._

  test("SPARK-5068: query data when path doesn't exist"){
    val testData = TestHive.sparkContext.parallelize(
      (1 to 10).map(i => TestData(i, i.toString))).toDF()
    testData.registerTempTable("testData")

    val tmpDir = Files.createTempDir()
    // create the table for test
    sql(s"CREATE TABLE table_with_partition(key int,value string) " +
      s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
      "SELECT key,value FROM testData")

    // test for the exist path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
        ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)

    // delete the path of one partition
    tmpDir.listFiles
      .find { f => f.isDirectory && f.getName().startsWith("ds=") }
      .foreach { f => Utils.deleteRecursively(f) }

    // test for after delete the path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)

    sql("DROP TABLE table_with_partition")
    sql("DROP TABLE createAndInsertTest")
  }
}
Example 4
Source File: HttpFileServer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import com.google.common.io.Files

import org.apache.spark.util.Utils

private[spark] class HttpFileServer(
    conf: SparkConf,
    securityManager: SecurityManager,
    requestedPort: Int = 0)
  extends Logging {

  var baseDir : File = null
  var fileDir : File = null
  var jarDir : File = null
  var httpServer : HttpServer = null
  var serverUri : String = null

  def initialize() {
    baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd")
    fileDir = new File(baseDir, "files")
    jarDir = new File(baseDir, "jars")
    fileDir.mkdir()
    jarDir.mkdir()
    logInfo("HTTP File server directory is " + baseDir)
    httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server")
    httpServer.start()
    serverUri = httpServer.uri
    logDebug("HTTP file server started at: " + serverUri)
  }

  def stop() {
    httpServer.stop()

    // If we only stop sc, but the driver process still run as a services then we need to delete
    // the tmp dir, if not, it will create too many tmp dirs
    try {
      Utils.deleteRecursively(baseDir)
    } catch {
      case e: Exception =>
        logWarning(s"Exception while deleting Spark temp dir: ${baseDir.getAbsolutePath}", e)
    }
  }

  def addFile(file: File) : String = {
    addFileToDir(file, fileDir)
    serverUri + "/files/" + file.getName
  }

  def addJar(file: File) : String = {
    addFileToDir(file, jarDir)
    serverUri + "/jars/" + file.getName
  }

  def addFileToDir(file: File, dir: File) : String = {
    // Check whether the file is a directory. If it is, throw a more meaningful exception.
    // If we don't catch this, Guava throws a very confusing error message:
    //   java.io.FileNotFoundException: [file] (No such file or directory)
    // even though the directory ([file]) exists.
    if (file.isDirectory) {
      throw new IllegalArgumentException(s"$file cannot be a directory.")
    }
    Files.copy(file, new File(dir, file.getName))
    dir + "/" + file.getName
  }

}
Example 5
Source File: Unzip.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.workflowexecutor

import java.io._
import java.util.zip.ZipInputStream

import scala.reflect.io.Path

import com.google.common.io.Files

import ai.deepsense.commons.utils.Logging

object Unzip extends Logging {

  def unzipAll(inputFile: String): String =
    unzipToTmp(inputFile, _ => true)

  private def transferImpl(in: InputStream, out: OutputStream, close: Boolean): Unit = {
    try {
      val buffer = new Array[Byte](4096)
      def read(): Unit = {
        val byteCount = in.read(buffer)
        if (byteCount >= 0) {
          out.write(buffer, 0, byteCount)
          read()
        }
      }
      read()
      out.close()
    } finally {
      if (close) {
        in.close()
      }
    }
  }
}
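The object above calls unzipToTmp, which is not included in this excerpt. Purely as an illustration — the helper below is hypothetical and not the project's actual implementation — an extraction routine built on the same transferImpl and on Files.createTempDir() could look roughly like this, placed inside the Unzip object:

  // Hypothetical sketch only: extract entries matching `filter` into a fresh temp dir.
  private def unzipToTmp(inputFile: String, filter: String => Boolean): String = {
    val tempDir = Files.createTempDir()
    val zis = new ZipInputStream(new FileInputStream(inputFile))
    try {
      var entry = zis.getNextEntry
      while (entry != null) {
        if (!entry.isDirectory && filter(entry.getName)) {
          val target = new File(tempDir, entry.getName)
          target.getParentFile.mkdirs()
          // Do not close the shared ZipInputStream here; more entries may follow.
          transferImpl(zis, new FileOutputStream(target), close = false)
        }
        entry = zis.getNextEntry
      }
    } finally {
      zis.close()
    }
    tempDir.getAbsolutePath
  }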
Example 6
Source File: ZookeeperFunSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.ha

import com.google.common.io.Files
import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.curator.test.TestingServer

import org.apache.spark.{KyuubiConf, KyuubiSparkUtil, SparkConf, SparkFunSuite}
import org.apache.spark.KyuubiConf._

trait ZookeeperFunSuite extends SparkFunSuite {

  var zkServer: TestingServer = _
  var connectString: String = _
  val conf = new SparkConf(loadDefaults = true)
  KyuubiSparkUtil.setupCommonConfig(conf)
  conf.set(KyuubiConf.FRONTEND_BIND_PORT.key, "0")

  var zooKeeperClient: CuratorFramework = _

  override def beforeAll(): Unit = {
    zkServer = new TestingServer(2181, Files.createTempDir(), true)
    connectString = zkServer.getConnectString
    conf.set(HA_ZOOKEEPER_QUORUM.key, connectString)
    conf.set(HA_ZOOKEEPER_CONNECTION_BASESLEEPTIME.key, "100ms")
    conf.set(HA_ZOOKEEPER_SESSION_TIMEOUT.key, "15s")
    conf.set(HA_ZOOKEEPER_CONNECTION_MAX_RETRIES.key, "0")
    zooKeeperClient = CuratorFrameworkFactory.builder().connectString(connectString)
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .build()
    zooKeeperClient.start()
    super.beforeAll()
  }

  override def afterAll(): Unit = {
    Option(zooKeeperClient).foreach(_.close())
    Option(zkServer).foreach(_.stop())
    System.clearProperty(HA_ZOOKEEPER_QUORUM.key)
    System.clearProperty(HA_ENABLED.key)
    super.afterAll()
  }
}
Example 7
Source File: FileWrite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package scalaDemo

import java.io.{File, FileWriter}
import java.util.Random

import com.google.common.base.Charsets.UTF_8
import com.google.common.io.Files

import org.apache.spark.util.Utils

object FileWrite {

  def main(args: Array[String]) {
    val outFile = File.createTempFile("test-load-spark-properties", "test")
    Files.write("spark.test.fileNameLoadA true\n" +
      "spark.test.fileNameLoadB 1\n", outFile, UTF_8)

    val writer = new FileWriter(
      new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_age_data.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      writer.write(i + " " + rand.nextInt(100))
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
  }
}
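A side note on the Files.write and Files.toString overloads used here and in several later examples: in recent Guava releases these character-based helpers are deprecated in favour of Files.asCharSink and Files.asCharSource. If you are on a newer Guava version, the equivalent calls look roughly like this (a sketch, not part of the original example; the object and file names are illustrative):

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import com.google.common.io.Files

object CharSinkDemo {
  def main(args: Array[String]): Unit = {
    val outFile = File.createTempFile("guava-charsink-demo", ".properties")
    // Replacement for Files.write(CharSequence, File, Charset)
    Files.asCharSink(outFile, UTF_8).write("spark.test.fileNameLoadA true\n")
    // Replacement for Files.toString(File, Charset)
    println(Files.asCharSource(outFile, UTF_8).read())
  }
}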
Example 8
Source File: QueryPartitionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.util.Utils

// Query partition suite
class QueryPartitionSuite extends QueryTest with SQLTestUtils {

  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
  import ctx.implicits._

  protected def _sqlContext = ctx

  // query data when the path doesn't exist
  test("SPARK-5068: query data when path doesn't exist"){
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = ctx.sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      val tmpDir = Files.createTempDir()
      // create the table for test
      sql(s"CREATE TABLE table_with_partition(key int,value string) " +
        s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
        "SELECT key,value FROM testData")

      // test for the existing path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect
          ++ testData.toDF.collect ++ testData.toDF.collect)

      // delete the path of one partition
      tmpDir.listFiles
        .find { f => f.isDirectory && f.getName().startsWith("ds=") }
        .foreach { f => Utils.deleteRecursively(f) }

      // test again after deleting the path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect)

      sql("DROP TABLE table_with_partition")
      sql("DROP TABLE createAndInsertTest")
    }
  }
}
Example 9
Source File: SparkKubernetesClientFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files import io.fabric8.kubernetes.client.{ConfigBuilder, DefaultKubernetesClient, KubernetesClient} import io.fabric8.kubernetes.client.utils.HttpClientUtils import okhttp3.Dispatcher import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.util.ThreadUtils private[spark] object SparkKubernetesClientFactory { def createKubernetesClient( master: String, namespace: Option[String], kubernetesAuthConfPrefix: String, sparkConf: SparkConf, defaultServiceAccountToken: Option[File], defaultServiceAccountCaCert: Option[File]): KubernetesClient = { val oauthTokenFileConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX" val oauthTokenConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX" val oauthTokenFile = sparkConf.getOption(oauthTokenFileConf) .map(new File(_)) .orElse(defaultServiceAccountToken) val oauthTokenValue = sparkConf.getOption(oauthTokenConf) KubernetesUtils.requireNandDefined( oauthTokenFile, oauthTokenValue, s"Cannot specify OAuth token through both a file $oauthTokenFileConf and a " + s"value $oauthTokenConf.") val caCertFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") .orElse(defaultServiceAccountCaCert.map(_.getAbsolutePath)) val clientKeyFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX") val clientCertFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX") val dispatcher = new Dispatcher( ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")) val config = new ConfigBuilder() .withApiVersion("v1") .withMasterUrl(master) .withWebsocketPingInterval(0) .withOption(oauthTokenValue) { (token, configBuilder) => configBuilder.withOauthToken(token) }.withOption(oauthTokenFile) { (file, configBuilder) => configBuilder.withOauthToken(Files.toString(file, Charsets.UTF_8)) }.withOption(caCertFile) { (file, configBuilder) => configBuilder.withCaCertFile(file) }.withOption(clientKeyFile) { (file, configBuilder) => configBuilder.withClientKeyFile(file) }.withOption(clientCertFile) { (file, configBuilder) => configBuilder.withClientCertFile(file) }.withOption(namespace) { (ns, configBuilder) => configBuilder.withNamespace(ns) }.build() val baseHttpClient = HttpClientUtils.createHttpClient(config) val httpClientWithCustomDispatcher = baseHttpClient.newBuilder() .dispatcher(dispatcher) .build() new DefaultKubernetesClient(httpClientWithCustomDispatcher, config) } private implicit class OptionConfigurableConfigBuilder(val configBuilder: ConfigBuilder) extends AnyVal { def withOption[T] (option: Option[T]) (configurator: ((T, ConfigBuilder) => ConfigBuilder)): ConfigBuilder = { option.map { opt => configurator(opt, configBuilder) }.getOrElse(configBuilder) } } }
Example 10
Source File: SparkPodInitContainerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import java.io.File import java.util.UUID import com.google.common.base.Charsets import com.google.common.io.Files import org.mockito.Mockito import org.scalatest.BeforeAndAfter import org.scalatest.mockito.MockitoSugar._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.util.Utils class SparkPodInitContainerSuite extends SparkFunSuite with BeforeAndAfter { private val DOWNLOAD_JARS_SECRET_LOCATION = createTempFile("txt") private val DOWNLOAD_FILES_SECRET_LOCATION = createTempFile("txt") private var downloadJarsDir: File = _ private var downloadFilesDir: File = _ private var downloadJarsSecretValue: String = _ private var downloadFilesSecretValue: String = _ private var fileFetcher: FileFetcher = _ override def beforeAll(): Unit = { downloadJarsSecretValue = Files.toString( new File(DOWNLOAD_JARS_SECRET_LOCATION), Charsets.UTF_8) downloadFilesSecretValue = Files.toString( new File(DOWNLOAD_FILES_SECRET_LOCATION), Charsets.UTF_8) } before { downloadJarsDir = Utils.createTempDir() downloadFilesDir = Utils.createTempDir() fileFetcher = mock[FileFetcher] } after { downloadJarsDir.delete() downloadFilesDir.delete() } test("Downloads from remote server should invoke the file fetcher") { val sparkConf = getSparkConfForRemoteFileDownloads val initContainerUnderTest = new SparkPodInitContainer(sparkConf, fileFetcher) initContainerUnderTest.run() Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/jar1.jar", downloadJarsDir) Mockito.verify(fileFetcher).fetchFile("hdfs://localhost:9000/jar2.jar", downloadJarsDir) Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/file.txt", downloadFilesDir) } private def getSparkConfForRemoteFileDownloads: SparkConf = { new SparkConf(true) .set(INIT_CONTAINER_REMOTE_JARS, "http://localhost:9000/jar1.jar,hdfs://localhost:9000/jar2.jar") .set(INIT_CONTAINER_REMOTE_FILES, "http://localhost:9000/file.txt") .set(JARS_DOWNLOAD_LOCATION, downloadJarsDir.getAbsolutePath) .set(FILES_DOWNLOAD_LOCATION, downloadFilesDir.getAbsolutePath) } private def createTempFile(extension: String): String = { val dir = Utils.createTempDir() val file = new File(dir, s"${UUID.randomUUID().toString}.$extension") Files.write(UUID.randomUUID().toString, file, Charsets.UTF_8) file.getAbsolutePath } }
Example 11
Source File: QueryPartitionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ test("SPARK-5068: query data when path doesn't exist") { withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") val tmpDir = Files.createTempDir() // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) sql("DROP TABLE IF EXISTS table_with_partition") sql("DROP TABLE IF EXISTS createAndInsertTest") } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 12
Source File: HistoryServerArgumentsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array.empty[String]
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 13
Source File: DataFrameExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import com.google.common.io.Files import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SQLContext} object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DataFrameExample with $params") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Files.createTempDir() tmpDir.deleteOnExit() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = sqlContext.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() sc.stop() } } // scalastyle:on println
Example 14
Source File: LibSVMRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File

import com.google.common.base.Charsets
import com.google.common.io.Files

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  var tempDir: File = _
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    tempDir = Utils.createTempDir()
    val file = new File(tempDir, "part-00000")
    Files.write(lines, file, Charsets.US_ASCII)
    path = tempDir.toURI.toString
  }

  override def afterAll(): Unit = {
    Utils.deleteRecursively(tempDir)
    super.afterAll()
  }

  test("select as sparse vector") {
    val df = sqlContext.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = sqlContext.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }
}
Example 15
Source File: HistoryServerArgumentsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array.empty[String]
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 16
Source File: YarnShuffleIntegrationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registed exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, UTF_8) } } }
Example 17
Source File: HttpFileServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import com.google.common.io.Files import org.apache.spark.util.Utils private[spark] class HttpFileServer( conf: SparkConf, securityManager: SecurityManager, requestedPort: Int = 0) extends Logging { var baseDir : File = null var fileDir : File = null var jarDir : File = null var httpServer : HttpServer = null var serverUri : String = null def initialize() { baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd") fileDir = new File(baseDir, "files") jarDir = new File(baseDir, "jars") fileDir.mkdir() jarDir.mkdir() logInfo("HTTP File server directory is " + baseDir) httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server") httpServer.start() serverUri = httpServer.uri logDebug("HTTP file server started at: " + serverUri) } def stop() { httpServer.stop() // If we only stop sc, but the driver process still run as a services then we need to delete // the tmp dir, if not, it will create too many tmp dirs try { Utils.deleteRecursively(baseDir) } catch { case e: Exception => logWarning(s"Exception while deleting Spark temp dir: ${baseDir.getAbsolutePath}", e) } } def addFile(file: File) : String = { addFileToDir(file, fileDir) serverUri + "/files/" + Utils.encodeFileNameToURIRawPath(file.getName) } def addJar(file: File) : String = { addFileToDir(file, jarDir) serverUri + "/jars/" + Utils.encodeFileNameToURIRawPath(file.getName) } def addFileToDir(file: File, dir: File) : String = { // Check whether the file is a directory. If it is, throw a more meaningful exception. // If we don't catch this, Guava throws a very confusing error message: // java.io.FileNotFoundException: [file] (No such file or directory) // even though the directory ([file]) exists. if (file.isDirectory) { throw new IllegalArgumentException(s"$file cannot be a directory.") } Files.copy(file, new File(dir, file.getName)) dir + "/" + Utils.encodeFileNameToURIRawPath(file.getName) } }
Example 18
Source File: HistoryServerArgumentsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array[String]()
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 19
Source File: HBaseLocalClient.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.hbase.utilities import java.io.File import scala.collection.mutable.ArrayBuffer import com.google.common.io.Files import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.util._ import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} import com.paypal.gimel.common.catalog.Field import com.paypal.gimel.hbase.DataSet class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll { var sparkSession : SparkSession = _ var dataSet: DataSet = _ val hbaseTestingUtility = new HBaseTestingUtility() val tableName = "test_table" val cfs = Array("personal", "professional") val columns = Array("id", "name", "age", "address", "company", "designation", "salary") val fields = columns.map(col => new Field(col)) val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] protected override def beforeAll(): Unit = { val tempDir: File = Files.createTempDir tempDir.deleteOnExit hbaseTestingUtility.startMiniCluster() SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration createTable(tableName, cfs) val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") sparkSession = SparkSession.builder() .master("local") .appName("HBase Test") .config(conf) .getOrCreate() val listener = new QueryExecutionListener { // Only test successful case here, so no need to implement `onFailure` override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { metrics += ((funcName, qe, duration)) } } sparkSession.listenerManager.register(listener) sparkSession.sparkContext.setLogLevel("ERROR") dataSet = new DataSet(sparkSession) } protected override def afterAll(): Unit = { hbaseTestingUtility.shutdownMiniCluster() sparkSession.close() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { hbaseTestingUtility.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => println("No table = " + name + " found") } hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs) } // Mocks data for testing def mockDataInDataFrame(numberOfRows: Int): DataFrame = { def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }""" val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) } val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts) val dataFrame: DataFrame = sparkSession.read.json(rdd) dataFrame } }
Example 20
Source File: ApplicationWithProcess.scala From aloha with Apache License 2.0 | 5 votes |
package me.jrwang.aloha.app import java.io.File import java.nio.charset.StandardCharsets import scala.collection.JavaConverters._ import scala.concurrent.Promise import com.google.common.io.Files import me.jrwang.aloha.common.Logging import me.jrwang.aloha.common.util.{FileAppender, Utils} abstract class ApplicationWithProcess extends AbstractApplication with Logging { private var process: Process = _ private var stdoutAppender: FileAppender = _ private var stderrAppender: FileAppender = _ // Timeout to wait for when trying to terminate an app. private val APP_TERMINATE_TIMEOUT_MS = 10 * 1000 def getProcessBuilder(): ProcessBuilder private var stateMonitorThread: Thread = _ override def start(): Promise[ExitState] = { val processBuilder = getProcessBuilder() val command = processBuilder.command() val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"") logInfo(s"Launch command: $formattedCommand") processBuilder.directory(appDir) process = processBuilder.start() // Redirect its stdout and stderr to files val stdout = new File(appDir, "stdout") stdoutAppender = FileAppender(process.getInputStream, stdout, alohaConf) val header = "Aloha Application Command: %s\n%s\n\n".format( formattedCommand, "=" * 40) val stderr = new File(appDir, "stderr") Files.write(header, stderr, StandardCharsets.UTF_8) stderrAppender = FileAppender(process.getErrorStream, stderr, alohaConf) stateMonitorThread = new Thread("app-state-monitor-thread") { override def run(): Unit = { val exitCode = process.waitFor() if(exitCode == 0) { result.success(ExitState(ExitCode.SUCCESS, Some("success"))) } else { result.success(ExitState(ExitCode.FAILED, Some("failed"))) } } } stateMonitorThread.start() result } override def shutdown(reason: Option[String]): Unit = { if (process != null) { logInfo("Killing process!") if (stdoutAppender != null) { stdoutAppender.stop() } if (stderrAppender != null) { stderrAppender.stop() } val exitCode = Utils.terminateProcess(process, APP_TERMINATE_TIMEOUT_MS) if (exitCode.isEmpty) { logWarning("Failed to terminate process: " + process + ". This process will likely be orphaned.") } } } }
Example 21
package org.apache.spark.sql import org.apache.spark.sql.execution.datasources.hbase.Logging import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.client.Table import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.{SparkContext, SparkConf} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } var spark: SparkSession = null var sc: SparkContext = null var sqlContext: SQLContext = null var df: DataFrame = null private[spark] var htu = new HBaseTestingUtility private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def defineCatalog(tName: String) = s"""{ |"table":{"namespace":"default", "name":"$tName"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin @deprecated(since = "04.12.2017(dd/mm/year)", message = "use `defineCatalog` instead") def catalog = defineCatalog(tableName) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.startMiniCluster SparkHBaseConf.conf = htu.getConfiguration logInfo(" - minicluster started") println(" - minicluster started") spark = SparkSession.builder() .master("local") .appName("HBaseTest") .config(conf) .getOrCreate() sqlContext = spark.sqlContext sc = spark.sparkContext } override def afterAll() { htu.shutdownMiniCluster() spark.stop() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 22
Source File: HBaseTestSuite.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import scala.collection.JavaConverters._ import com.google.common.io.Files import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{TableName, HBaseTestingUtility} import org.apache.spark.sql.execution.datasources.hbase.Logging import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class HBaseTestSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] var tableName: Array[Byte] = Bytes.toBytes("t1") private[spark] var columnFamily: Array[Byte] = Bytes.toBytes("cf0") private[spark] var columnFamilies: Array[Array[Byte]] = Array(Bytes.toBytes("cf0"), Bytes.toBytes("cf1"), Bytes.toBytes("cf2"), Bytes.toBytes("cf3"), Bytes.toBytes("cf4")) var table: Table = null // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") try { htu.deleteTable(TableName.valueOf(tableName)) //htu.createTable(TableName.valueOf(tableName), columnFamily, 2, Bytes.toBytes("abc"), Bytes.toBytes("xyz"), 2) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(tableName) + " found") } setupTable() } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ : Throwable => logError("teardown error") } } def setupTable() { val config = htu.getConfiguration htu.createMultiRegionTable(TableName.valueOf(tableName), columnFamilies) println("create htable t1") val connection = ConnectionFactory.createConnection(config) val r = connection.getRegionLocator(TableName.valueOf("t1")) table = connection.getTable(TableName.valueOf("t1")) val regionLocations = r.getAllRegionLocations.asScala.toSeq println(s"$regionLocations size: ${regionLocations.size}") (0 until 100).foreach { x => var put = new Put(Bytes.toBytes(s"row$x")) (0 until 5).foreach { y => put.addColumn(columnFamilies(y), Bytes.toBytes(s"c$y"), Bytes.toBytes(s"value $x $y")) } table.put(put) } } }
Example 23
Source File: LibFFMRelationSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.source.libffm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite import com.tencent.angel.sona.ml.util.MLlibTestSparkContext import org.apache.spark.util.SparkUtil class LibFFMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines0 = """ |1 0:1:1.0 1:3:2.0 2:5:3.0 |0 """.stripMargin val lines1 = """ |0 0:2:4.0 1:4:5.0 2:6:6.0 """.stripMargin val dir = SparkUtil.createTempDir() val succ = new File(dir, "_SUCCESS") val file0 = new File(dir, "part-00000") val file1 = new File(dir, "part-00001") Files.write("", succ, StandardCharsets.UTF_8) Files.write(lines0, file0, StandardCharsets.UTF_8) Files.write(lines1, file1, StandardCharsets.UTF_8) path = dir.getPath } override def afterAll(): Unit = { try { val prefix = "C:\\Users\\fitzwang\\AppData\\Local\\Temp\\" if (path.startsWith(prefix)) { SparkUtil.deleteRecursively(new File(path)) } } finally { super.afterAll() } } test("ffmIO"){ val df = spark.read.format("libffm").load(path) val metadata = df.schema(1).metadata val fieldSet = MetaSummary.getFieldSet(metadata) println(fieldSet.mkString("[", ",", "]")) val keyFieldMap = MetaSummary.getKeyFieldMap(metadata) println(keyFieldMap.mkString("[", ",", "]")) df.write.format("libffm").save("temp.libffm") } test("read_ffm"){ val df = spark.read.format("libffm").load(path) val metadata = df.schema(1).metadata val fieldSet = MetaSummary.getFieldSet(metadata) println(fieldSet.mkString("[", ",", "]")) val keyFieldMap = MetaSummary.getKeyFieldMap(metadata) println(keyFieldMap.mkString("[", ",", "]")) } }
Example 24
package org.apache.spark.sql import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility} import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.types.UTF8String import org.apache.spark.{SparkContext, SparkConf, Logging} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.JavaConverters._ class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") SparkHBaseConf.conf = htu.getConfiguration // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def catalog = s"""{ |"table":{"namespace":"default", "name":"table1"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ => logError("teardown error") } } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 25
Source File: TestUtils.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import java.io.{IOException, File}
import java.nio.ByteBuffer
import java.util

import scala.collection.immutable.HashSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import com.google.common.io.Files
import org.apache.avro.generic.GenericData

import org.apache.spark.sql.SQLContext

object TestUtils {

  def generateRandomByteBuffer(rand: Random, size: Int): ByteBuffer = {
    val bb = ByteBuffer.allocate(size)
    val arrayOfBytes = new Array[Byte](size)
    rand.nextBytes(arrayOfBytes)
    bb.put(arrayOfBytes)
  }

  def generateRandomMap(rand: Random, size: Int): java.util.Map[String, Int] = {
    val jMap = new util.HashMap[String, Int]()
    for (i <- 0 until size) {
      jMap.put(rand.nextString(5), i)
    }
    jMap
  }

  def generateRandomArray(rand: Random, size: Int): util.ArrayList[Boolean] = {
    val vec = new util.ArrayList[Boolean]()
    for (i <- 0 until size) {
      vec.add(rand.nextBoolean())
    }
    vec
  }
}
Example 26
Source File: MVMSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.recommendation import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, sum => brzSum} import com.github.cloudml.zen.ml.util._ import com.google.common.io.Files import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MVMSuite extends FunSuite with SharedSparkContext with Matchers { test("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) } val stepSize = 0.1 val regParam = 1e-2 val l2 = (regParam, regParam, regParam) val rank = 20 val useAdaGrad = true val trainSet = dataSet.cache() val fm = new FMClassification(trainSet, stepSize, l2, rank, useAdaGrad) val maxIter = 10 val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { fm.run(1) pps(i) = fm.saveModel().loss(trainSet) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ < 0).toDouble / ppsDiff.size > 0.05) val fmModel = fm.saveModel() val tempDir = Files.createTempDir() tempDir.deleteOnExit() val path = tempDir.toURI.toString fmModel.save(sc, path) val sameModel = FMModel.load(sc, path) assert(sameModel.k === fmModel.k) assert(sameModel.classification === fmModel.classification) assert(sameModel.factors.sortByKey().map(_._2).collect() === fmModel.factors.sortByKey().map(_._2).collect()) } ignore("url_combined classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpointDir = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpointDir) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) }.cache() val numFeatures = dataSet.first()._2.features.size val stepSize = 0.1 val numIterations = 500 val regParam = 1e-3 val rank = 20 val views = Array(20, numFeatures / 2, numFeatures).map(_.toLong) val useAdaGrad = true val useWeightedLambda = true val miniBatchFraction = 1 val Array(trainSet, testSet) = dataSet.randomSplit(Array(0.8, 0.2)) trainSet.cache() testSet.cache() val fm = new MVMClassification(trainSet, stepSize, views, regParam, 0.0, rank, useAdaGrad, useWeightedLambda, miniBatchFraction) fm.run(numIterations) val model = fm.saveModel() println(f"Test loss: ${model.loss(testSet.cache())}%1.4f") } }
Example 27
Source File: PgV3ProtocolTest.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server.service.postgresql.protocol.v3 import java.io.File import java.nio.charset.StandardCharsets import java.util.UUID import scala.sys.process._ import com.google.common.io.Files import org.xerial.snappy.OSInfo import org.apache.spark.sql.server.PgJdbcTest import org.apache.spark.util.Utils class PgV3ProtocolTest extends PgJdbcTest { // TODO: Replace `snappy-java` with `commons.lang3.SystemUtils` private val isOsSupported = Seq("Linux", "Mac").contains(OSInfo.getOSName) private val isArchSupported = Seq("x86_64").contains(OSInfo.getArchName) private lazy val tempDirPath = Utils.createTempDir().getCanonicalPath private lazy val cmdPath = { val resourcePath = s"pgproto/${OSInfo.getOSName}/${OSInfo.getArchName}/pgproto" val classLoader = Thread.currentThread().getContextClassLoader val _cmdPath = classLoader.getResource(resourcePath).getPath // Set an executable flag explicitly here new File(_cmdPath).setExecutable(true) _cmdPath } def testIfSupported(testName: String)(testBody: => Unit) { if (isOsSupported && isArchSupported) { test(testName)(testBody) } else { ignore(s"$testName [not supported in env: " + s"os=${OSInfo.getOSName} arch=${OSInfo.getArchName}]")(testBody) } } def checkV3Protocol(messages: String, expected: String): Unit = { val msgDescriptionPath = s"$tempDirPath/${UUID.randomUUID().toString}.pgproto" val serverPort = serverInstance.listeningPort val command = s"$cmdPath -h localhost -d default -p $serverPort -f $msgDescriptionPath 2>&1" def normalize(s: String): String = s.trim.stripLineEnd.replaceAll("^\n", "") // Write a file containing messages in the temporary dir Files.write(normalize(messages), new File(msgDescriptionPath), StandardCharsets.UTF_8) val output = ("bash" :: "-c" :: command :: Nil).lineStream val actual = output.mkString("\n") assert(actual === normalize(expected)) } }
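The protocol test above writes its message script with Guava's Files.write(CharSequence, File, Charset). A stripped-down sketch of just that write/read pair (the file name and contents are made up):

import java.io.File
import java.nio.charset.StandardCharsets
import com.google.common.io.Files

object WriteStringSketch extends App {
  val target = new File(Files.createTempDir(), "messages.pgproto")
  Files.write("'Q' \"SELECT 1;\"\n", target, StandardCharsets.UTF_8)   // truncates and rewrites the file
  println(Files.toString(target, StandardCharsets.UTF_8))
}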
Example 28
Source File: Unzip.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.workflowexecutor import java.io._ import java.util.zip.ZipInputStream import scala.reflect.io.Path import com.google.common.io.Files import io.deepsense.commons.utils.Logging object Unzip extends Logging { def unzipAll(inputFile: String): String = unzipToTmp(inputFile, _ => true) private def transferImpl(in: InputStream, out: OutputStream, close: Boolean): Unit = { try { val buffer = new Array[Byte](4096) def read(): Unit = { val byteCount = in.read(buffer) if (byteCount >= 0) { out.write(buffer, 0, byteCount) read() } } read() out.close() } finally { if (close) { in.close() } } } }
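unzipAll above delegates to a private unzipToTmp helper that is not included in the snippet. Below is a hypothetical reconstruction of such a helper, shown only as a sketch of how Files.createTempDir and Files.createParentDirs fit together; it uses Guava's ByteStreams.copy in place of the project's transferImpl, and the String => Boolean predicate type is an assumption:

import java.io.{BufferedInputStream, File, FileInputStream, FileOutputStream}
import java.util.zip.ZipInputStream
import com.google.common.io.{ByteStreams, Files}

object UnzipSketch {
  // Hypothetical stand-in for the missing unzipToTmp; the predicate filters entries by name.
  def unzipToTmp(inputFile: String, accept: String => Boolean): String = {
    val tempDir = Files.createTempDir()
    val zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(inputFile)))
    try {
      Iterator.continually(zis.getNextEntry).takeWhile(_ != null).foreach { entry =>
        if (!entry.isDirectory && accept(entry.getName)) {
          val outFile = new File(tempDir, entry.getName)
          Files.createParentDirs(outFile)
          val out = new FileOutputStream(outFile)
          // reading from a ZipInputStream stops at the end of the current entry
          try ByteStreams.copy(zis, out) finally out.close()
        }
      }
    } finally {
      zis.close()
    }
    tempDir.getAbsolutePath
  }
}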
Example 29
Source File: QueryPartitionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.internal.config._ import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ private def queryWhenPathNotExist(): Unit = { withTempView("testData") { withTable("table_with_partition", "createAndInsertTest") { withTempDir { tmpDir => val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData).union(testData)) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData)) } } } } test("SPARK-5068: query data when path doesn't exist") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") { queryWhenPathNotExist() } } test("Replace spark.sql.hive.verifyPartitionPath by spark.files.ignoreMissingFiles") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false") { sparkContext.conf.set(IGNORE_MISSING_FILES.key, "true") queryWhenPathNotExist() } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 30
Source File: YarnShuffleIntegrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 31
Source File: HistoryServerArgumentsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.io.File import java.nio.charset.StandardCharsets._ import com.google.common.io.Files import org.apache.spark._ import org.apache.spark.util.Utils class HistoryServerArgumentsSuite extends SparkFunSuite { private val logDir = new File("src/test/resources/spark-events") private val conf = new SparkConf() .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) .set("spark.history.fs.updateInterval", "1") .set("spark.testing", "true") test("No Arguments Parsing") { val argStrings = Array.empty[String] val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) assert(conf.get("spark.history.fs.updateInterval") === "1") assert(conf.get("spark.testing") === "true") } test("Directory Arguments Parsing --dir or -d") { val argStrings = Array("--dir", "src/test/resources/spark-events1") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1") } test("Directory Param can also be set directly") { val argStrings = Array("src/test/resources/spark-events2") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2") } test("Properties File Arguments Parsing --properties-file") { val tmpDir = Utils.createTempDir() val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) try { Files.write("spark.test.CustomPropertyA blah\n" + "spark.test.CustomPropertyB notblah\n", outFile, UTF_8) val argStrings = Array("--properties-file", outFile.getAbsolutePath) val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.test.CustomPropertyA") === "blah") assert(conf.get("spark.test.CustomPropertyB") === "notblah") } finally { Utils.deleteRecursively(tmpDir) } } }
Example 32
Source File: PailDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.pail import java.util import com.backtype.hadoop.pail.{PailFormatFactory, PailSpec, PailStructure} import com.backtype.support.{Utils => PailUtils} import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FlatSpec} import org.scalatest.Matchers._ import scala.collection.JavaConverters._ import scala.util.Random case class User(name: String, age: Int) class UserPailStructure extends PailStructure[User] { override def isValidTarget(dirs: String*): Boolean = true override def getType: Class[_] = classOf[User] override def serialize(user: User): Array[Byte] = PailUtils.serialize(user) override def getTarget(user: User): util.List[String] = List(user.age % 10).map(_.toString).asJava override def deserialize(serialized: Array[Byte]): User = PailUtils.deserialize(serialized).asInstanceOf[User] } class PailDataSourceSpec extends FlatSpec with BeforeAndAfterAll with PailDataSource { private var spark: SparkSession = _ override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("PailDataSource").getOrCreate() } val userPailSpec = new PailSpec(PailFormatFactory.SEQUENCE_FILE, new UserPailStructure) "PailBasedReaderWriter" should "read/write user records from/into pail" in { val output = Files.createTempDir() val users = (1 to 100).map { index => User(s"foo$index", Random.nextInt(40))} spark.sparkContext.parallelize(users) .saveAsPail(output.getAbsolutePath, userPailSpec) val input = output.getAbsolutePath val total = spark.sparkContext.pailFile[User](input) .map(u => u.name) .count() total should be(100) FileUtils.deleteDirectory(output) } }
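Files.createTempDir() does not clean up after itself, which is why the spec above ends with FileUtils.deleteDirectory(output). A small loan-pattern helper (not part of the utils project) that pairs the two calls:

import java.io.File
import com.google.common.io.Files
import org.apache.commons.io.FileUtils

object TempDirLoan {
  def withTempDir[T](body: File => T): T = {
    val dir = Files.createTempDir()
    try body(dir)
    finally FileUtils.deleteDirectory(dir)   // recursive delete from commons-io
  }
}

// usage: TempDirLoan.withTempDir { dir => /* write and read the pail under dir.getAbsolutePath */ }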
Example 33
Source File: ParquetAvroDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.parquet import java.io.File import com.google.common.io.Files import com.indix.utils.spark.parquet.avro.ParquetAvroDataSource import org.apache.commons.io.FileUtils import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.SparkSession import org.scalactic.Equality import org.scalatest.Matchers.{be, convertToAnyShouldWrapper, equal} import org.scalatest.{BeforeAndAfterAll, FlatSpec} import java.util.{Arrays => JArrays} case class SampleAvroRecord(a: Int, b: String, c: Seq[String], d: Boolean, e: Double, f: collection.Map[String, String], g: Array[Byte]) class ParquetAvroDataSourceSpec extends FlatSpec with BeforeAndAfterAll with ParquetAvroDataSource { private var spark: SparkSession = _ implicit val sampleAvroRecordEq = new Equality[SampleAvroRecord] { override def areEqual(left: SampleAvroRecord, b: Any): Boolean = b match { case right: SampleAvroRecord => left.a == right.a && left.b == right.b && Equality.default[Seq[String]].areEqual(left.c, right.c) && left.d == right.d && left.e == right.e && Equality.default[collection.Map[String, String]].areEqual(left.f, right.f) && JArrays.equals(left.g, right.g) case _ => false } } override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("ParquetAvroDataSource").getOrCreate() } override protected def afterAll(): Unit = { try { spark.sparkContext.stop() } finally { super.afterAll() } } "AvroBasedParquetDataSource" should "read/write avro records as ParquetData" in { val outputLocation = Files.createTempDir().getAbsolutePath + "/output" val sampleRecords: Seq[SampleAvroRecord] = Seq( SampleAvroRecord(1, "1", List("a1"), true, 1.0d, Map("a1" -> "b1"), "1".getBytes), SampleAvroRecord(2, "2", List("a2"), false, 2.0d, Map("a2" -> "b2"), "2".getBytes), SampleAvroRecord(3, "3", List("a3"), true, 3.0d, Map("a3" -> "b3"), "3".getBytes), SampleAvroRecord(4, "4", List("a4"), true, 4.0d, Map("a4" -> "b4"), "4".getBytes), SampleAvroRecord(5, "5", List("a5"), false, 5.0d, Map("a5" -> "b5"), "5".getBytes) ) val sampleDf = spark.createDataFrame(sampleRecords) sampleDf.rdd.saveAvroInParquet(outputLocation, sampleDf.schema, CompressionCodecName.GZIP) val sparkVal = spark import sparkVal.implicits._ val records: Array[SampleAvroRecord] = spark.read.parquet(outputLocation).as[SampleAvroRecord].collect() records.length should be(5) // We use === to use the custom Equality defined above for comparing Array[Byte] // Ref - https://github.com/scalatest/scalatest/issues/491 records.sortBy(_.a) === sampleRecords.sortBy(_.a) FileUtils.deleteDirectory(new File(outputLocation)) } }
Example 34
Source File: MessageSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.indefinite import java.sql.Timestamp import java.util.UUID import akka.Done import akka.kafka.CommitterSettings import akka.kafka.ConsumerMessage.CommittableOffsetBatch import akka.kafka.scaladsl.Committer import akka.stream.scaladsl.{Flow, Keep, Sink} import com.github.mjakubowski84.parquet4s.{ChunkPathBuilder, ParquetStreams, ParquetWriter} import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.metadata.CompressionCodecName import scala.concurrent.Future import scala.concurrent.duration._ object MessageSink { case class Data(timestamp: Timestamp, word: String) val MaxChunkSize: Int = 128 val ChunkWriteTimeWindow: FiniteDuration = 10.seconds val WriteDirectoryName: String = "messages" } trait MessageSink { this: Akka => import MessageSink._ import MessageSource._ protected val baseWritePath: String = new Path(Files.createTempDir().getAbsolutePath, WriteDirectoryName).toString private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY) private lazy val committerSink = Flow.apply[Seq[Message]].map { messages => CommittableOffsetBatch(messages.map(_.committableOffset)) }.toMat(Committer.sink(CommitterSettings(system)))(Keep.right) def chunkPath: ChunkPathBuilder[Message] = { case (basePath, chunk) => val lastElementDateTime = new Timestamp(chunk.last.record.timestamp()).toLocalDateTime val year = lastElementDateTime.getYear val month = lastElementDateTime.getMonthValue val day = lastElementDateTime.getDayOfMonth val uuid = UUID.randomUUID() basePath.suffix(s"/$year/$month/$day/part-$uuid.parquet") } lazy val messageSink: Sink[Message, Future[Done]] = ParquetStreams.toParquetIndefinite( path = baseWritePath, maxChunkSize = MaxChunkSize, chunkWriteTimeWindow = ChunkWriteTimeWindow, buildChunkPath = chunkPath, preWriteTransformation = { message: Message => Data( timestamp = new Timestamp(message.record.timestamp()), word = message.record.value() ) }, postWriteSink = committerSink, options = writerOptions ) }
Example 35
Source File: WriteAndReadFilteredAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetStreams} import com.google.common.io.Files import scala.concurrent.Future import scala.util.Random object WriteAndReadFilteredAkkaApp extends App { object Dict { val A = "A" val B = "B" val C = "C" val D = "D" val values: List[String] = List(A, B, C, D) def random: String = values(Random.nextInt(values.length)) } case class Data(id: Int, dict: String) val count = 100 val data = (1 to count).map { i => Data(id = i, dict = Dict.random) } val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher val options = ParquetReader.Options() val printingSink = Sink.foreach(println) for { // write _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read filtered _ <- Future(println("""dict == "A"""")) _ <- ParquetStreams.fromParquet[Data](path, options = options, filter = Col("dict") === Dict.A).runWith(printingSink) _ <- Future(println("""id >= 20 && id < 40""")) _ <- ParquetStreams.fromParquet[Data](path, options = options, filter = Col("id") >= 20 && Col("id") < 40).runWith(printingSink) // finish _ <- system.terminate() } yield () }
Example 36
Source File: WriteAndReadCustomTypeAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.CustomType._ import com.github.mjakubowski84.parquet4s.ParquetStreams import com.google.common.io.Files object WriteAndReadCustomTypeAkkaApp extends App { object Data { def generate(count: Int): Iterator[Data] = Iterator.range(1, count).map { i => Data(id = i, dict = Dict.random) } } case class Data(id: Long, dict: Dict.Type) val data = () => Data.generate(count = 100) val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher for { // write _ <- Source.fromIterator(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" _ <- ParquetStreams.fromParquet[Data](path).runWith(Sink.foreach(println)) // finish _ <- system.terminate() } yield () }
Example 37
Source File: WriteAndReadAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.ParquetStreams import com.google.common.io.Files import scala.util.Random object WriteAndReadAkkaApp extends App { case class Data(id: Int, text: String) val count = 100 val data = (1 to count).map { i => Data(id = i, text = Random.nextString(4)) } val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher for { // write _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read _ <- ParquetStreams.fromParquet[Data](path).runWith(Sink.foreach(println)) // finish _ <- system.terminate() } yield () }
Example 38
Source File: WriteAndReadGenericApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import java.time.{LocalDate, ZoneOffset} import java.util.TimeZone import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, RowParquetRecord, ValueCodecConfiguration} import com.google.common.io.Files import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64} import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} import org.apache.parquet.schema.{MessageType, OriginalType, Types} object WriteAndReadGenericApp extends App { val ID = "id" val Name = "name" val Birthday = "birthday" val Schema = "user_schema" val path = Files.createTempDir().getAbsolutePath val vcc = ValueCodecConfiguration(TimeZone.getTimeZone(ZoneOffset.UTC)) val users = List( (1L, "Alice", LocalDate.of(2000, 1, 1)), (2L, "Bob", LocalDate.of(1980, 2, 28)), (3L, "Cecilia", LocalDate.of(1977, 3, 15)) ).map { case (id, name, birthday) => RowParquetRecord.empty .add(ID, id, vcc) .add(Name, name, vcc) .add(Birthday, birthday, vcc) } // write implicit val schema: MessageType = Types.buildMessage() .addField(Types.primitive(INT64, REQUIRED).as(OriginalType.INT_64).named(ID)) .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named(Name)) .addField(Types.primitive(INT32, OPTIONAL).as(OriginalType.DATE).named(Birthday)) .named(Schema) ParquetWriter.writeAndClose(s"$path/users.parquet", users) //read val readData = ParquetReader.read[RowParquetRecord](path) try { readData.foreach { record => val id = record.get[Long](ID, vcc) val name = record.get[String](Name, vcc) val birthday = record.get[LocalDate](Birthday, vcc) println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]") } } finally readData.close() }
Example 39
Source File: WriteAndReadFilteredApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core

import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetWriter}
import com.google.common.io.Files

import scala.util.Random

object WriteAndReadFilteredApp extends App {

  object Dict {
    val A = "A"
    val B = "B"
    val C = "C"
    val D = "D"
    val values: List[String] = List(A, B, C, D)
    def random: String = values(Random.nextInt(values.length))
  }

  case class Data(id: Int, dict: String)

  val count = 100
  val data = (1 to count).map { i => Data(id = i, dict = Dict.random) }
  val path = Files.createTempDir().getAbsolutePath

  // write
  ParquetWriter.writeAndClose(s"$path/data.parquet", data)

  // read filtered
  println("""dict == "A"""")
  val dictIsOnlyA = ParquetReader.read[Data](path, filter = Col("dict") === Dict.A)
  try {
    dictIsOnlyA.foreach(println)
  } finally dictIsOnlyA.close()

  println("""id >= 20 && id < 40""")
  val idBetween20And40 = ParquetReader.read[Data](path, filter = Col("id") >= 20 && Col("id") < 40)
  try {
    idBetween20And40.foreach(println)
  } finally idBetween20And40.close()
}
Example 40
Source File: WriteAndReadCustomTypeApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import com.github.mjakubowski84.parquet4s.CustomType._ import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver._ import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter} import com.google.common.io.Files object WriteAndReadCustomTypeApp extends App { object Data { def generate(count: Int): Iterable[Data] = (1 to count).map { i => Data(id = i, dict = Dict.random) } } case class Data(id: Long, dict: Dict.Type) val data = Data.generate(count = 100) val path = Files.createTempDir().getAbsolutePath // write ParquetWriter.writeAndClose(s"$path/data.parquet", data) //read val readData = ParquetReader.read[Data](path) // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" try { readData.foreach(println) } finally readData.close() }
Example 41
Source File: WriteIncrementallyAndReadApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter} import com.google.common.io.Files import scala.util.Random object WriteIncrementallyAndReadApp extends App { case class Data(id: Int, text: String) val count = 100 val data = (1 to count).map { i => Data(id = i, text = Random.nextString(4)) } val path = Files.createTempDir().getAbsolutePath // write val writer = ParquetWriter.writer[Data](s"$path/data.parquet") try { data.foreach(entity => writer.write(entity)) } finally writer.close() //read val readData = ParquetReader.read[Data](path) try { readData.foreach(println) } finally readData.close() }
Example 42
Source File: config.scala From spark-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s.integrationtest import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files package object config { def getTestImageTag: String = { val imageTagFileProp = System.getProperty("spark.kubernetes.test.imageTagFile") require(imageTagFileProp != null, "Image tag file must be provided in system properties.") val imageTagFile = new File(imageTagFileProp) require(imageTagFile.isFile, s"No file found for image tag at ${imageTagFile.getAbsolutePath}.") Files.toString(imageTagFile, Charsets.UTF_8).trim } def getTestImageRepo: String = { val imageRepo = System.getProperty("spark.kubernetes.test.imageRepo") require(imageRepo != null, "Image repo must be provided in system properties.") imageRepo } }
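The counterpart to the write-oriented examples above: getTestImageTag reads a whole file into a String with Files.toString. A minimal round trip (the file name and tag value are made up):

import java.io.File
import com.google.common.base.Charsets
import com.google.common.io.Files

object ReadTagSketch extends App {
  val tagFile = new File(Files.createTempDir(), "imageTag.txt")
  Files.write("v2.4.0\n", tagFile, Charsets.UTF_8)
  val tag = Files.toString(tagFile, Charsets.UTF_8).trim   // trim drops the trailing newline
  println(tag)                                             // v2.4.0
}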
Example 43
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 44
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object CollabFilteringPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => ((r._1, r._2), 1)) .reduceByKey(_ + _) .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
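The only Guava call in the pipeline above is Files.getNameWithoutExtension, which operates purely on the path string and performs no I/O. A tiny sketch with an illustrative path:

import com.google.common.io.Files

object FileNameSketch extends App {
  val path = "/data/eeg/subject12.csv"
  println(Files.getNameWithoutExtension(path))   // subject12
  println(Files.getFileExtension(path))          // csv
}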
Example 45
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object FPMiningPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => (r._1, r._2)) .distinct() .groupByKey() .map(r => r._2.mkString(" ")) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
Example 46
Source File: KafkaTestBroker.scala From CMAK with Apache License 2.0 | 5 votes |
package kafka.test import java.io.File import java.util.Properties import com.google.common.io.Files import kafka.server.{KafkaConfig, KafkaServerStartable} import org.apache.curator.framework.CuratorFramework import org.apache.curator.test.InstanceSpec import scala.util.Try class KafkaTestBroker(zookeeper: CuratorFramework, zookeeperConnectionString: String) { val AdminPath = "/admin" val BrokersPath = "/brokers" val ClusterPath = "/cluster" val ConfigPath = "/config" val ControllerPath = "/controller" val ControllerEpochPath = "/controller_epoch" val IsrChangeNotificationPath = "/isr_change_notification" val LogDirEventNotificationPath = "/log_dir_event_notification" val KafkaAclPath = "/kafka-acl" val KafkaAclChangesPath = "/kafka-acl-changes" val ConsumersPath = "/consumers" val ClusterIdPath = s"$ClusterPath/id" val BrokerIdsPath = s"$BrokersPath/ids" val BrokerTopicsPath = s"$BrokersPath/topics" val ReassignPartitionsPath = s"$AdminPath/reassign_partitions" val DeleteTopicsPath = s"$AdminPath/delete_topics" val PreferredReplicaLeaderElectionPath = s"$AdminPath/preferred_replica_election" val BrokerSequenceIdPath = s"$BrokersPath/seqid" val ConfigChangesPath = s"$ConfigPath/changes" val ConfigUsersPath = s"$ConfigPath/users" val ConfigBrokersPath = s"$ConfigPath/brokers" val ProducerIdBlockPath = "/latest_producer_id_block" private[this] val port: Int = InstanceSpec.getRandomPort private[this] val config: KafkaConfig = buildKafkaConfig(zookeeperConnectionString) private[this] val kafkaServerStartable: KafkaServerStartable = new KafkaServerStartable(config) kafkaServerStartable.startup() //wait until broker shows up in zookeeper var count = 0 while(count < 10 && zookeeper.checkExists().forPath(BrokerIdsPath + "/0") == null) { count += 1 println("Waiting for broker ...") println(Option(zookeeper.getData.forPath(BrokerIdsPath + "/0")).map(kafka.manager.asString)) Thread.sleep(1000) } private def buildKafkaConfig(zookeeperConnectionString: String): KafkaConfig = { val p: Properties = new Properties p.setProperty("zookeeper.connect", zookeeperConnectionString) p.setProperty("broker.id", "0") p.setProperty("port", "" + port) p.setProperty("log.dirs", getLogDir) p.setProperty("log.retention.hours", "1") p.setProperty("offsets.topic.replication.factor", "1") p.setProperty("delete.topic.enable", "true") new KafkaConfig(p) } private def getLogDir: String = { val logDir: File = Files.createTempDir logDir.deleteOnExit() logDir.getAbsolutePath } def getBrokerConnectionString: String = s"localhost:$port" def getPort: Int = port def shutdown() { Try(kafkaServerStartable.shutdown()) } }
Example 47
Source File: FileUtils.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.util import java.io.{File, IOException} import java.nio.charset.Charset import com.google.common.io.Files object FileUtils { private val UTF8 = Charset.forName("UTF-8") def write(file: File, str: String): Unit = { Files.write(str, file, UTF8) } def read(file: File): String = { Files.asCharSource(file, UTF8).read() } def writeByteArrayToFile(file: File, bytes: Array[Byte]): Unit = { Files.write(bytes, file) } def readFileToByteArray(file: File): Array[Byte] = { Files.toByteArray(file) } def forceMkdir(directory: File): Unit = { if (directory.exists() && directory.isFile) { throw new IOException(s"Failed to create directory ${directory.toString}, it already exist") } Files.createParentDirs(directory) directory.mkdir() } }
Example 48
Source File: FileUtilsSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.util

import com.google.common.io.Files
import java.io.File
import java.util

import org.scalatest.FlatSpec

class FileUtilsSpec extends FlatSpec {
  val TXT =
    """
      |This is a multiple line
      |text
      | """.stripMargin

  it should "read/write string correctly" in {
    val file = File.createTempFile("fileutilspec", ".test")
    FileUtils.write(file, TXT)
    assert(FileUtils.read(file) == TXT)
    file.delete()
  }

  it should "read/write bytes array correctly" in {
    val file = File.createTempFile("fileutilspec", ".test")
    val bytes = TXT.toCharArray.map(_.toByte)
    FileUtils.writeByteArrayToFile(file, bytes)
    assert(util.Arrays.equals(bytes, FileUtils.readFileToByteArray(file)))
    file.delete()
  }

  it should "create directory and all parents" in {
    val temp = Files.createTempDir()
    val parent = new File(temp, "sub1")
    val child = new File(parent, "sub2" + File.separator)
    FileUtils.forceMkdir(child)
    assert(child.exists())
    assert(child.isDirectory)

    child.delete()
    parent.delete()
    temp.delete()
  }
}
Example 49
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 50
Source File: YarnShuffleIntegrationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 51
Source File: HistoryServerArgumentsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.io.File import java.nio.charset.StandardCharsets._ import com.google.common.io.Files import org.apache.spark._ import org.apache.spark.util.Utils class HistoryServerArgumentsSuite extends SparkFunSuite { private val logDir = new File("src/test/resources/spark-events") private val conf = new SparkConf() .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) .set("spark.history.fs.updateInterval", "1") .set("spark.testing", "true") test("No Arguments Parsing") { val argStrings = Array.empty[String] val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) assert(conf.get("spark.history.fs.updateInterval") === "1") assert(conf.get("spark.testing") === "true") } test("Directory Arguments Parsing --dir or -d") { val argStrings = Array("--dir", "src/test/resources/spark-events1") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1") } test("Directory Param can also be set directly") { val argStrings = Array("src/test/resources/spark-events2") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2") } test("Properties File Arguments Parsing --properties-file") { val tmpDir = Utils.createTempDir() val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) try { Files.write("spark.test.CustomPropertyA blah\n" + "spark.test.CustomPropertyB notblah\n", outFile, UTF_8) val argStrings = Array("--properties-file", outFile.getAbsolutePath) val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.test.CustomPropertyA") === "blah") assert(conf.get("spark.test.CustomPropertyB") === "notblah") } finally { Utils.deleteRecursively(tmpDir) } } }
Example 52
Source File: HttpFileServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import com.google.common.io.Files import org.apache.spark.util.Utils private[spark] class HttpFileServer( conf: SparkConf, securityManager: SecurityManager, requestedPort: Int = 0) extends Logging { var baseDir : File = null var fileDir : File = null var jarDir : File = null var httpServer : HttpServer = null var serverUri : String = null def initialize() { baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd") fileDir = new File(baseDir, "files") jarDir = new File(baseDir, "jars") fileDir.mkdir() jarDir.mkdir() logInfo("HTTP File server directory is " + baseDir) httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server") httpServer.start() serverUri = httpServer.uri logDebug("HTTP file server started at: " + serverUri) } def stop() { httpServer.stop() } def addFile(file: File) : String = { addFileToDir(file, fileDir) serverUri + "/files/" + file.getName } def addJar(file: File) : String = { addFileToDir(file, jarDir) serverUri + "/jars/" + file.getName } def addFileToDir(file: File, dir: File) : String = { // Check whether the file is a directory. If it is, throw a more meaningful exception. // If we don't catch this, Guava throws a very confusing error message: // java.io.FileNotFoundException: [file] (No such file or directory) // even though the directory ([file]) exists. if (file.isDirectory) { throw new IllegalArgumentException(s"$file cannot be a directory.") } Files.copy(file, new File(dir, file.getName)) dir + "/" + file.getName } }
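addFileToDir above relies on Files.copy(File, File). A self-contained sketch of that copy (file names are placeholders):

import java.io.File
import java.nio.charset.StandardCharsets
import com.google.common.io.Files

object CopySketch extends App {
  val srcDir = Files.createTempDir()
  val dstDir = Files.createTempDir()
  val src = new File(srcDir, "app.jar")
  Files.write("not really a jar", src, StandardCharsets.UTF_8)
  Files.copy(src, new File(dstDir, src.getName))   // overwrites the destination if it already exists
  println(new File(dstDir, "app.jar").exists())    // true
}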
Example 53
Source File: ExampleData.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.examples.util

import java.io.{File, FileOutputStream}

import com.google.common.io.{ByteStreams, Files}

import scala.util.control.NonFatal

object ExampleData {
  lazy val path: String = {
    try {
      val resource = "data.tsv"
      val tmpfile = new File(Files.createTempDir(), resource).getAbsolutePath
      val input = getClass.getResourceAsStream(resource)
      val output = new FileOutputStream(tmpfile)
      ByteStreams.copy(input, output)
      input.close()
      output.close()
      tmpfile
    } catch {
      case NonFatal(e) => throw new RuntimeException("Could not copy example data file to temp directory", e)
    }
  }
}
Example 54
Source File: MultipartFileTest.scala From fintrospect with Apache License 2.0 | 5 votes |
package io.fintrospect.parameters import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import com.google.common.io.Files import com.twitter.io.{Buf, Bufs} import org.scalatest.{FunSpec, Matchers} class MultipartFileTest extends FunSpec with Matchers { describe("OnDiskMultiPartFile") { it("converts toFileElement") { val tempFile = File.createTempFile("temp", "file") Files.write("hello bob", tempFile, UTF_8) tempFile.deleteOnExit() Bufs.asUtf8String(OnDiskMultiPartFile("file", tempFile, None).toFileElement("hello").content) shouldBe "hello bob" } } describe("InMemoryMultiPartFile") { it("converts toFileElement") { Bufs.asUtf8String(InMemoryMultiPartFile("file", Buf.Utf8("hello bob"), None).toFileElement("hello").content) shouldBe "hello bob" } } }
Example 55
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 56
Source File: YarnShuffleIntegrationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }