com.google.common.io.Files Scala Examples
The following examples show how to use com.google.common.io.Files.
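Most of the examples below lean on the same handful of com.google.common.io.Files calls: Files.createTempDir() for scratch directories, Files.write and Files.toString for small text files, and Files.copy for copying file contents. The short, self-contained sketch below pulls those calls together; the object name, file names, and contents are illustrative only and are not taken from any of the projects listed here.

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import com.google.common.io.Files

object GuavaFilesTour {
  def main(args: Array[String]): Unit = {
    val tmpDir: File = Files.createTempDir()           // unique temporary directory
    val source = new File(tmpDir, "source.txt")
    Files.write("hello guava\n", source, UTF_8)        // write a string to a file
    val text = Files.toString(source, UTF_8)           // read the file back as a string
    Files.copy(source, new File(tmpDir, "copy.txt"))   // copy the file's contents
    println(text.trim)
  }
}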
Example 1
Source File: QueryPartitionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.util.Utils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils

class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
  import hiveContext.implicits._

  test("SPARK-5068: query data when path doesn't exist") {
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      val tmpDir = Files.createTempDir()
      // create the table for test
      sql(s"CREATE TABLE table_with_partition(key int,value string) " +
        s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
        "SELECT key,value FROM testData")

      // test for the exist path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect
          ++ testData.toDF.collect ++ testData.toDF.collect)

      // delete the path of one partition
      tmpDir.listFiles
        .find { f => f.isDirectory && f.getName().startsWith("ds=") }
        .foreach { f => Utils.deleteRecursively(f) }

      // test for after delete the path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect)

      sql("DROP TABLE table_with_partition")
      sql("DROP TABLE createAndInsertTest")
    }
  }
}
Example 2
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

object DatasetExample {

  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
      opt[String]("input")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ // for implicit conversions

    // Load input data
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    }
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame = df.select("label")
    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
    val numLabels = labels.count()
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame = df.select("features")
    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset = sqlContext.read.parquet(outputDir)

    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
    val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")

    sc.stop()
  }
}
Example 3
Source File: QueryPartitionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.util.Utils

class QueryPartitionSuite extends QueryTest {
  import org.apache.spark.sql.hive.test.TestHive.implicits._

  test("SPARK-5068: query data when path doesn't exist"){
    val testData = TestHive.sparkContext.parallelize(
      (1 to 10).map(i => TestData(i, i.toString))).toDF()
    testData.registerTempTable("testData")

    val tmpDir = Files.createTempDir()
    // create the table for test
    sql(s"CREATE TABLE table_with_partition(key int,value string) " +
      s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
      "SELECT key,value FROM testData")

    // test for the exist path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
        ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)

    // delete the path of one partition
    tmpDir.listFiles
      .find { f => f.isDirectory && f.getName().startsWith("ds=") }
      .foreach { f => Utils.deleteRecursively(f) }

    // test for after delete the path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)

    sql("DROP TABLE table_with_partition")
    sql("DROP TABLE createAndInsertTest")
  }
}
Example 4
Source File: HttpFileServer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import com.google.common.io.Files

import org.apache.spark.util.Utils

private[spark] class HttpFileServer(
    conf: SparkConf,
    securityManager: SecurityManager,
    requestedPort: Int = 0)
  extends Logging {

  var baseDir : File = null
  var fileDir : File = null
  var jarDir : File = null
  var httpServer : HttpServer = null
  var serverUri : String = null

  def initialize() {
    baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd")
    fileDir = new File(baseDir, "files")
    jarDir = new File(baseDir, "jars")
    fileDir.mkdir()
    jarDir.mkdir()
    logInfo("HTTP File server directory is " + baseDir)
    httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server")
    httpServer.start()
    serverUri = httpServer.uri
    logDebug("HTTP file server started at: " + serverUri)
  }

  def stop() {
    httpServer.stop()

    // If we only stop sc, but the driver process still run as a services then we need to delete
    // the tmp dir, if not, it will create too many tmp dirs
    try {
      Utils.deleteRecursively(baseDir)
    } catch {
      case e: Exception =>
        logWarning(s"Exception while deleting Spark temp dir: ${baseDir.getAbsolutePath}", e)
    }
  }

  def addFile(file: File) : String = {
    addFileToDir(file, fileDir)
    serverUri + "/files/" + file.getName
  }

  def addJar(file: File) : String = {
    addFileToDir(file, jarDir)
    serverUri + "/jars/" + file.getName
  }

  def addFileToDir(file: File, dir: File) : String = {
    // Check whether the file is a directory. If it is, throw a more meaningful exception.
    // If we don't catch this, Guava throws a very confusing error message:
    //   java.io.FileNotFoundException: [file] (No such file or directory)
    // even though the directory ([file]) exists.
    if (file.isDirectory) {
      throw new IllegalArgumentException(s"$file cannot be a directory.")
    }
    Files.copy(file, new File(dir, file.getName))
    dir + "/" + file.getName
  }

}
Example 5
Source File: Unzip.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.workflowexecutor

import java.io._
import java.util.zip.ZipInputStream

import scala.reflect.io.Path

import com.google.common.io.Files

import ai.deepsense.commons.utils.Logging

object Unzip extends Logging {

  def unzipAll(inputFile: String): String =
    unzipToTmp(inputFile, _ => true)

  private def transferImpl(in: InputStream, out: OutputStream, close: Boolean): Unit = {
    try {
      val buffer = new Array[Byte](4096)
      def read(): Unit = {
        val byteCount = in.read(buffer)
        if (byteCount >= 0) {
          out.write(buffer, 0, byteCount)
          read()
        }
      }
      read()
      out.close()
    } finally {
      if (close) {
        in.close()
      }
    }
  }
}
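The object above calls unzipToTmp, which is not included in this excerpt. Purely as an illustration — the helper below is hypothetical and not the project's actual implementation — an extraction routine built on the same transferImpl and on Files.createTempDir() could look roughly like this, placed inside the Unzip object:

  // Hypothetical sketch only: extract entries matching `filter` into a fresh temp dir.
  private def unzipToTmp(inputFile: String, filter: String => Boolean): String = {
    val tempDir = Files.createTempDir()
    val zis = new ZipInputStream(new FileInputStream(inputFile))
    try {
      var entry = zis.getNextEntry
      while (entry != null) {
        if (!entry.isDirectory && filter(entry.getName)) {
          val target = new File(tempDir, entry.getName)
          target.getParentFile.mkdirs()
          // Do not close the shared ZipInputStream here; more entries may follow.
          transferImpl(zis, new FileOutputStream(target), close = false)
        }
        entry = zis.getNextEntry
      }
    } finally {
      zis.close()
    }
    tempDir.getAbsolutePath
  }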
Example 6
Source File: ZookeeperFunSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.ha

import com.google.common.io.Files
import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.curator.test.TestingServer

import org.apache.spark.{KyuubiConf, KyuubiSparkUtil, SparkConf, SparkFunSuite}
import org.apache.spark.KyuubiConf._

trait ZookeeperFunSuite extends SparkFunSuite {

  var zkServer: TestingServer = _
  var connectString: String = _
  val conf = new SparkConf(loadDefaults = true)
  KyuubiSparkUtil.setupCommonConfig(conf)
  conf.set(KyuubiConf.FRONTEND_BIND_PORT.key, "0")

  var zooKeeperClient: CuratorFramework = _

  override def beforeAll(): Unit = {
    zkServer = new TestingServer(2181, Files.createTempDir(), true)
    connectString = zkServer.getConnectString
    conf.set(HA_ZOOKEEPER_QUORUM.key, connectString)
    conf.set(HA_ZOOKEEPER_CONNECTION_BASESLEEPTIME.key, "100ms")
    conf.set(HA_ZOOKEEPER_SESSION_TIMEOUT.key, "15s")
    conf.set(HA_ZOOKEEPER_CONNECTION_MAX_RETRIES.key, "0")
    zooKeeperClient = CuratorFrameworkFactory.builder().connectString(connectString)
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .build()
    zooKeeperClient.start()
    super.beforeAll()
  }

  override def afterAll(): Unit = {
    Option(zooKeeperClient).foreach(_.close())
    Option(zkServer).foreach(_.stop())
    System.clearProperty(HA_ZOOKEEPER_QUORUM.key)
    System.clearProperty(HA_ENABLED.key)
    super.afterAll()
  }
}
Example 7
Source File: FileWrite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package scalaDemo

import java.io.{File, FileWriter}
import java.util.Random

import com.google.common.base.Charsets.UTF_8
import com.google.common.io.Files

import org.apache.spark.util.Utils

object FileWrite {

  def main(args: Array[String]) {
    val outFile = File.createTempFile("test-load-spark-properties", "test")
    Files.write("spark.test.fileNameLoadA true\n" +
      "spark.test.fileNameLoadB 1\n", outFile, UTF_8)

    val writer = new FileWriter(
      new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_age_data.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      writer.write(i + " " + rand.nextInt(100))
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
  }
}
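A side note on the Files.write and Files.toString overloads used here and in several later examples: in recent Guava releases these character-based helpers are deprecated in favour of Files.asCharSink and Files.asCharSource. If you are on a newer Guava version, the equivalent calls look roughly like this (a sketch, not part of the original example; the object and file names are illustrative):

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import com.google.common.io.Files

object CharSinkDemo {
  def main(args: Array[String]): Unit = {
    val outFile = File.createTempFile("guava-charsink-demo", ".properties")
    // Replacement for Files.write(CharSequence, File, Charset)
    Files.asCharSink(outFile, UTF_8).write("spark.test.fileNameLoadA true\n")
    // Replacement for Files.toString(File, Charset)
    println(Files.asCharSource(outFile, UTF_8).read())
  }
}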
Example 8
Source File: QueryPartitionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.util.Utils

// Query partition suite
class QueryPartitionSuite extends QueryTest with SQLTestUtils {

  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
  import ctx.implicits._

  protected def _sqlContext = ctx

  // query data when the path doesn't exist
  test("SPARK-5068: query data when path doesn't exist"){
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = ctx.sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      val tmpDir = Files.createTempDir()
      // create the table for test
      sql(s"CREATE TABLE table_with_partition(key int,value string) " +
        s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
        "SELECT key,value FROM testData")
      sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
        "SELECT key,value FROM testData")

      // test for the existing path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect
          ++ testData.toDF.collect ++ testData.toDF.collect)

      // delete the path of one partition
      tmpDir.listFiles
        .find { f => f.isDirectory && f.getName().startsWith("ds=") }
        .foreach { f => Utils.deleteRecursively(f) }

      // test again after deleting the path
      checkAnswer(sql("select key,value from table_with_partition"),
        testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect)

      sql("DROP TABLE table_with_partition")
      sql("DROP TABLE createAndInsertTest")
    }
  }
}
Example 9
Source File: SparkKubernetesClientFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files import io.fabric8.kubernetes.client.{ConfigBuilder, DefaultKubernetesClient, KubernetesClient} import io.fabric8.kubernetes.client.utils.HttpClientUtils import okhttp3.Dispatcher import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.util.ThreadUtils private[spark] object SparkKubernetesClientFactory { def createKubernetesClient( master: String, namespace: Option[String], kubernetesAuthConfPrefix: String, sparkConf: SparkConf, defaultServiceAccountToken: Option[File], defaultServiceAccountCaCert: Option[File]): KubernetesClient = { val oauthTokenFileConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX" val oauthTokenConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX" val oauthTokenFile = sparkConf.getOption(oauthTokenFileConf) .map(new File(_)) .orElse(defaultServiceAccountToken) val oauthTokenValue = sparkConf.getOption(oauthTokenConf) KubernetesUtils.requireNandDefined( oauthTokenFile, oauthTokenValue, s"Cannot specify OAuth token through both a file $oauthTokenFileConf and a " + s"value $oauthTokenConf.") val caCertFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") .orElse(defaultServiceAccountCaCert.map(_.getAbsolutePath)) val clientKeyFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX") val clientCertFile = sparkConf .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX") val dispatcher = new Dispatcher( ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")) val config = new ConfigBuilder() .withApiVersion("v1") .withMasterUrl(master) .withWebsocketPingInterval(0) .withOption(oauthTokenValue) { (token, configBuilder) => configBuilder.withOauthToken(token) }.withOption(oauthTokenFile) { (file, configBuilder) => configBuilder.withOauthToken(Files.toString(file, Charsets.UTF_8)) }.withOption(caCertFile) { (file, configBuilder) => configBuilder.withCaCertFile(file) }.withOption(clientKeyFile) { (file, configBuilder) => configBuilder.withClientKeyFile(file) }.withOption(clientCertFile) { (file, configBuilder) => configBuilder.withClientCertFile(file) }.withOption(namespace) { (ns, configBuilder) => configBuilder.withNamespace(ns) }.build() val baseHttpClient = HttpClientUtils.createHttpClient(config) val httpClientWithCustomDispatcher = baseHttpClient.newBuilder() .dispatcher(dispatcher) .build() new DefaultKubernetesClient(httpClientWithCustomDispatcher, config) } private implicit class OptionConfigurableConfigBuilder(val configBuilder: ConfigBuilder) extends AnyVal { def withOption[T] (option: Option[T]) (configurator: ((T, ConfigBuilder) => ConfigBuilder)): ConfigBuilder = { option.map { opt => configurator(opt, configBuilder) }.getOrElse(configBuilder) } } }
Example 10
Source File: SparkPodInitContainerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import java.io.File import java.util.UUID import com.google.common.base.Charsets import com.google.common.io.Files import org.mockito.Mockito import org.scalatest.BeforeAndAfter import org.scalatest.mockito.MockitoSugar._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.util.Utils class SparkPodInitContainerSuite extends SparkFunSuite with BeforeAndAfter { private val DOWNLOAD_JARS_SECRET_LOCATION = createTempFile("txt") private val DOWNLOAD_FILES_SECRET_LOCATION = createTempFile("txt") private var downloadJarsDir: File = _ private var downloadFilesDir: File = _ private var downloadJarsSecretValue: String = _ private var downloadFilesSecretValue: String = _ private var fileFetcher: FileFetcher = _ override def beforeAll(): Unit = { downloadJarsSecretValue = Files.toString( new File(DOWNLOAD_JARS_SECRET_LOCATION), Charsets.UTF_8) downloadFilesSecretValue = Files.toString( new File(DOWNLOAD_FILES_SECRET_LOCATION), Charsets.UTF_8) } before { downloadJarsDir = Utils.createTempDir() downloadFilesDir = Utils.createTempDir() fileFetcher = mock[FileFetcher] } after { downloadJarsDir.delete() downloadFilesDir.delete() } test("Downloads from remote server should invoke the file fetcher") { val sparkConf = getSparkConfForRemoteFileDownloads val initContainerUnderTest = new SparkPodInitContainer(sparkConf, fileFetcher) initContainerUnderTest.run() Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/jar1.jar", downloadJarsDir) Mockito.verify(fileFetcher).fetchFile("hdfs://localhost:9000/jar2.jar", downloadJarsDir) Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/file.txt", downloadFilesDir) } private def getSparkConfForRemoteFileDownloads: SparkConf = { new SparkConf(true) .set(INIT_CONTAINER_REMOTE_JARS, "http://localhost:9000/jar1.jar,hdfs://localhost:9000/jar2.jar") .set(INIT_CONTAINER_REMOTE_FILES, "http://localhost:9000/file.txt") .set(JARS_DOWNLOAD_LOCATION, downloadJarsDir.getAbsolutePath) .set(FILES_DOWNLOAD_LOCATION, downloadFilesDir.getAbsolutePath) } private def createTempFile(extension: String): String = { val dir = Utils.createTempDir() val file = new File(dir, s"${UUID.randomUUID().toString}.$extension") Files.write(UUID.randomUUID().toString, file, Charsets.UTF_8) file.getAbsolutePath } }
Example 11
Source File: QueryPartitionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ test("SPARK-5068: query data when path doesn't exist") { withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") val tmpDir = Files.createTempDir() // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) sql("DROP TABLE IF EXISTS table_with_partition") sql("DROP TABLE IF EXISTS createAndInsertTest") } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 12
Source File: HistoryServerArgumentsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array.empty[String]
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 13
Source File: DataFrameExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import com.google.common.io.Files import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SQLContext} object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DataFrameExample with $params") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Files.createTempDir() tmpDir.deleteOnExit() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = sqlContext.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() sc.stop() } } // scalastyle:on println
Example 14
Source File: LibSVMRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File

import com.google.common.base.Charsets
import com.google.common.io.Files

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  var tempDir: File = _
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    tempDir = Utils.createTempDir()
    val file = new File(tempDir, "part-00000")
    Files.write(lines, file, Charsets.US_ASCII)
    path = tempDir.toURI.toString
  }

  override def afterAll(): Unit = {
    Utils.deleteRecursively(tempDir)
    super.afterAll()
  }

  test("select as sparse vector") {
    val df = sqlContext.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = sqlContext.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }
}
Example 15
Source File: HistoryServerArgumentsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array.empty[String]
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 16
Source File: YarnShuffleIntegrationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registed exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, UTF_8) } } }
Example 17
Source File: HttpFileServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import com.google.common.io.Files import org.apache.spark.util.Utils private[spark] class HttpFileServer( conf: SparkConf, securityManager: SecurityManager, requestedPort: Int = 0) extends Logging { var baseDir : File = null var fileDir : File = null var jarDir : File = null var httpServer : HttpServer = null var serverUri : String = null def initialize() { baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd") fileDir = new File(baseDir, "files") jarDir = new File(baseDir, "jars") fileDir.mkdir() jarDir.mkdir() logInfo("HTTP File server directory is " + baseDir) httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server") httpServer.start() serverUri = httpServer.uri logDebug("HTTP file server started at: " + serverUri) } def stop() { httpServer.stop() // If we only stop sc, but the driver process still run as a services then we need to delete // the tmp dir, if not, it will create too many tmp dirs try { Utils.deleteRecursively(baseDir) } catch { case e: Exception => logWarning(s"Exception while deleting Spark temp dir: ${baseDir.getAbsolutePath}", e) } } def addFile(file: File) : String = { addFileToDir(file, fileDir) serverUri + "/files/" + Utils.encodeFileNameToURIRawPath(file.getName) } def addJar(file: File) : String = { addFileToDir(file, jarDir) serverUri + "/jars/" + Utils.encodeFileNameToURIRawPath(file.getName) } def addFileToDir(file: File, dir: File) : String = { // Check whether the file is a directory. If it is, throw a more meaningful exception. // If we don't catch this, Guava throws a very confusing error message: // java.io.FileNotFoundException: [file] (No such file or directory) // even though the directory ([file]) exists. if (file.isDirectory) { throw new IllegalArgumentException(s"$file cannot be a directory.") } Files.copy(file, new File(dir, file.getName)) dir + "/" + Utils.encodeFileNameToURIRawPath(file.getName) } }
Example 18
Source File: HistoryServerArgumentsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import java.io.File
import java.nio.charset.StandardCharsets._

import com.google.common.io.Files

import org.apache.spark._
import org.apache.spark.util.Utils

class HistoryServerArgumentsSuite extends SparkFunSuite {

  private val logDir = new File("src/test/resources/spark-events")
  private val conf = new SparkConf()
    .set("spark.history.fs.logDirectory", logDir.getAbsolutePath)
    .set("spark.history.fs.updateInterval", "1")
    .set("spark.testing", "true")

  test("No Arguments Parsing") {
    val argStrings = Array[String]()
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath)
    assert(conf.get("spark.history.fs.updateInterval") === "1")
    assert(conf.get("spark.testing") === "true")
  }

  test("Directory Arguments Parsing --dir or -d") {
    val argStrings = Array("--dir", "src/test/resources/spark-events1")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1")
  }

  test("Directory Param can also be set directly") {
    val argStrings = Array("src/test/resources/spark-events2")
    val hsa = new HistoryServerArguments(conf, argStrings)
    assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2")
  }

  test("Properties File Arguments Parsing --properties-file") {
    val tmpDir = Utils.createTempDir()
    val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir)
    try {
      Files.write("spark.test.CustomPropertyA blah\n" +
        "spark.test.CustomPropertyB notblah\n", outFile, UTF_8)
      val argStrings = Array("--properties-file", outFile.getAbsolutePath)
      val hsa = new HistoryServerArguments(conf, argStrings)
      assert(conf.get("spark.test.CustomPropertyA") === "blah")
      assert(conf.get("spark.test.CustomPropertyB") === "notblah")
    } finally {
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 19
Source File: HBaseLocalClient.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.hbase.utilities import java.io.File import scala.collection.mutable.ArrayBuffer import com.google.common.io.Files import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.util._ import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} import com.paypal.gimel.common.catalog.Field import com.paypal.gimel.hbase.DataSet class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll { var sparkSession : SparkSession = _ var dataSet: DataSet = _ val hbaseTestingUtility = new HBaseTestingUtility() val tableName = "test_table" val cfs = Array("personal", "professional") val columns = Array("id", "name", "age", "address", "company", "designation", "salary") val fields = columns.map(col => new Field(col)) val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] protected override def beforeAll(): Unit = { val tempDir: File = Files.createTempDir tempDir.deleteOnExit hbaseTestingUtility.startMiniCluster() SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration createTable(tableName, cfs) val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") sparkSession = SparkSession.builder() .master("local") .appName("HBase Test") .config(conf) .getOrCreate() val listener = new QueryExecutionListener { // Only test successful case here, so no need to implement `onFailure` override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { metrics += ((funcName, qe, duration)) } } sparkSession.listenerManager.register(listener) sparkSession.sparkContext.setLogLevel("ERROR") dataSet = new DataSet(sparkSession) } protected override def afterAll(): Unit = { hbaseTestingUtility.shutdownMiniCluster() sparkSession.close() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { hbaseTestingUtility.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => println("No table = " + name + " found") } hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs) } // Mocks data for testing def mockDataInDataFrame(numberOfRows: Int): DataFrame = { def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }""" val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) } val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts) val dataFrame: DataFrame = sparkSession.read.json(rdd) dataFrame } }
Example 20
Source File: ApplicationWithProcess.scala From aloha with Apache License 2.0 | 5 votes |
package me.jrwang.aloha.app import java.io.File import java.nio.charset.StandardCharsets import scala.collection.JavaConverters._ import scala.concurrent.Promise import com.google.common.io.Files import me.jrwang.aloha.common.Logging import me.jrwang.aloha.common.util.{FileAppender, Utils} abstract class ApplicationWithProcess extends AbstractApplication with Logging { private var process: Process = _ private var stdoutAppender: FileAppender = _ private var stderrAppender: FileAppender = _ // Timeout to wait for when trying to terminate an app. private val APP_TERMINATE_TIMEOUT_MS = 10 * 1000 def getProcessBuilder(): ProcessBuilder private var stateMonitorThread: Thread = _ override def start(): Promise[ExitState] = { val processBuilder = getProcessBuilder() val command = processBuilder.command() val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"") logInfo(s"Launch command: $formattedCommand") processBuilder.directory(appDir) process = processBuilder.start() // Redirect its stdout and stderr to files val stdout = new File(appDir, "stdout") stdoutAppender = FileAppender(process.getInputStream, stdout, alohaConf) val header = "Aloha Application Command: %s\n%s\n\n".format( formattedCommand, "=" * 40) val stderr = new File(appDir, "stderr") Files.write(header, stderr, StandardCharsets.UTF_8) stderrAppender = FileAppender(process.getErrorStream, stderr, alohaConf) stateMonitorThread = new Thread("app-state-monitor-thread") { override def run(): Unit = { val exitCode = process.waitFor() if(exitCode == 0) { result.success(ExitState(ExitCode.SUCCESS, Some("success"))) } else { result.success(ExitState(ExitCode.FAILED, Some("failed"))) } } } stateMonitorThread.start() result } override def shutdown(reason: Option[String]): Unit = { if (process != null) { logInfo("Killing process!") if (stdoutAppender != null) { stdoutAppender.stop() } if (stderrAppender != null) { stderrAppender.stop() } val exitCode = Utils.terminateProcess(process, APP_TERMINATE_TIMEOUT_MS) if (exitCode.isEmpty) { logWarning("Failed to terminate process: " + process + ". This process will likely be orphaned.") } } } }
Example 21
package org.apache.spark.sql import org.apache.spark.sql.execution.datasources.hbase.Logging import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.client.Table import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.{SparkContext, SparkConf} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } var spark: SparkSession = null var sc: SparkContext = null var sqlContext: SQLContext = null var df: DataFrame = null private[spark] var htu = new HBaseTestingUtility private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def defineCatalog(tName: String) = s"""{ |"table":{"namespace":"default", "name":"$tName"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin @deprecated(since = "04.12.2017(dd/mm/year)", message = "use `defineCatalog` instead") def catalog = defineCatalog(tableName) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.startMiniCluster SparkHBaseConf.conf = htu.getConfiguration logInfo(" - minicluster started") println(" - minicluster started") spark = SparkSession.builder() .master("local") .appName("HBaseTest") .config(conf) .getOrCreate() sqlContext = spark.sqlContext sc = spark.sparkContext } override def afterAll() { htu.shutdownMiniCluster() spark.stop() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 22
Source File: HBaseTestSuite.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import scala.collection.JavaConverters._ import com.google.common.io.Files import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{TableName, HBaseTestingUtility} import org.apache.spark.sql.execution.datasources.hbase.Logging import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class HBaseTestSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] var tableName: Array[Byte] = Bytes.toBytes("t1") private[spark] var columnFamily: Array[Byte] = Bytes.toBytes("cf0") private[spark] var columnFamilies: Array[Array[Byte]] = Array(Bytes.toBytes("cf0"), Bytes.toBytes("cf1"), Bytes.toBytes("cf2"), Bytes.toBytes("cf3"), Bytes.toBytes("cf4")) var table: Table = null // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") try { htu.deleteTable(TableName.valueOf(tableName)) //htu.createTable(TableName.valueOf(tableName), columnFamily, 2, Bytes.toBytes("abc"), Bytes.toBytes("xyz"), 2) } catch { case _ : Throwable => logInfo(" - no table " + Bytes.toString(tableName) + " found") } setupTable() } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ : Throwable => logError("teardown error") } } def setupTable() { val config = htu.getConfiguration htu.createMultiRegionTable(TableName.valueOf(tableName), columnFamilies) println("create htable t1") val connection = ConnectionFactory.createConnection(config) val r = connection.getRegionLocator(TableName.valueOf("t1")) table = connection.getTable(TableName.valueOf("t1")) val regionLocations = r.getAllRegionLocations.asScala.toSeq println(s"$regionLocations size: ${regionLocations.size}") (0 until 100).foreach { x => var put = new Put(Bytes.toBytes(s"row$x")) (0 until 5).foreach { y => put.addColumn(columnFamilies(y), Bytes.toBytes(s"c$y"), Bytes.toBytes(s"value $x $y")) } table.put(put) } } }
Example 23
Source File: LibFFMRelationSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.source.libffm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite import com.tencent.angel.sona.ml.util.MLlibTestSparkContext import org.apache.spark.util.SparkUtil class LibFFMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines0 = """ |1 0:1:1.0 1:3:2.0 2:5:3.0 |0 """.stripMargin val lines1 = """ |0 0:2:4.0 1:4:5.0 2:6:6.0 """.stripMargin val dir = SparkUtil.createTempDir() val succ = new File(dir, "_SUCCESS") val file0 = new File(dir, "part-00000") val file1 = new File(dir, "part-00001") Files.write("", succ, StandardCharsets.UTF_8) Files.write(lines0, file0, StandardCharsets.UTF_8) Files.write(lines1, file1, StandardCharsets.UTF_8) path = dir.getPath } override def afterAll(): Unit = { try { val prefix = "C:\\Users\\fitzwang\\AppData\\Local\\Temp\\" if (path.startsWith(prefix)) { SparkUtil.deleteRecursively(new File(path)) } } finally { super.afterAll() } } test("ffmIO"){ val df = spark.read.format("libffm").load(path) val metadata = df.schema(1).metadata val fieldSet = MetaSummary.getFieldSet(metadata) println(fieldSet.mkString("[", ",", "]")) val keyFieldMap = MetaSummary.getKeyFieldMap(metadata) println(keyFieldMap.mkString("[", ",", "]")) df.write.format("libffm").save("temp.libffm") } test("read_ffm"){ val df = spark.read.format("libffm").load(path) val metadata = df.schema(1).metadata val fieldSet = MetaSummary.getFieldSet(metadata) println(fieldSet.mkString("[", ",", "]")) val keyFieldMap = MetaSummary.getKeyFieldMap(metadata) println(keyFieldMap.mkString("[", ",", "]")) } }
Example 24
package org.apache.spark.sql import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility} import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.types.UTF8String import org.apache.spark.{SparkContext, SparkConf, Logging} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.JavaConverters._ class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") SparkHBaseConf.conf = htu.getConfiguration // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def catalog = s"""{ |"table":{"namespace":"default", "name":"table1"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ => logError("teardown error") } } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 25
Source File: TestUtils.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import java.io.{IOException, File}
import java.nio.ByteBuffer
import java.util

import scala.collection.immutable.HashSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import com.google.common.io.Files
import org.apache.avro.generic.GenericData

import org.apache.spark.sql.SQLContext

object TestUtils {

  def generateRandomByteBuffer(rand: Random, size: Int): ByteBuffer = {
    val bb = ByteBuffer.allocate(size)
    val arrayOfBytes = new Array[Byte](size)
    rand.nextBytes(arrayOfBytes)
    bb.put(arrayOfBytes)
  }

  def generateRandomMap(rand: Random, size: Int): java.util.Map[String, Int] = {
    val jMap = new util.HashMap[String, Int]()
    for (i <- 0 until size) {
      jMap.put(rand.nextString(5), i)
    }
    jMap
  }

  def generateRandomArray(rand: Random, size: Int): util.ArrayList[Boolean] = {
    val vec = new util.ArrayList[Boolean]()
    for (i <- 0 until size) {
      vec.add(rand.nextBoolean())
    }
    vec
  }
}
Example 26
Source File: MVMSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.recommendation import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, sum => brzSum} import com.github.cloudml.zen.ml.util._ import com.google.common.io.Files import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MVMSuite extends FunSuite with SharedSparkContext with Matchers { test("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) } val stepSize = 0.1 val regParam = 1e-2 val l2 = (regParam, regParam, regParam) val rank = 20 val useAdaGrad = true val trainSet = dataSet.cache() val fm = new FMClassification(trainSet, stepSize, l2, rank, useAdaGrad) val maxIter = 10 val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { fm.run(1) pps(i) = fm.saveModel().loss(trainSet) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ < 0).toDouble / ppsDiff.size > 0.05) val fmModel = fm.saveModel() val tempDir = Files.createTempDir() tempDir.deleteOnExit() val path = tempDir.toURI.toString fmModel.save(sc, path) val sameModel = FMModel.load(sc, path) assert(sameModel.k === fmModel.k) assert(sameModel.classification === fmModel.classification) assert(sameModel.factors.sortByKey().map(_._2).collect() === fmModel.factors.sortByKey().map(_._2).collect()) } ignore("url_combined classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpointDir = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpointDir) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) }.cache() val numFeatures = dataSet.first()._2.features.size val stepSize = 0.1 val numIterations = 500 val regParam = 1e-3 val rank = 20 val views = Array(20, numFeatures / 2, numFeatures).map(_.toLong) val useAdaGrad = true val useWeightedLambda = true val miniBatchFraction = 1 val Array(trainSet, testSet) = dataSet.randomSplit(Array(0.8, 0.2)) trainSet.cache() testSet.cache() val fm = new MVMClassification(trainSet, stepSize, views, regParam, 0.0, rank, useAdaGrad, useWeightedLambda, miniBatchFraction) fm.run(numIterations) val model = fm.saveModel() println(f"Test loss: ${model.loss(testSet.cache())}%1.4f") } }
Example 27
Source File: PgV3ProtocolTest.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server.service.postgresql.protocol.v3 import java.io.File import java.nio.charset.StandardCharsets import java.util.UUID import scala.sys.process._ import com.google.common.io.Files import org.xerial.snappy.OSInfo import org.apache.spark.sql.server.PgJdbcTest import org.apache.spark.util.Utils class PgV3ProtocolTest extends PgJdbcTest { // TODO: Replace `snappy-java` with `commons.lang3.SystemUtils` private val isOsSupported = Seq("Linux", "Mac").contains(OSInfo.getOSName) private val isArchSupported = Seq("x86_64").contains(OSInfo.getArchName) private lazy val tempDirPath = Utils.createTempDir().getCanonicalPath private lazy val cmdPath = { val resourcePath = s"pgproto/${OSInfo.getOSName}/${OSInfo.getArchName}/pgproto" val classLoader = Thread.currentThread().getContextClassLoader val _cmdPath = classLoader.getResource(resourcePath).getPath // Set an executable flag explicitly here new File(_cmdPath).setExecutable(true) _cmdPath } def testIfSupported(testName: String)(testBody: => Unit) { if (isOsSupported && isArchSupported) { test(testName)(testBody) } else { ignore(s"$testName [not supported in env: " + s"os=${OSInfo.getOSName} arch=${OSInfo.getArchName}]")(testBody) } } def checkV3Protocol(messages: String, expected: String): Unit = { val msgDescriptionPath = s"$tempDirPath/${UUID.randomUUID().toString}.pgproto" val serverPort = serverInstance.listeningPort val command = s"$cmdPath -h localhost -d default -p $serverPort -f $msgDescriptionPath 2>&1" def normalize(s: String): String = s.trim.stripLineEnd.replaceAll("^\n", "") // Write a file containing messages in the temporary dir Files.write(normalize(messages), new File(msgDescriptionPath), StandardCharsets.UTF_8) val output = ("bash" :: "-c" :: command :: Nil).lineStream val actual = output.mkString("\n") assert(actual === normalize(expected)) } }
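The protocol test above writes its message script with Guava's Files.write(CharSequence, File, Charset). A stripped-down sketch of just that write/read pair (the file name and contents are made up):

import java.io.File
import java.nio.charset.StandardCharsets
import com.google.common.io.Files

object WriteStringSketch extends App {
  val target = new File(Files.createTempDir(), "messages.pgproto")
  Files.write("'Q' \"SELECT 1;\"\n", target, StandardCharsets.UTF_8)   // truncates and rewrites the file
  println(Files.toString(target, StandardCharsets.UTF_8))
}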
Example 28
Source File: Unzip.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.workflowexecutor import java.io._ import java.util.zip.ZipInputStream import scala.reflect.io.Path import com.google.common.io.Files import io.deepsense.commons.utils.Logging object Unzip extends Logging { def unzipAll(inputFile: String): String = unzipToTmp(inputFile, _ => true) private def transferImpl(in: InputStream, out: OutputStream, close: Boolean): Unit = { try { val buffer = new Array[Byte](4096) def read(): Unit = { val byteCount = in.read(buffer) if (byteCount >= 0) { out.write(buffer, 0, byteCount) read() } } read() out.close() } finally { if (close) { in.close() } } } }
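unzipAll above delegates to a private unzipToTmp helper that is not included in the snippet. Below is a hypothetical reconstruction of such a helper, shown only as a sketch of how Files.createTempDir and Files.createParentDirs fit together; it uses Guava's ByteStreams.copy in place of the project's transferImpl, and the String => Boolean predicate type is an assumption:

import java.io.{BufferedInputStream, File, FileInputStream, FileOutputStream}
import java.util.zip.ZipInputStream
import com.google.common.io.{ByteStreams, Files}

object UnzipSketch {
  // Hypothetical stand-in for the missing unzipToTmp; the predicate filters entries by name.
  def unzipToTmp(inputFile: String, accept: String => Boolean): String = {
    val tempDir = Files.createTempDir()
    val zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(inputFile)))
    try {
      Iterator.continually(zis.getNextEntry).takeWhile(_ != null).foreach { entry =>
        if (!entry.isDirectory && accept(entry.getName)) {
          val outFile = new File(tempDir, entry.getName)
          Files.createParentDirs(outFile)
          val out = new FileOutputStream(outFile)
          // reading from a ZipInputStream stops at the end of the current entry
          try ByteStreams.copy(zis, out) finally out.close()
        }
      }
    } finally {
      zis.close()
    }
    tempDir.getAbsolutePath
  }
}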
Example 29
Source File: QueryPartitionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.io.File import java.sql.Timestamp import com.google.common.io.Files import org.apache.hadoop.fs.FileSystem import org.apache.spark.internal.config._ import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import spark.implicits._ private def queryWhenPathNotExist(): Unit = { withTempView("testData") { withTable("table_with_partition", "createAndInsertTest") { withTempDir { tmpDir => val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.createOrReplaceTempView("testData") // create the table for test sql(s"CREATE TABLE table_with_partition(key int,value string) " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + "SELECT key,value FROM testData") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + "SELECT key,value FROM testData") // test for the exist path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData).union(testData)) // delete the path of one partition tmpDir.listFiles .find { f => f.isDirectory && f.getName().startsWith("ds=") } .foreach { f => Utils.deleteRecursively(f) } // test for after delete the path checkAnswer(sql("select key,value from table_with_partition"), testData.union(testData).union(testData)) } } } } test("SPARK-5068: query data when path doesn't exist") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") { queryWhenPathNotExist() } } test("Replace spark.sql.hive.verifyPartitionPath by spark.files.ignoreMissingFiles") { withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false") { sparkContext.conf.set(IGNORE_MISSING_FILES.key, "true") queryWhenPathNotExist() } } test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") // test for Cast expression in TableReader checkAnswer(sql("SELECT * FROM table_with_timestamp_partition"), Seq(Row(1, Timestamp.valueOf("2010-01-01 00:00:00.000")))) // test for Cast expression in HiveTableScanExec checkAnswer(sql("SELECT value FROM table_with_timestamp_partition " + "WHERE ts = '2010-01-01 00:00:00.000'"), Row(1)) } } }
Example 30
Source File: YarnShuffleIntegrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 31
Source File: HistoryServerArgumentsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.io.File import java.nio.charset.StandardCharsets._ import com.google.common.io.Files import org.apache.spark._ import org.apache.spark.util.Utils class HistoryServerArgumentsSuite extends SparkFunSuite { private val logDir = new File("src/test/resources/spark-events") private val conf = new SparkConf() .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) .set("spark.history.fs.updateInterval", "1") .set("spark.testing", "true") test("No Arguments Parsing") { val argStrings = Array.empty[String] val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) assert(conf.get("spark.history.fs.updateInterval") === "1") assert(conf.get("spark.testing") === "true") } test("Directory Arguments Parsing --dir or -d") { val argStrings = Array("--dir", "src/test/resources/spark-events1") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1") } test("Directory Param can also be set directly") { val argStrings = Array("src/test/resources/spark-events2") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2") } test("Properties File Arguments Parsing --properties-file") { val tmpDir = Utils.createTempDir() val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) try { Files.write("spark.test.CustomPropertyA blah\n" + "spark.test.CustomPropertyB notblah\n", outFile, UTF_8) val argStrings = Array("--properties-file", outFile.getAbsolutePath) val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.test.CustomPropertyA") === "blah") assert(conf.get("spark.test.CustomPropertyB") === "notblah") } finally { Utils.deleteRecursively(tmpDir) } } }
Example 32
Source File: PailDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.pail import java.util import com.backtype.hadoop.pail.{PailFormatFactory, PailSpec, PailStructure} import com.backtype.support.{Utils => PailUtils} import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FlatSpec} import org.scalatest.Matchers._ import scala.collection.JavaConverters._ import scala.util.Random case class User(name: String, age: Int) class UserPailStructure extends PailStructure[User] { override def isValidTarget(dirs: String*): Boolean = true override def getType: Class[_] = classOf[User] override def serialize(user: User): Array[Byte] = PailUtils.serialize(user) override def getTarget(user: User): util.List[String] = List(user.age % 10).map(_.toString).asJava override def deserialize(serialized: Array[Byte]): User = PailUtils.deserialize(serialized).asInstanceOf[User] } class PailDataSourceSpec extends FlatSpec with BeforeAndAfterAll with PailDataSource { private var spark: SparkSession = _ override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("PailDataSource").getOrCreate() } val userPailSpec = new PailSpec(PailFormatFactory.SEQUENCE_FILE, new UserPailStructure) "PailBasedReaderWriter" should "read/write user records from/into pail" in { val output = Files.createTempDir() val users = (1 to 100).map { index => User(s"foo$index", Random.nextInt(40))} spark.sparkContext.parallelize(users) .saveAsPail(output.getAbsolutePath, userPailSpec) val input = output.getAbsolutePath val total = spark.sparkContext.pailFile[User](input) .map(u => u.name) .count() total should be(100) FileUtils.deleteDirectory(output) } }
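Files.createTempDir() does not clean up after itself, which is why the spec above ends with FileUtils.deleteDirectory(output). A small loan-pattern helper (not part of the utils project) that pairs the two calls:

import java.io.File
import com.google.common.io.Files
import org.apache.commons.io.FileUtils

object TempDirLoan {
  def withTempDir[T](body: File => T): T = {
    val dir = Files.createTempDir()
    try body(dir)
    finally FileUtils.deleteDirectory(dir)   // recursive delete from commons-io
  }
}

// usage: TempDirLoan.withTempDir { dir => /* write and read the pail under dir.getAbsolutePath */ }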
Example 33
Source File: ParquetAvroDataSourceSpec.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.parquet import java.io.File import com.google.common.io.Files import com.indix.utils.spark.parquet.avro.ParquetAvroDataSource import org.apache.commons.io.FileUtils import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.SparkSession import org.scalactic.Equality import org.scalatest.Matchers.{be, convertToAnyShouldWrapper, equal} import org.scalatest.{BeforeAndAfterAll, FlatSpec} import java.util.{Arrays => JArrays} case class SampleAvroRecord(a: Int, b: String, c: Seq[String], d: Boolean, e: Double, f: collection.Map[String, String], g: Array[Byte]) class ParquetAvroDataSourceSpec extends FlatSpec with BeforeAndAfterAll with ParquetAvroDataSource { private var spark: SparkSession = _ implicit val sampleAvroRecordEq = new Equality[SampleAvroRecord] { override def areEqual(left: SampleAvroRecord, b: Any): Boolean = b match { case right: SampleAvroRecord => left.a == right.a && left.b == right.b && Equality.default[Seq[String]].areEqual(left.c, right.c) && left.d == right.d && left.e == right.e && Equality.default[collection.Map[String, String]].areEqual(left.f, right.f) && JArrays.equals(left.g, right.g) case _ => false } } override protected def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder().master("local[2]").appName("ParquetAvroDataSource").getOrCreate() } override protected def afterAll(): Unit = { try { spark.sparkContext.stop() } finally { super.afterAll() } } "AvroBasedParquetDataSource" should "read/write avro records as ParquetData" in { val outputLocation = Files.createTempDir().getAbsolutePath + "/output" val sampleRecords: Seq[SampleAvroRecord] = Seq( SampleAvroRecord(1, "1", List("a1"), true, 1.0d, Map("a1" -> "b1"), "1".getBytes), SampleAvroRecord(2, "2", List("a2"), false, 2.0d, Map("a2" -> "b2"), "2".getBytes), SampleAvroRecord(3, "3", List("a3"), true, 3.0d, Map("a3" -> "b3"), "3".getBytes), SampleAvroRecord(4, "4", List("a4"), true, 4.0d, Map("a4" -> "b4"), "4".getBytes), SampleAvroRecord(5, "5", List("a5"), false, 5.0d, Map("a5" -> "b5"), "5".getBytes) ) val sampleDf = spark.createDataFrame(sampleRecords) sampleDf.rdd.saveAvroInParquet(outputLocation, sampleDf.schema, CompressionCodecName.GZIP) val sparkVal = spark import sparkVal.implicits._ val records: Array[SampleAvroRecord] = spark.read.parquet(outputLocation).as[SampleAvroRecord].collect() records.length should be(5) // We use === to use the custom Equality defined above for comparing Array[Byte] // Ref - https://github.com/scalatest/scalatest/issues/491 records.sortBy(_.a) === sampleRecords.sortBy(_.a) FileUtils.deleteDirectory(new File(outputLocation)) } }
Example 34
Source File: MessageSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.indefinite import java.sql.Timestamp import java.util.UUID import akka.Done import akka.kafka.CommitterSettings import akka.kafka.ConsumerMessage.CommittableOffsetBatch import akka.kafka.scaladsl.Committer import akka.stream.scaladsl.{Flow, Keep, Sink} import com.github.mjakubowski84.parquet4s.{ChunkPathBuilder, ParquetStreams, ParquetWriter} import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.metadata.CompressionCodecName import scala.concurrent.Future import scala.concurrent.duration._ object MessageSink { case class Data(timestamp: Timestamp, word: String) val MaxChunkSize: Int = 128 val ChunkWriteTimeWindow: FiniteDuration = 10.seconds val WriteDirectoryName: String = "messages" } trait MessageSink { this: Akka => import MessageSink._ import MessageSource._ protected val baseWritePath: String = new Path(Files.createTempDir().getAbsolutePath, WriteDirectoryName).toString private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY) private lazy val committerSink = Flow.apply[Seq[Message]].map { messages => CommittableOffsetBatch(messages.map(_.committableOffset)) }.toMat(Committer.sink(CommitterSettings(system)))(Keep.right) def chunkPath: ChunkPathBuilder[Message] = { case (basePath, chunk) => val lastElementDateTime = new Timestamp(chunk.last.record.timestamp()).toLocalDateTime val year = lastElementDateTime.getYear val month = lastElementDateTime.getMonthValue val day = lastElementDateTime.getDayOfMonth val uuid = UUID.randomUUID() basePath.suffix(s"/$year/$month/$day/part-$uuid.parquet") } lazy val messageSink: Sink[Message, Future[Done]] = ParquetStreams.toParquetIndefinite( path = baseWritePath, maxChunkSize = MaxChunkSize, chunkWriteTimeWindow = ChunkWriteTimeWindow, buildChunkPath = chunkPath, preWriteTransformation = { message: Message => Data( timestamp = new Timestamp(message.record.timestamp()), word = message.record.value() ) }, postWriteSink = committerSink, options = writerOptions ) }
Example 35
Source File: WriteAndReadFilteredAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetStreams} import com.google.common.io.Files import scala.concurrent.Future import scala.util.Random object WriteAndReadFilteredAkkaApp extends App { object Dict { val A = "A" val B = "B" val C = "C" val D = "D" val values: List[String] = List(A, B, C, D) def random: String = values(Random.nextInt(values.length)) } case class Data(id: Int, dict: String) val count = 100 val data = (1 to count).map { i => Data(id = i, dict = Dict.random) } val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher val options = ParquetReader.Options() val printingSink = Sink.foreach(println) for { // write _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read filtered _ <- Future(println("""dict == "A"""")) _ <- ParquetStreams.fromParquet[Data](path, options = options, filter = Col("dict") === Dict.A).runWith(printingSink) _ <- Future(println("""id >= 20 && id < 40""")) _ <- ParquetStreams.fromParquet[Data](path, options = options, filter = Col("id") >= 20 && Col("id") < 40).runWith(printingSink) // finish _ <- system.terminate() } yield () }
Example 36
Source File: WriteAndReadCustomTypeAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.CustomType._ import com.github.mjakubowski84.parquet4s.ParquetStreams import com.google.common.io.Files object WriteAndReadCustomTypeAkkaApp extends App { object Data { def generate(count: Int): Iterator[Data] = Iterator.range(1, count).map { i => Data(id = i, dict = Dict.random) } } case class Data(id: Long, dict: Dict.Type) val data = () => Data.generate(count = 100) val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher for { // write _ <- Source.fromIterator(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" _ <- ParquetStreams.fromParquet[Data](path).runWith(Sink.foreach(println)) // finish _ <- system.terminate() } yield () }
Example 37
Source File: WriteAndReadAkkaApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.akka import akka.actor.ActorSystem import akka.stream.scaladsl.{Sink, Source} import akka.stream.{ActorMaterializer, Materializer} import com.github.mjakubowski84.parquet4s.ParquetStreams import com.google.common.io.Files import scala.util.Random object WriteAndReadAkkaApp extends App { case class Data(id: Int, text: String) val count = 100 val data = (1 to count).map { i => Data(id = i, text = Random.nextString(4)) } val path = Files.createTempDir().getAbsolutePath implicit val system: ActorSystem = ActorSystem() implicit val materializer: Materializer = ActorMaterializer() import system.dispatcher for { // write _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile(s"$path/data.parquet")) // read _ <- ParquetStreams.fromParquet[Data](path).runWith(Sink.foreach(println)) // finish _ <- system.terminate() } yield () }
Example 38
Source File: WriteAndReadGenericApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import java.time.{LocalDate, ZoneOffset} import java.util.TimeZone import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, RowParquetRecord, ValueCodecConfiguration} import com.google.common.io.Files import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64} import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} import org.apache.parquet.schema.{MessageType, OriginalType, Types} object WriteAndReadGenericApp extends App { val ID = "id" val Name = "name" val Birthday = "birthday" val Schema = "user_schema" val path = Files.createTempDir().getAbsolutePath val vcc = ValueCodecConfiguration(TimeZone.getTimeZone(ZoneOffset.UTC)) val users = List( (1L, "Alice", LocalDate.of(2000, 1, 1)), (2L, "Bob", LocalDate.of(1980, 2, 28)), (3L, "Cecilia", LocalDate.of(1977, 3, 15)) ).map { case (id, name, birthday) => RowParquetRecord.empty .add(ID, id, vcc) .add(Name, name, vcc) .add(Birthday, birthday, vcc) } // write implicit val schema: MessageType = Types.buildMessage() .addField(Types.primitive(INT64, REQUIRED).as(OriginalType.INT_64).named(ID)) .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named(Name)) .addField(Types.primitive(INT32, OPTIONAL).as(OriginalType.DATE).named(Birthday)) .named(Schema) ParquetWriter.writeAndClose(s"$path/users.parquet", users) //read val readData = ParquetReader.read[RowParquetRecord](path) try { readData.foreach { record => val id = record.get[Long](ID, vcc) val name = record.get[String](Name, vcc) val birthday = record.get[LocalDate](Birthday, vcc) println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]") } } finally readData.close() }
Example 39
Source File: WriteAndReadFilteredApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core

import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetWriter}
import com.google.common.io.Files

import scala.util.Random

object WriteAndReadFilteredApp extends App {

  object Dict {
    val A = "A"
    val B = "B"
    val C = "C"
    val D = "D"
    val values: List[String] = List(A, B, C, D)
    def random: String = values(Random.nextInt(values.length))
  }

  case class Data(id: Int, dict: String)

  val count = 100
  val data = (1 to count).map { i => Data(id = i, dict = Dict.random) }
  val path = Files.createTempDir().getAbsolutePath

  // write
  ParquetWriter.writeAndClose(s"$path/data.parquet", data)

  // read filtered
  println("""dict == "A"""")
  val dictIsOnlyA = ParquetReader.read[Data](path, filter = Col("dict") === Dict.A)
  try {
    dictIsOnlyA.foreach(println)
  } finally dictIsOnlyA.close()

  println("""id >= 20 && id < 40""")
  val idBetween20And40 = ParquetReader.read[Data](path, filter = Col("id") >= 20 && Col("id") < 40)
  try {
    idBetween20And40.foreach(println)
  } finally idBetween20And40.close()
}
Example 40
Source File: WriteAndReadCustomTypeApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import com.github.mjakubowski84.parquet4s.CustomType._ import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver._ import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter} import com.google.common.io.Files object WriteAndReadCustomTypeApp extends App { object Data { def generate(count: Int): Iterable[Data] = (1 to count).map { i => Data(id = i, dict = Dict.random) } } case class Data(id: Long, dict: Dict.Type) val data = Data.generate(count = 100) val path = Files.createTempDir().getAbsolutePath // write ParquetWriter.writeAndClose(s"$path/data.parquet", data) //read val readData = ParquetReader.read[Data](path) // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" try { readData.foreach(println) } finally readData.close() }
Example 41
Source File: WriteIncrementallyAndReadApp.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.core import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter} import com.google.common.io.Files import scala.util.Random object WriteIncrementallyAndReadApp extends App { case class Data(id: Int, text: String) val count = 100 val data = (1 to count).map { i => Data(id = i, text = Random.nextString(4)) } val path = Files.createTempDir().getAbsolutePath // write val writer = ParquetWriter.writer[Data](s"$path/data.parquet") try { data.foreach(entity => writer.write(entity)) } finally writer.close() //read val readData = ParquetReader.read[Data](path) try { readData.foreach(println) } finally readData.close() }
Example 42
Source File: config.scala From spark-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s.integrationtest import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files package object config { def getTestImageTag: String = { val imageTagFileProp = System.getProperty("spark.kubernetes.test.imageTagFile") require(imageTagFileProp != null, "Image tag file must be provided in system properties.") val imageTagFile = new File(imageTagFileProp) require(imageTagFile.isFile, s"No file found for image tag at ${imageTagFile.getAbsolutePath}.") Files.toString(imageTagFile, Charsets.UTF_8).trim } def getTestImageRepo: String = { val imageRepo = System.getProperty("spark.kubernetes.test.imageRepo") require(imageRepo != null, "Image repo must be provided in system properties.") imageRepo } }
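The counterpart to the write-oriented examples above: getTestImageTag reads a whole file into a String with Files.toString. A minimal round trip (the file name and tag value are made up):

import java.io.File
import com.google.common.base.Charsets
import com.google.common.io.Files

object ReadTagSketch extends App {
  val tagFile = new File(Files.createTempDir(), "imageTag.txt")
  Files.write("v2.4.0\n", tagFile, Charsets.UTF_8)
  val tag = Files.toString(tagFile, Charsets.UTF_8).trim   // trim drops the trailing newline
  println(tag)                                             // v2.4.0
}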
Example 43
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 44
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object CollabFilteringPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => ((r._1, r._2), 1)) .reduceByKey(_ + _) .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
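The only Guava call in the pipeline above is Files.getNameWithoutExtension, which operates purely on the path string and performs no I/O. A tiny sketch with an illustrative path:

import com.google.common.io.Files

object FileNameSketch extends App {
  val path = "/data/eeg/subject12.csv"
  println(Files.getNameWithoutExtension(path))   // subject12
  println(Files.getFileExtension(path))          // csv
}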
Example 45
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object FPMiningPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => (r._1, r._2)) .distinct() .groupByKey() .map(r => r._2.mkString(" ")) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
Example 46
Source File: KafkaTestBroker.scala From CMAK with Apache License 2.0 | 5 votes |
package kafka.test import java.io.File import java.util.Properties import com.google.common.io.Files import kafka.server.{KafkaConfig, KafkaServerStartable} import org.apache.curator.framework.CuratorFramework import org.apache.curator.test.InstanceSpec import scala.util.Try class KafkaTestBroker(zookeeper: CuratorFramework, zookeeperConnectionString: String) { val AdminPath = "/admin" val BrokersPath = "/brokers" val ClusterPath = "/cluster" val ConfigPath = "/config" val ControllerPath = "/controller" val ControllerEpochPath = "/controller_epoch" val IsrChangeNotificationPath = "/isr_change_notification" val LogDirEventNotificationPath = "/log_dir_event_notification" val KafkaAclPath = "/kafka-acl" val KafkaAclChangesPath = "/kafka-acl-changes" val ConsumersPath = "/consumers" val ClusterIdPath = s"$ClusterPath/id" val BrokerIdsPath = s"$BrokersPath/ids" val BrokerTopicsPath = s"$BrokersPath/topics" val ReassignPartitionsPath = s"$AdminPath/reassign_partitions" val DeleteTopicsPath = s"$AdminPath/delete_topics" val PreferredReplicaLeaderElectionPath = s"$AdminPath/preferred_replica_election" val BrokerSequenceIdPath = s"$BrokersPath/seqid" val ConfigChangesPath = s"$ConfigPath/changes" val ConfigUsersPath = s"$ConfigPath/users" val ConfigBrokersPath = s"$ConfigPath/brokers" val ProducerIdBlockPath = "/latest_producer_id_block" private[this] val port: Int = InstanceSpec.getRandomPort private[this] val config: KafkaConfig = buildKafkaConfig(zookeeperConnectionString) private[this] val kafkaServerStartable: KafkaServerStartable = new KafkaServerStartable(config) kafkaServerStartable.startup() //wait until broker shows up in zookeeper var count = 0 while(count < 10 && zookeeper.checkExists().forPath(BrokerIdsPath + "/0") == null) { count += 1 println("Waiting for broker ...") println(Option(zookeeper.getData.forPath(BrokerIdsPath + "/0")).map(kafka.manager.asString)) Thread.sleep(1000) } private def buildKafkaConfig(zookeeperConnectionString: String): KafkaConfig = { val p: Properties = new Properties p.setProperty("zookeeper.connect", zookeeperConnectionString) p.setProperty("broker.id", "0") p.setProperty("port", "" + port) p.setProperty("log.dirs", getLogDir) p.setProperty("log.retention.hours", "1") p.setProperty("offsets.topic.replication.factor", "1") p.setProperty("delete.topic.enable", "true") new KafkaConfig(p) } private def getLogDir: String = { val logDir: File = Files.createTempDir logDir.deleteOnExit() logDir.getAbsolutePath } def getBrokerConnectionString: String = s"localhost:$port" def getPort: Int = port def shutdown() { Try(kafkaServerStartable.shutdown()) } }
Example 47
Source File: FileUtils.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.util import java.io.{File, IOException} import java.nio.charset.Charset import com.google.common.io.Files object FileUtils { private val UTF8 = Charset.forName("UTF-8") def write(file: File, str: String): Unit = { Files.write(str, file, UTF8) } def read(file: File): String = { Files.asCharSource(file, UTF8).read() } def writeByteArrayToFile(file: File, bytes: Array[Byte]): Unit = { Files.write(bytes, file) } def readFileToByteArray(file: File): Array[Byte] = { Files.toByteArray(file) } def forceMkdir(directory: File): Unit = { if (directory.exists() && directory.isFile) { throw new IOException(s"Failed to create directory ${directory.toString}, it already exist") } Files.createParentDirs(directory) directory.mkdir() } }
Example 48
Source File: FileUtilsSpec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.util

import com.google.common.io.Files
import java.io.File
import java.util

import org.scalatest.FlatSpec

class FileUtilsSpec extends FlatSpec {
  val TXT =
    """
      |This is a multiple line
      |text
      | """.stripMargin

  it should "read/write string correctly" in {
    val file = File.createTempFile("fileutilspec", ".test")
    FileUtils.write(file, TXT)
    assert(FileUtils.read(file) == TXT)
    file.delete()
  }

  it should "read/write bytes array correctly" in {
    val file = File.createTempFile("fileutilspec", ".test")
    val bytes = TXT.toCharArray.map(_.toByte)
    FileUtils.writeByteArrayToFile(file, bytes)
    assert(util.Arrays.equals(bytes, FileUtils.readFileToByteArray(file)))
    file.delete()
  }

  it should "create directory and all parents" in {
    val temp = Files.createTempDir()
    val parent = new File(temp, "sub1")
    val child = new File(parent, "sub2" + File.separator)
    FileUtils.forceMkdir(child)
    assert(child.exists())
    assert(child.isDirectory)

    child.delete()
    parent.delete()
    temp.delete()
  }
}
Example 49
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 50
Source File: YarnShuffleIntegrationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 51
Source File: HistoryServerArgumentsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.io.File import java.nio.charset.StandardCharsets._ import com.google.common.io.Files import org.apache.spark._ import org.apache.spark.util.Utils class HistoryServerArgumentsSuite extends SparkFunSuite { private val logDir = new File("src/test/resources/spark-events") private val conf = new SparkConf() .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) .set("spark.history.fs.updateInterval", "1") .set("spark.testing", "true") test("No Arguments Parsing") { val argStrings = Array.empty[String] val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) assert(conf.get("spark.history.fs.updateInterval") === "1") assert(conf.get("spark.testing") === "true") } test("Directory Arguments Parsing --dir or -d") { val argStrings = Array("--dir", "src/test/resources/spark-events1") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events1") } test("Directory Param can also be set directly") { val argStrings = Array("src/test/resources/spark-events2") val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === "src/test/resources/spark-events2") } test("Properties File Arguments Parsing --properties-file") { val tmpDir = Utils.createTempDir() val outFile = File.createTempFile("test-load-spark-properties", "test", tmpDir) try { Files.write("spark.test.CustomPropertyA blah\n" + "spark.test.CustomPropertyB notblah\n", outFile, UTF_8) val argStrings = Array("--properties-file", outFile.getAbsolutePath) val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.test.CustomPropertyA") === "blah") assert(conf.get("spark.test.CustomPropertyB") === "notblah") } finally { Utils.deleteRecursively(tmpDir) } } }
Example 52
Source File: HttpFileServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import com.google.common.io.Files import org.apache.spark.util.Utils private[spark] class HttpFileServer( conf: SparkConf, securityManager: SecurityManager, requestedPort: Int = 0) extends Logging { var baseDir : File = null var fileDir : File = null var jarDir : File = null var httpServer : HttpServer = null var serverUri : String = null def initialize() { baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd") fileDir = new File(baseDir, "files") jarDir = new File(baseDir, "jars") fileDir.mkdir() jarDir.mkdir() logInfo("HTTP File server directory is " + baseDir) httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server") httpServer.start() serverUri = httpServer.uri logDebug("HTTP file server started at: " + serverUri) } def stop() { httpServer.stop() } def addFile(file: File) : String = { addFileToDir(file, fileDir) serverUri + "/files/" + file.getName } def addJar(file: File) : String = { addFileToDir(file, jarDir) serverUri + "/jars/" + file.getName } def addFileToDir(file: File, dir: File) : String = { // Check whether the file is a directory. If it is, throw a more meaningful exception. // If we don't catch this, Guava throws a very confusing error message: // java.io.FileNotFoundException: [file] (No such file or directory) // even though the directory ([file]) exists. if (file.isDirectory) { throw new IllegalArgumentException(s"$file cannot be a directory.") } Files.copy(file, new File(dir, file.getName)) dir + "/" + file.getName } }
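addFileToDir above relies on Files.copy(File, File). A self-contained sketch of that copy (file names are placeholders):

import java.io.File
import java.nio.charset.StandardCharsets
import com.google.common.io.Files

object CopySketch extends App {
  val srcDir = Files.createTempDir()
  val dstDir = Files.createTempDir()
  val src = new File(srcDir, "app.jar")
  Files.write("not really a jar", src, StandardCharsets.UTF_8)
  Files.copy(src, new File(dstDir, src.getName))   // overwrites the destination if it already exists
  println(new File(dstDir, "app.jar").exists())    // true
}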
Example 53
Source File: ExampleData.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.examples.util

import java.io.{File, FileOutputStream}

import com.google.common.io.{ByteStreams, Files}

import scala.util.control.NonFatal

object ExampleData {
  lazy val path: String = {
    try {
      val resource = "data.tsv"
      val tmpfile = new File(Files.createTempDir(), resource).getAbsolutePath
      val input = getClass.getResourceAsStream(resource)
      val output = new FileOutputStream(tmpfile)
      ByteStreams.copy(input, output)
      input.close()
      output.close()
      tmpfile
    } catch {
      case NonFatal(e) => throw new RuntimeException("Could not copy example data file to temp directory", e)
    }
  }
}
Example 54
Source File: MultipartFileTest.scala From fintrospect with Apache License 2.0 | 5 votes |
package io.fintrospect.parameters import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import com.google.common.io.Files import com.twitter.io.{Buf, Bufs} import org.scalatest.{FunSpec, Matchers} class MultipartFileTest extends FunSpec with Matchers { describe("OnDiskMultiPartFile") { it("converts toFileElement") { val tempFile = File.createTempFile("temp", "file") Files.write("hello bob", tempFile, UTF_8) tempFile.deleteOnExit() Bufs.asUtf8String(OnDiskMultiPartFile("file", tempFile, None).toFileElement("hello").content) shouldBe "hello bob" } } describe("InMemoryMultiPartFile") { it("converts toFileElement") { Bufs.asUtf8String(InMemoryMultiPartFile("file", Buf.Utf8("hello bob"), None).toFileElement("hello").content) shouldBe "hello bob" } } }
Example 55
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 56
Source File: YarnShuffleIntegrationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }