org.apache.spark.SparkFiles Scala Examples
The following examples show how to use org.apache.spark.SparkFiles.
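Before the full examples, here is the core pattern they all share: register a file on the driver with SparkContext.addFile, then resolve the node-local copy inside tasks with SparkFiles.get. A minimal sketch, assuming a hypothetical /tmp/lookup.txt on the driver's local disk:

import org.apache.spark.SparkFiles
import org.apache.spark.sql.SparkSession

import scala.io.Source

object SparkFilesMinimal {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark-files-minimal").getOrCreate()
    val sc = spark.sparkContext

    // Register the file once on the driver; Spark ships it to every node.
    sc.addFile("/tmp/lookup.txt") // hypothetical path

    // Inside a task, SparkFiles.get resolves that node's local copy by file name.
    val lineCounts = sc.parallelize(1 to 4).map { _ =>
      val localPath = SparkFiles.get("lookup.txt")
      Source.fromFile(localPath).getLines().size
    }.collect()

    println(lineCounts.mkString(", "))
    spark.stop()
  }
}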
Example 1
Source File: GrokHelper.scala, from incubator-s2graph (Apache License 2.0)
package org.apache.s2graph.s2jobs.utils

import io.thekraken.grok.api.Grok
import org.apache.s2graph.s2jobs.Logger
import org.apache.spark.SparkFiles
import org.apache.spark.sql.Row

import scala.collection.mutable

object GrokHelper extends Logger {
  private val grokPool: mutable.Map[String, Grok] = mutable.Map.empty

  def getGrok(name: String, patternFiles: Seq[String], patterns: Map[String, String], compilePattern: String): Grok = {
    if (grokPool.get(name).isEmpty) {
      println(s"Grok '$name' initialized..")
      val grok = new Grok()

      patternFiles.foreach { patternFile =>
        val filePath = SparkFiles.get(patternFile)
        println(s"[Grok][$name] add pattern file : $patternFile ($filePath)")
        grok.addPatternFromFile(filePath)
      }
      patterns.foreach { case (name, pattern) =>
        println(s"[Grok][$name] add pattern : $name ($pattern)")
        grok.addPattern(name, pattern)
      }

      grok.compile(compilePattern)
      println(s"[Grok][$name] patterns: ${grok.getPatterns}")
      grokPool.put(name, grok)
    }

    grokPool(name)
  }

  def grokMatch(text: String)(implicit grok: Grok): Option[Map[String, String]] = {
    import scala.collection.JavaConverters._

    val m = grok.`match`(text)
    m.captures()

    val rstMap = m.toMap.asScala.toMap
      .filter(_._2 != null)
      .map { case (k, v) => k -> v.toString }
    if (rstMap.isEmpty) None else Some(rstMap)
  }

  def grokMatchWithSchema(text: String)(implicit grok: Grok, keys: Array[String]): Option[Row] = {
    import scala.collection.JavaConverters._

    val m = grok.`match`(text)
    m.captures()

    val rstMap = m.toMap.asScala.toMap
    if (rstMap.isEmpty) None
    else {
      val l = keys.map { key => rstMap.getOrElse(key, null) }
      Some(Row.fromSeq(l))
    }
  }
}
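A possible way to use the helper above (a sketch, not taken from the original project): register the pattern file on the driver, then build the Grok instance lazily inside each partition, where getGrok resolves the shipped file via SparkFiles.get. The pattern file name, input path and %{COMBINEDAPACHELOG} expression are illustrative assumptions.

import io.thekraken.grok.api.Grok
import org.apache.s2graph.s2jobs.utils.GrokHelper
import org.apache.spark.sql.SparkSession

object GrokHelperUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("grok-helper-usage").getOrCreate()

    // Hypothetical pattern file on the driver; addFile ships it to every executor.
    spark.sparkContext.addFile("patterns/grok-patterns")

    val parsed = spark.sparkContext.textFile("access.log") // assumed input path
      .mapPartitions { lines =>
        // getGrok resolves "grok-patterns" through SparkFiles.get on this executor.
        implicit val grok: Grok = GrokHelper.getGrok(
          name = "access-log",
          patternFiles = Seq("grok-patterns"),
          patterns = Map.empty,
          compilePattern = "%{COMBINEDAPACHELOG}") // assumed Grok expression
        lines.flatMap(line => GrokHelper.grokMatch(line))
      }

    parsed.take(5).foreach(println)
    spark.stop()
  }
}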
Example 2
Source File: L8-38SparkR.scala, from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import java.nio.file.Paths
import org.apache.spark.SparkFiles

object CdrStreamingSparkRApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float,
    smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)
    import hiveC.implicits._

    ssc.sparkContext.addFile(rScriptPath)
    val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString)
    val master = hiveC.sparkContext.getConf.get("spark.master")

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD((rdd, time) => {
        val iTableName = tableName + time.milliseconds
        seqToCdr(rdd).toDF().write.saveAsTable(iTableName)
        hiveC.sparkContext.parallelize(Array(iTableName))
          .pipe("%s %s".format(rScriptName, master))
          .saveAsTextFile(Paths.get(logsPath, iTableName).toString)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 3
Source File: SparkExample.scala, from Hands-On-Data-Analysis-with-Scala (MIT License)
package handson.example.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkFiles
import vegas._
import vegas.sparkExt._

object SparkExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }

  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()

    spark.sparkContext.addFile("https://data.lacity.org/api/views/nxs9-385f/rows.csv")
    val df = spark.read.option("header", true).option("inferSchema", true).csv(SparkFiles.get("rows.csv"))
    df.printSchema()
    df.show()

    println("Covariance: " + df.stat.cov("Total Population", "Total Households"))
    println("Correlation: " + df.stat.corr("Total Population", "Total Households"))

    df.createOrReplaceTempView("tmp_data")
    val dfWithTier = spark.sql("select *, ntile(100) over(order by `Total Population`) tier from tmp_data")
    val dfTier90Plus = dfWithTier.where("tier >= 90")

    val plot = Vegas().withDataFrame(dfTier90Plus).
      encodeX("Zip Code", Nom).
      encodeY("Total Population", Quant).
      mark(Bar)
    plot.show

    spark.stop()
  }
}
Example 4
Source File: LoadsContrib.scala, from spark-nlp (Apache License 2.0)
package com.johnsnowlabs.nlp.annotators.ner.dl

import java.io.{BufferedOutputStream, File, FileOutputStream}
import java.nio.file.Paths

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.commons.lang.SystemUtils
import org.apache.spark.SparkFiles
import org.apache.spark.sql.SparkSession
import org.tensorflow.TensorFlow

object LoadsContrib {
  @transient var loadedToCluster = false
  @transient var loadedToTensorflow = false

  private lazy val lib1 = "_sparse_feature_cross_op.so"
  private lazy val lib2 = "_lstm_ops.so"

  private def resourcePath(os: String, lib: String) = "ner-dl/" + os + "/" + lib

  // NOTE: this excerpt is truncated; helpers such as contribPaths, getFileName and
  // copyResourceToTmp are defined elsewhere in the full source. The block below is the
  // driver-side method that ships the native libraries to the cluster.
  def loadContribToCluster(spark: SparkSession): Unit = {
    if (!LoadsContrib.loadedToCluster && contribPaths.isDefined) {
      LoadsContrib.loadedToCluster = true
      spark.sparkContext.addFile(copyResourceToTmp(contribPaths.get._1).getPath)
      spark.sparkContext.addFile(copyResourceToTmp(contribPaths.get._2).getPath)
    }
  }

  def loadContribToTensorflow(): Unit = {
    if (!LoadsContrib.loadedToTensorflow && contribPaths.isDefined) {
      LoadsContrib.loadedToTensorflow = true
      val fp1 = SparkFiles.get(getFileName(contribPaths.get._1))
      val fp2 = SparkFiles.get(getFileName(contribPaths.get._2))
      if (new File(fp1).exists() && new File(fp2).exists()) {
        TensorFlow.loadLibrary(fp1)
        TensorFlow.loadLibrary(fp2)
      }
    }
  }
}
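The same SparkFiles idiom works for any native library, not only the TensorFlow contrib ops: ship the .so with addFile and load the node-local copy inside each executor JVM. A generic sketch under assumed names (a hypothetical libnative.so, loaded with plain System.load rather than TensorFlow.loadLibrary):

import java.io.File

import org.apache.spark.SparkFiles
import org.apache.spark.sql.SparkSession

object NativeLibOnExecutors {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("native-lib-on-executors").getOrCreate()

    // Driver side: distribute the shared library to every node.
    spark.sparkContext.addFile("/opt/libs/libnative.so") // hypothetical path

    spark.sparkContext.parallelize(1 to spark.sparkContext.defaultParallelism).foreachPartition { _ =>
      // Executor side: resolve this node's copy and load it; repeated loads of the
      // same library within one JVM classloader are ignored.
      val localPath = SparkFiles.get("libnative.so")
      if (new File(localPath).exists()) {
        System.load(localPath) // absolute path, hence System.load rather than System.loadLibrary
      }
    }

    spark.stop()
  }
}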
Example 5
Source File: StorageHelper.scala, from spark-nlp (Apache License 2.0)
package com.johnsnowlabs.storage

import java.io.File

import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.sql.SparkSession

object StorageHelper {

  def resolveStorageName(database: String, storageRef: String): String =
    new Path(database + "_" + storageRef).toString

  def load(
    storageSourcePath: String,
    spark: SparkSession,
    database: String,
    storageRef: String,
    withinStorage: Boolean
  ): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef)
    val src = StorageLocator.getStorageSerializedPath(
      storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage)

    val locator = StorageLocator(database, storageRef, spark)

    sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

  def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = {
    val indexUri = "file://" + (new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path + { if (withinStorage) "/storage/" else "" })

    save(fs, index, dst)
  }

  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst)) fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

  def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String,
                    sparkContext: SparkContext): Unit = {
    if (destinationScheme == "file") {
      copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext)
    } else {
      copyIndexToCluster(source, clusterFilePath, sparkContext)
    }
  }

  private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration)
      val dstFS = dst.getFileSystem(spark.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration)
      }

      spark.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

  private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = {
    val fs = source.getFileSystem(context.hadoopConfiguration)
    if (!fs.exists(destination))
      fs.copyFromLocalFile(false, true, source, destination)
  }
}