org.apache.spark.util.LongAccumulator Scala Examples
The following examples show how to use org.apache.spark.util.LongAccumulator.
Each example is taken from an open-source project; the source file and license are noted above the code.
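Before the project examples, here is a minimal quick-start sketch of the typical workflow: create the accumulator from the SparkContext, add to it inside an action, and read the merged value back on the driver. The object name, application name and input data below are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LongAccumulator

object LongAccumulatorQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("LongAccumulatorQuickStart").getOrCreate()
    val sc = spark.sparkContext

    // Named accumulators show up in the Spark UI under their name.
    val evenCount: LongAccumulator = sc.longAccumulator("evenCount")

    // Update accumulators inside actions; updates made in transformations
    // may be applied more than once if a task is retried.
    sc.parallelize(1 to 100).foreach { n =>
      if (n % 2 == 0) evenCount.add(1)
    }

    // The merged value is only reliable on the driver.
    println(s"even numbers seen: ${evenCount.value}") // 50

    spark.stop()
  }
}

Named accumulators also appear on the Stages tab of the Spark UI, which makes them handy for lightweight job-level counters.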
Example 1
Source File: package.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}

// Excerpt from the debug package: the enclosing physical-operator case class
// (which supplies `child`, `output`, `sparkContext` and `debugPrint`) and the
// `SetAccumulator` helper class are elided in this listing.

  case class ColumnMetrics() {
    val elementTypes = new SetAccumulator[String]
    sparkContext.register(elementTypes)
  }

  val tupleCount: LongAccumulator = sparkContext.longAccumulator

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    debugPrint(s"== ${child.simpleString} ==")
    debugPrint(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
      // `asScala` which accesses the internal values using `java.util.Iterator`.
      val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
      debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext

        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount.add(1)
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes.add(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def inputRDDs(): Seq[RDD[InternalRow]] = {
    child.asInstanceOf[CodegenSupport].inputRDDs()
  }

  override def doProduce(ctx: CodegenContext): String = {
    child.asInstanceOf[CodegenSupport].produce(ctx, this)
  }

  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
    consume(ctx, input)
  }
}
}
Example 2
Source File: ShuffleWriteMetrics.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator

@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  // Each metric is backed by a LongAccumulator; the public getters expose the merged sum.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten

  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime

  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten
}
Example 3
Source File: SparkCore_5_Accumulator.scala From HadoopLearning with MIT License
package com.liumm.transform

import org.apache.spark.util.{CollectionAccumulator, DoubleAccumulator, LongAccumulator}
import org.apache.spark.{SparkConf, SparkContext}

object SparkCore_5_Accumulator {

  case class Info(var success: Boolean, var msg: String, count: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkCore_5_Accumulator").setMaster("local")
    val sc = new SparkContext(conf)

    val r = sc.parallelize(1 to 100)

    println("\n----------------------- Summing with a plain variable ------------------------")
    // Updating a driver-side var inside foreach only appears to work in local mode;
    // on a cluster each executor updates its own copy of the closure. This is why accumulators exist.
    var total = 0
    r.foreach(num => {
      total += num
    })
    println(total)

    println("\n----------------------- Registering a LongAccumulator -------------------")
    // The built-in accumulators only support Long, Double and Collection values.
    val rdd = sc.parallelize(1 to 100, 4)
    // 1. Explicit registration
    val longAccumulator = new LongAccumulator
    sc.register(longAccumulator)
    rdd.foreach(num => {
      longAccumulator.add(num)
    })
    println(longAccumulator.value)

    println("\n----------------------- Using a LongAccumulator directly -------------------")
    // 2. Obtained from the SparkContext, so no explicit registration is needed
    val longAccumulator1 = sc.longAccumulator
    rdd.foreach(num => {
      longAccumulator1.add(num)
    })
    println(longAccumulator1.value)

    println("\n----------------------- Using a DoubleAccumulator -------------------")
    val rdd1 = sc.parallelize(List(1.1, 2.2, 3.3, 4.3))
    // 1. Explicit registration
    val doubleAccumulator = new DoubleAccumulator
    sc.register(doubleAccumulator)
    rdd1.foreach(num => {
      doubleAccumulator.add(num)
    })
    println(doubleAccumulator.value)
    // 2. Direct use
    val doubleAccumulator1 = sc.doubleAccumulator
    rdd1.foreach(num => {
      doubleAccumulator1.add(num)
    })
    println(doubleAccumulator1.value)

    println("\n----------------------- Using a CollectionAccumulator -------------------")
    val rdd2 = sc.parallelize(1 to 100)
    // 1. Explicit registration
    val collectionAccumulator = new CollectionAccumulator[Int]
    sc.register(collectionAccumulator)
    rdd2.foreach(num => {
      collectionAccumulator.add(num)
    })
    println(collectionAccumulator.value)
    // 2. Direct use
    val collectionAccumulator1 = sc.collectionAccumulator[Int]
    rdd2.foreach(num => {
      collectionAccumulator1.add(num)
    })
    println(collectionAccumulator1.value)

    println("\n----------------------- Using a custom accumulator -------------------")
    val customAccumulator = new CustomAccumulator
    sc.register(customAccumulator)
    rdd.foreach(num => {
      customAccumulator.add(num.toString)
    })
    println(customAccumulator.value)
  }
}
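The CustomAccumulator used in the last block is not part of this listing. A minimal sketch of what such an AccumulatorV2[String, String] could look like is shown below; the class name mirrors the call site above, but the string-concatenation semantics are an assumption rather than the project's actual implementation.

import org.apache.spark.util.AccumulatorV2

// Hypothetical custom accumulator: concatenates the strings added on the executors
// into one comma-separated string. Only the AccumulatorV2 contract (isZero, copy,
// reset, add, merge, value) is taken from Spark; the behaviour is illustrative.
class CustomAccumulator extends AccumulatorV2[String, String] {
  private var result: String = ""

  override def isZero: Boolean = result.isEmpty

  override def copy(): CustomAccumulator = {
    val acc = new CustomAccumulator
    acc.result = result
    acc
  }

  override def reset(): Unit = result = ""

  override def add(v: String): Unit =
    result = if (result.isEmpty) v else s"$result,$v"

  override def merge(other: AccumulatorV2[String, String]): Unit =
    result = if (result.isEmpty) other.value else s"$result,${other.value}"

  override def value: String = result
}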
Example 4
Source File: Accumulators.scala From spark-distcp with Apache License 2.0
package com.coxautodata.objects

import com.coxautodata.utils.FileUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LongAccumulator

import scala.collection.JavaConverters._

class Accumulators(sparkSession: SparkSession) extends Serializable {

  def handleResult(result: DistCPResult): Unit = result match {
    case DeleteResult(_, DeleteActionResult.SkippedDoesNotExists | DeleteActionResult.SkippedDryRun) =>
      deleteOperationsSkipped.add(1)
    case DeleteResult(_, DeleteActionResult.Deleted) =>
      deleteOperationsSuccessful.add(1)
    case DeleteResult(_, DeleteActionResult.Failed(e)) =>
      deleteOperationsSkipped.add(1)
      deleteOperationsFailed.add(1)
      exceptionCount.add(e)
    case DirectoryCopyResult(_, _, CopyActionResult.SkippedAlreadyExists | CopyActionResult.SkippedDryRun) =>
      foldersSkipped.add(1)
    case DirectoryCopyResult(_, _, CopyActionResult.Created) =>
      foldersCreated.add(1)
    case DirectoryCopyResult(_, _, CopyActionResult.Failed(e)) =>
      foldersFailed.add(1)
      foldersSkipped.add(1)
      exceptionCount.add(e)
    case FileCopyResult(_, _, l, CopyActionResult.SkippedAlreadyExists | CopyActionResult.SkippedIdenticalFileAlreadyExists | CopyActionResult.SkippedDryRun) =>
      filesSkipped.add(1)
      bytesSkipped.add(l)
    case FileCopyResult(_, _, l, CopyActionResult.Copied) =>
      filesCopied.add(1)
      bytesCopied.add(l)
    case FileCopyResult(_, _, l, CopyActionResult.OverwrittenOrUpdated) =>
      filesCopied.add(1)
      bytesCopied.add(l)
      filesUpdatedOrOverwritten.add(1)
    case FileCopyResult(_, _, l, CopyActionResult.Failed(e)) =>
      filesFailed.add(1)
      exceptionCount.add(e)
      filesSkipped.add(1)
      bytesSkipped.add(l)
  }

  def getOutputText: String = {
    val intFormatter = java.text.NumberFormat.getIntegerInstance
    s"""--Raw data--
       |Data copied: ${FileUtils.byteCountToDisplaySize(bytesCopied.value)}
       |Data skipped (already existing files, dry-run and failures): ${FileUtils.byteCountToDisplaySize(bytesSkipped.value)}
       |--Files--
       |Files copied (new files and overwritten/updated files): ${intFormatter.format(filesCopied.value)}
       |Files overwritten/updated: ${intFormatter.format(filesUpdatedOrOverwritten.value)}
       |Skipped files for copying (already existing files, dry-run and failures): ${intFormatter.format(filesSkipped.value)}
       |Failed files during copy: ${intFormatter.format(filesFailed.value)}
       |--Folders--
       |Folders created: ${intFormatter.format(foldersCreated.value)}
       |Skipped folder creates (already existing folders, dry-run and failures): ${intFormatter.format(foldersSkipped.value)}
       |Failed folder creates: ${intFormatter.format(foldersFailed.value)}
       |--Deletes--
       |Successful delete operations: ${intFormatter.format(deleteOperationsSuccessful.value)}
       |Skipped delete operations (files/folders already missing, dry-run and failures): ${intFormatter.format(deleteOperationsSkipped.value)}
       |Failed delete operations: ${intFormatter.format(deleteOperationsFailed.value)}
       |--Exception counts--
       |""".stripMargin ++
      exceptionCount.value.asScala.toSeq
        .sortWith { case ((_, v1), (_, v2)) => v1 > v2 }
        .map { case (k, v) => s"$k: ${intFormatter.format(v)}" }
        .mkString("\n")
  }

  val bytesCopied: LongAccumulator = sparkSession.sparkContext.longAccumulator("BytesCopied")
  val bytesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("BytesSkipped") // Already exists, dryrun and failure
  val foldersCreated: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersCreated")
  val foldersSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersSkipped")
  val foldersFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersFailed")
  val filesCopied: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesCopied")
  val filesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesSkipped") // Already exists, dryrun and failure
  val filesFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesFailed")
  val filesUpdatedOrOverwritten: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesUpdatedOrOverwritten")
  val deleteOperationsSuccessful: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsSuccessful")
  val deleteOperationsSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsSkipped") // Already exists, dryrun and failure
  val deleteOperationsFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsFailed")

  val exceptionCount: ExceptionCountAccumulator = new ExceptionCountAccumulator
  sparkSession.sparkContext.register(exceptionCount, "ExceptionCount")
}
Example 5
Source File: ShuffleWriteMetrics.scala From sparkoscope with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator

@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  // Each metric is backed by a LongAccumulator; the public getters expose the merged sum.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten

  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime

  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten
}
Example 6
Source File: SparkUtil.scala From Adenium with Apache License 2.0
package com.adenium.externals.spark

import com.adenium.utils.Logger
import com.adenium.utils.May.maybe
import com.adenium.utils.May.maybeInfo
import org.apache.spark.util.LongAccumulator

object SparkUtil {

  case class Accumulate(acs: Seq[LongAccumulator]) {

    def logStr: Seq[String] = {
      acs map { ac => s"${ac.name.getOrElse(ac.id.toString)} = ${ac.value}" }
    }

    def log(): Unit = {
      acs foreach { ac => Logger.logInfo(ac.name.getOrElse(ac.id.toString) + " = " + ac.value) }
    }

    def add(is: Seq[Int]): Unit = {
      acs.zip(is).foreach { case (a, i) => maybe { a add i } } // swallow exception
    }

    def add(nls: Iterator[Seq[Int]]): Unit = { // ex: Iterator[Seq(ls, ncnt, bcnt, fcnt)]
      nls foreach { is => this.add(is) }
    }

    def add2sum(is: Seq[Int], f: Seq[Int] => Int = _.sum): Unit = {
      acs.zip(f(is) +: is).foreach { case (a, i) => maybeInfo(a add i)("add2sum failed") }
    }
  }
}
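Accumulate zips a fixed, ordered sequence of counters with per-task count vectors. A hypothetical usage sketch follows; the accumulator names, the count layout and the input data are illustrative and not taken from the Adenium project.

import org.apache.spark.sql.SparkSession
import com.adenium.externals.spark.SparkUtil.Accumulate

object AccumulateUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("AccumulateUsageSketch").getOrCreate()
    val sc = spark.sparkContext

    // Order matters: position 0 = parsed lines, 1 = normalized, 2 = broken, 3 = filtered.
    val counters = Accumulate(Seq(
      sc.longAccumulator("parsed"),
      sc.longAccumulator("normalized"),
      sc.longAccumulator("broken"),
      sc.longAccumulator("filtered")))

    // Each task reports a Seq[Int] aligned with the accumulator order.
    sc.parallelize(Seq("a", "b", "c")).foreachPartition { lines =>
      val parsed = lines.size
      counters.add(Seq(parsed, parsed, 0, 0))
    }

    counters.log() // writes "<name> = <value>" lines through the project's Logger
    spark.stop()
  }
}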
Example 7
Source File: ShuffleWriteMetrics.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator

@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  // Each metric is backed by a LongAccumulator; the public getters expose the merged sum.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten

  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime

  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten
}
Example 8
Source File: ShuffleWriteMetrics.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator

@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  // Each metric is backed by a LongAccumulator; the public getters expose the merged sum.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten

  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime

  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten
}
Example 9
Source File: 7_RecoverableNetworkWordCount.scala From wow-spark with MIT License
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {
    StreamingLogger.setLoggerLevel()

    // Use at least two local threads: one for the socket receiver, one for processing.
    val conf = new SparkConf().setMaster("local[2]").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)
    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      // Lazily created broadcast variable and accumulator, shared across batches.
      val blackList = WordBlackList.getInstance(context.sparkContext)
      val accumulator = DropWordCounter.getInstance(context.sparkContext)
      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })

    // Start the streaming computation and block until it is terminated.
    context.start()
    context.awaitTermination()
  }
}

object WordBlackList {

  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }
}

object DropWordCounter {

  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
}
Example 10
Source File: GdeltTagger.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.tagging.gdelt

import java.text.SimpleDateFormat
import java.util.Date

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.classifier.Classifier
import io.gzet.tagging.html.HtmlHandler
import io.gzet.tagging.html.HtmlHandler.Content
import org.apache.spark.Accumulator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator
import org.elasticsearch.spark._

class GdeltTagger() extends Serializable {

  val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  val isoSdf = "yyyy-MM-dd HH:mm:ss"
  val esIndex = config.getString("gdeltIndex")
  val vectorSize = config.getInt("vectorSize")
  val minProba = config.getDouble("minProba")

  def predict(gdeltStream: DStream[String], batchId: LongAccumulator) = {

    // Extract HTML content
    val gdeltContent = fetchHtmlContent(gdeltStream)

    // Predict each RDD
    gdeltContent foreachRDD { batch =>

      batch.cache()
      val count = batch.count()

      if (count > 0) {

        if (Classifier.model.isDefined) {
          val labels = Classifier.model.get.labels

          // Predict HashTags using latest Twitter model
          val textRdd = batch.map(_.body.get)
          val predictions = Classifier.predictProbabilities(textRdd)
          val taggedGdelt = batch.zip(predictions) map { case (content, probabilities) =>
            val validLabels = probabilities filter { case (label, probability) =>
              probability > minProba
            }

            val labels = validLabels.toSeq
              .sortBy(_._2)
              .reverse
              .map(_._1)

            (content, labels)
          }

          // Saving articles to Elasticsearch
          taggedGdelt map { case (content, hashTags) =>
            gdeltToJson(content, hashTags.toArray)
          } saveToEs esIndex

        } else {

          // Saving articles to Elasticsearch
          batch map { content =>
            gdeltToJson(content, Array())
          } saveToEs esIndex
        }
      }

      batch.unpersist(blocking = false)
    }
  }

  private def gdeltToJson(content: Content, hashTags: Array[String]) = {
    val sdf = new SimpleDateFormat(isoSdf)
    Map(
      "time" -> sdf.format(new Date()),
      "body" -> content.body.get,
      "url" -> content.url,
      "tags" -> hashTags,
      "title" -> content.title
    )
  }

  private def fetchHtmlContent(urlStream: DStream[String]) = {
    urlStream.map(_ -> 1).groupByKey().map(_._1) mapPartitions { urls =>
      val sdf = new SimpleDateFormat(isoSdf)
      val htmlHandler = new HtmlHandler()
      val goose = htmlHandler.getGooseScraper
      urls map { url =>
        htmlHandler.fetchUrl(goose, url, sdf)
      }
    } filter { content =>
      content.isDefined &&
        content.get.body.isDefined &&
        content.get.body.get.length > 255
    } map { content =>
      content.get
    }
  }
}