scala.collection.parallel.ForkJoinTaskSupport Scala Examples
The following examples show how to use scala.collection.parallel.ForkJoinTaskSupport.
Each example is taken from an open-source project; the header above each listing names the source file, the project it comes from, and its license.
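All of the examples below follow the same core pattern: convert a sequential collection to a parallel one with .par, then replace its default tasksupport with a ForkJoinTaskSupport wrapping an explicitly sized pool, so the degree of parallelism is under the caller's control instead of defaulting to the global pool. Here is a minimal, self-contained sketch of that pattern (my own illustration, assuming Scala 2.11/2.12, where parallel collections and scala.concurrent.forkjoin.ForkJoinPool ship with the standard library; on 2.13+ the parallel collections live in the separate scala-parallel-collections module and java.util.concurrent.ForkJoinPool is used instead):

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

object ForkJoinTaskSupportSketch {
  def main(args: Array[String]): Unit = {
    // A dedicated pool with an explicit parallelism level, instead of the
    // default pool shared by every parallel collection in the JVM.
    val pool = new ForkJoinPool(4)

    val numbers = (1 to 16).par
    numbers.tasksupport = new ForkJoinTaskSupport(pool)

    // The mapping function now runs on at most 4 worker threads from `pool`.
    val squares = numbers.map(i => i * i)
    println(squares.seq.sorted)

    // Shut the pool down when finished with it so its threads can exit.
    pool.shutdown()
  }
}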
Example 1
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
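A point worth noting in this example: UnionRDD does not build a new pool per operation. It keeps one lazily created ForkJoinTaskSupport in a companion object and only switches to parallel partition listing once the number of parent RDDs exceeds the configurable spark.rdd.parallelListingThreshold, so small unions skip the thread-pool overhead entirely. A stripped-down, hypothetical sketch of that same "shared lazy task support plus threshold" idea outside of Spark (the names SharedTaskSupport, Listing and parallelThreshold are mine, not Spark's):

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

object SharedTaskSupport {
  // Created on first use and then reused by every caller, so repeated
  // operations do not each spin up (and leak) a pool of their own.
  lazy val instance = new ForkJoinTaskSupport(new ForkJoinPool(8))
}

object Listing {
  // Analogous to spark.rdd.parallelListingThreshold in the example above.
  val parallelThreshold = 10

  def partitionCounts(parents: Seq[Seq[Int]]): Seq[Int] =
    if (parents.length > parallelThreshold) {
      val par = parents.par
      par.tasksupport = SharedTaskSupport.instance
      par.map(_.length).seq
    } else {
      parents.map(_.length)
    }
}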
Example 2
Source File: COCOSeqFileGenerator.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key
          .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile
          .CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
}
Example 3
Source File: SparkLaunch.scala From spark-bench with Apache License 2.0
package com.ibm.sparktc.sparkbench.sparklaunch

import com.typesafe.config._
import java.io.File

import com.ibm.sparktc.sparkbench.sparklaunch.confparse.{ConfigWrangler, SparkJobConf}
import com.ibm.sparktc.sparkbench.sparklaunch.submission.livy.LivySubmit
import com.ibm.sparktc.sparkbench.sparklaunch.submission.sparksubmit.SparkSubmit

import scala.collection.parallel.ForkJoinTaskSupport
import scala.collection.JavaConverters._
import scala.util.Try

object SparkLaunch extends App {

  override def main(args: Array[String]): Unit = {
    assert(args.nonEmpty)
    val path = args.head
    val (confSeq: Seq[SparkJobConf], parallel: Boolean) = mkConfs(new File(path))
    launchJobs(confSeq, parallel)
  }

  def mkConfs(file: File): (Seq[SparkJobConf], Boolean) = {
    val config: Config = ConfigFactory.parseFile(file)
    val sparkBenchConfig = config.getObject("spark-bench").toConfig
    val confs: Seq[SparkJobConf] = ConfigWrangler(file)
    val parallel = Try(sparkBenchConfig.getBoolean("spark-submit-parallel")).getOrElse(false)
    (confs, parallel)
  }

  private def getConfigListByName(name: String, config: Config): List[Config] = {
    val workloadObjs: Iterable[ConfigObject] = config.getObjectList(name).asScala
    workloadObjs.map(_.toConfig).toList
  }

  def launchJobs(confSeq: Seq[SparkJobConf], parallel: Boolean): Unit = {
    def launch(conf: SparkJobConf): Unit = conf.submissionParams match {
      case s if ConfigWrangler.isLivySubmit(s) => LivySubmit().launch(conf)
      case s if ConfigWrangler.isSparkSubmit(s) => SparkSubmit.launch(conf)
    }

    if (parallel) {
      val confSeqPar = confSeq.par
      confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
      confSeqPar.foreach(launch)
    } else confSeq.foreach(launch)
  }
}
Example 4
Source File: SuiteKickoff.scala From spark-bench with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import com.ibm.sparktc.sparkbench.utils.SparkFuncs._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

import scala.collection.parallel.ForkJoinTaskSupport

object SuiteKickoff {
  private val log = org.slf4j.LoggerFactory.getLogger(getClass)

  def run(s: Suite, spark: SparkSession): Unit = {
    verifyOutput(s.benchmarkOutput, s.saveMode, spark)

    // Translate the maps into runnable workloads
    val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf)

    val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i =>
      // This will produce one DataFrame of one row for each workload in the sequence.
      // We're going to produce one coherent DF later from these
      val dfSeqFromOneRun: Seq[DataFrame] = {
        if (s.parallel) runParallel(workloads, spark)
        else runSerially(workloads, spark)
      }
      // Indicate which run of this suite this was.
      dfSeqFromOneRun.map(_.withColumn("run", lit(i)))
    }

    // getting the Spark confs so we can output them in the results.
    val strSparkConfs = spark.conf.getAll

    // Ah, see, here's where we're joining that series of one-row DFs
    val singleDF = joinDataFrames(dataframes, spark)
    s.description.foreach(log.info)
    // And now we're going to curry in the results
    val plusSparkConf = addConfToResults(singleDF, strSparkConfs)
    val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1)
    // And write to disk. We're done with this suite!
    if (s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark)
  }

  private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    val confSeqPar = workloadConfigs.par
    confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
    confSeqPar.map(_.run(spark)).seq
  }

  private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    workloadConfigs.map(_.run(spark))
  }

  private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = {
    if (seq.length == 1) seq.head
    else {
      val seqOfColNames = seq.map(_.columns.toSet)
      val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _)

      def expr(myCols: Set[String], allCols: Set[String]) = {
        allCols.toList.map {
          case x if myCols.contains(x) => col(x)
          case x => lit(null).as(x)
        }
      }

      val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*))

      // Folding left across this sequence should be fine because each DF should only have 1 row
      // Nevarr Evarr do this to legit dataframes that are all like big and stuff
      seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _)
    }
  }
}
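Beyond the ForkJoinTaskSupport usage in runParallel, the joinDataFrames helper above does something slightly subtle: it unions one-row DataFrames whose column sets may differ by selecting the union of all column names from every frame and filling the columns a given frame lacks with nulls. A small, self-contained illustration of just that alignment step (my own example with hypothetical column names, assuming a local SparkSession; it is not part of spark-bench):

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

object UnionBySchemaDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("union-by-schema").getOrCreate()
    import spark.implicits._

    // Two one-row result frames with overlapping but different columns.
    val a = Seq(("wl1", 12L)).toDF("name", "runtime")
    val b = Seq(("wl2", 0.99)).toDF("name", "accuracy")

    // Every column that appears in either frame.
    val allCols = (a.columns.toSet ++ b.columns.toSet).toList

    // Select all columns in the same order, padding missing ones with nulls.
    def aligned(df: DataFrame): DataFrame =
      df.select(allCols.map(c => if (df.columns.contains(c)) col(c) else lit(null).as(c)): _*)

    // Now the schemas line up, so a plain union works.
    aligned(a).union(aligned(b)).show()
    spark.stop()
  }
}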
Example 5
Source File: MultipleSuiteKickoff.scala From spark-bench with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import org.apache.spark.sql.SparkSession

import scala.collection.parallel.ForkJoinTaskSupport

object MultipleSuiteKickoff {

  def run(seq: Seq[MultiSuiteRunConfig]): Unit = seq.foreach { contextConf =>
    val spark = createSparkContext(seq)
    if (contextConf.suitesParallel) runSuitesInParallel(contextConf.suites, spark)
    else runSuitesSerially(contextConf.suites, spark)
  }

  private def runSuitesInParallel(suiteSeq: Seq[Suite], spark: SparkSession): Unit = {
    val parallelSeq = suiteSeq.par
    parallelSeq.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(parallelSeq.size))
    parallelSeq.foreach(suite => SuiteKickoff.run(suite, spark))
  }

  private def runSuitesSerially(suiteSeq: Seq[Suite], spark: SparkSession): Unit =
    suiteSeq.foreach(SuiteKickoff.run(_, spark))

  private def createSparkContext(configs: Seq[MultiSuiteRunConfig]): SparkSession = {
    val builder = SparkSession.builder
    // if any configs have hive enabled, enable it for all
    val builderWithHive = if (configs.exists(_.enableHive)) builder.enableHiveSupport else builder
    builderWithHive.getOrCreate
  }
}
Example 6
Source File: UnionRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 7
Source File: TestThreading.scala From eidos with Apache License 2.0
package org.clulab.wm.eidos.utils

import org.clulab.wm.eidos.test.TestUtils._

import scala.collection.parallel.ForkJoinTaskSupport

class TestThreading extends Test {
  val threads = 26
  val numbers = 0.until(threads)
  val parNumbers = numbers.par
  val forkJoinPool = ThreadUtils.newForkJoinPool(threads)
  val forkJoinTaskSupport = new ForkJoinTaskSupport(forkJoinPool)

  // This should compile and output is not in order.
  parNumbers.tasksupport = forkJoinTaskSupport
  parNumbers.foreach { number =>
    println(number)
  }
}
Example 8
Source File: UnionRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 9
Source File: UnionRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 10
Source File: ProxyChecker.scala From ProxyCrawler with Apache License 2.0
package org.crowdcrawler.proxycrawler.checker

import java.io.IOException
import java.net.SocketTimeoutException

import com.typesafe.scalalogging.Logger
import org.apache.http.annotation.ThreadSafe
import org.apache.http.conn.ConnectTimeoutException
import org.crowdcrawler.proxycrawler.ProxyInfo
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

@ThreadSafe
object ProxyChecker {
  private val LOGGER = Logger(LoggerFactory.getLogger(ProxyChecker.getClass.getName))

  private def check(proxyInfo: ProxyInfo): ProxyInfo = {
    val start = System.currentTimeMillis
    try {
      LOGGER.info("Executing request via proxy " + proxyInfo)
      val (statusCode, bytes) = proxyInfo.schema match {
        case "HTTP" => HttpProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "HTTPS" => HttpsProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "SOCKS" | "SOCKS4" | "SOCKS5" => SocksProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case other => throw new IllegalArgumentException("Unsupported schema " + other)
      }
      val end = System.currentTimeMillis
      LOGGER.info("Time elapsed " + (end - start) + " milliseconds")

      if (statusCode != 200) {
        LOGGER.error("HTTP status code is " + statusCode)
        ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
      } else {
        if (bytes > 0) {
          val speed = (bytes / ((end - start) / 1000.0)).toInt
          LOGGER.info("Speed is " + speed + " bytes/s")
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, speed, proxyInfo.location, proxyInfo.from)
        } else {
          LOGGER.error("HTTP status code is 200 but the proxy failed to retrieve HTML source code")
          if (proxyInfo.speed >= 0) {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
          } else {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1, proxyInfo.location, proxyInfo.from)
          }
        }
      }
    } catch {
      case e: IOException =>
        val end = System.currentTimeMillis
        if (e.isInstanceOf[ConnectTimeoutException] || e.isInstanceOf[SocketTimeoutException]) {
          LOGGER.info(e.getClass.getName + " : " + e.getMessage)
          LOGGER.info("Time elapsed " + (end - start) + " milliseconds")
        } else {
          LOGGER.error(e.getClass.getName + " : " + e.getMessage)
          LOGGER.error("Time elapsed " + (end - start) + " milliseconds")
        }
        if (proxyInfo.speed >= 0) {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
        } else {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1, proxyInfo.location, proxyInfo.from)
        }
    }
  }
}
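The excerpt above imports ForkJoinTaskSupport and ForkJoinPool, but the visible check method only tests a single proxy; the part of the object that applies them is not shown, and presumably it maps check over a parallel collection of proxies. A hypothetical driver method in that spirit (checkAll is my name, not part of ProxyCrawler), which could sit next to check inside the same object:

  // Hypothetical companion method for the object above, not in the excerpt.
  def checkAll(proxies: List[ProxyInfo], concurrency: Int = 16): List[ProxyInfo] = {
    val par = proxies.par
    // Bound the number of concurrent proxy checks instead of relying on the
    // default global pool.
    par.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(concurrency))
    // Slow or dead proxies only tie up their own worker thread.
    par.map(check).toList
  }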
Example 11
Source File: GetSourceMetadataAction.scala From berilia with Apache License 2.0
package com.criteo.dev.cluster.source

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.{GlobalConfig, TableConfig}
import com.criteo.dev.cluster.copy.GetMetadataAction

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.{Failure, Success, Try}

  def apply(tables: List[TableConfig], useLocalScheme: Boolean = config.source.isLocalScheme): List[Either[InvalidTable, FullTableInfo]] = {
    val conf = config.backCompat
    val getMetadata = new GetMetadataAction(config, conf, node)

    // configure parallel execution
    val parTables = tables.par
    parTables.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.table))

    val (validTables, invalidTables) = parTables
      .map { table =>
        val (tableName, spec) = (table.name, (table.name :: table.partitions.map(_.mkString("(", ",", ")")).mkString(" ") :: Nil).mkString(" "))
        (tableName, spec, Try(getMetadata(spec, useLocalScheme)))
      }
      .toList
      .partition(_._3.isSuccess)

    val tableAndLocations = validTables
      .flatMap { case (_, _, Success(m)) =>
        if (m.partitions.size > 0) m.partitions.map(p => (m, p.location))
        else List((m, m.ddl.location.get))
      }

    tableAndLocations
      .zip(
        if (useLocalScheme) HDFSUtils.getFileSize(tableAndLocations.map(_._2))
        else HDFSUtils.getFileSize(tableAndLocations.map(_._2), node)
      )
      .groupBy { case ((m, _), _) => m }
      .foldLeft(List.empty[FullTableInfo]) { case (acc, (table, results)) =>
        FullTableInfo(
          table,
          TableHDFSInfo(
            table.database,
            table.ddl.table,
            results.map(_._2).sum,
            results.map(r => HDFSFileInfo(
              r._1._2,
              r._2
            )),
            table.partitions.size
          )
        ) :: acc
      }
      .map(Right(_)) ++ invalidTables.map { case (tableName, spec, Failure(e)) =>
        Left(InvalidTable(tableName, spec, e.getMessage))
      }
  }
}
Example 12
Source File: GetMetadataAction.scala From berilia with Apache License 2.0
package com.criteo.dev.cluster.copy

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.GlobalConfig
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

class GetMetadataAction(config: GlobalConfig, conf: Map[String, String], node : Node, throttle: Boolean = true) {

  private val logger = LoggerFactory.getLogger(classOf[GetMetadataAction])

  def apply(dbTablePartSpec: String, useLocalScheme: Boolean = config.source.isLocalScheme) : TableInfo = {
    //parse the configured source tables of form "$db.$table (part1=$part1, part2=$part2) (part1=$part1, part2=$part3)"
    val regex = """(\S*)\.(\S*)\s*(.*)""".r
    dbTablePartSpec match {
      case regex(db, table, partSpec) => {
        //1. Get the table metadata, like location, isPartitioned, and createStmt.
        val getTableMetadataAction = new GetTableMetadataAction(conf, node, useLocalScheme)
        val createTable = getTableMetadataAction(db, table)

        //2. If partitioned, get the list of partitions.
        val partitionList: Array[String] = if (createTable.partitionedBy.length != 0) {
          if (partSpec.isEmpty) {
            ListPartitionAction(conf, node, useLocalScheme, db, table, None, throttle)
          } else {
            val parenRegex = """\((.*?)\)""".r
            val parPartSpecs = parenRegex.findAllIn(partSpec).toList.par
            parPartSpecs.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.partition))
            parPartSpecs.flatMap(p =>
              ListPartitionAction(conf, node, useLocalScheme, db, table, Some(p), throttle)
            ).distinct.toArray
          }
        } else {
          Array.empty[String]
        }

        //3. Get partitionSpec in model form.
        val partitionSpecList: Array[PartSpec] = partitionList.map(s => {
          CopyUtilities.getPartInfos(s)
        })

        //4. Get partition locations as well
        val getPartitionAction = new GetPartitionMetadataAction(conf, node, useLocalScheme)
        val partitions = getPartitionAction(db, table, createTable, partitionSpecList)
        TableInfo(db, createTable.table, createTable, partitions)
      }
      case _ => throw new IllegalArgumentException(s"Cannot parse ${CopyConstants.sourceTables}: $dbTablePartSpec. " +
        "Make sure it is of form $db.$table $partition, where $partition is optional and of form (part1='val1', part2='val2').")
    }
  }
}