scala.concurrent.forkjoin.ForkJoinPool Scala Examples
The following examples show how to use scala.concurrent.forkjoin.ForkJoinPool. They are taken from open-source projects; the project and source file each example comes from are noted above its code.
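Most of the examples below follow the same basic pattern: build a ForkJoinPool with an explicit parallelism level and attach it to a parallel collection via ForkJoinTaskSupport, so that the collection's operations run on that pool rather than the default task support. The minimal sketch below illustrates the pattern on its own; it is not taken from any of the projects listed here, and it assumes a Scala version (2.11/2.12) in which scala.concurrent.forkjoin.ForkJoinPool is still available.

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

object ForkJoinPoolSketch {
  def main(args: Array[String]): Unit = {
    // Pool with a fixed parallelism level; 4 is an arbitrary choice for this sketch.
    val pool = new ForkJoinPool(4)

    // Turn a plain collection into a parallel one and bound its parallelism
    // by swapping in a task support backed by our pool.
    val numbers = (1 to 100).par
    numbers.tasksupport = new ForkJoinTaskSupport(pool)

    // map/sum now execute on at most 4 worker threads from the pool.
    val squares = numbers.map(n => n * n)
    println(squares.sum)

    // Release the pool's threads when done.
    pool.shutdown()
  }
}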
Example 1
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0 (the same file also appears verbatim in the sparkoscope and multi-tenancy-spark projects)
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 2
Source File: UnionRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 3
Source File: ProxyChecker.scala From ProxyCrawler with Apache License 2.0
package org.crowdcrawler.proxycrawler.checker

import java.io.IOException
import java.net.SocketTimeoutException

import com.typesafe.scalalogging.Logger
import org.apache.http.annotation.ThreadSafe
import org.apache.http.conn.ConnectTimeoutException
import org.crowdcrawler.proxycrawler.ProxyInfo
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

@ThreadSafe
object ProxyChecker {
  private val LOGGER = Logger(LoggerFactory.getLogger(ProxyChecker.getClass.getName))

  private def check(proxyInfo: ProxyInfo): ProxyInfo = {
    val start = System.currentTimeMillis
    try {
      LOGGER.info("Executing request via proxy " + proxyInfo)
      val (statusCode, bytes) = proxyInfo.schema match {
        case "HTTP" =>
          HttpProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "HTTPS" =>
          HttpsProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "SOCKS" | "SOCKS4" | "SOCKS5" =>
          SocksProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case other =>
          throw new IllegalArgumentException("Unsupported schema " + other)
      }
      val end = System.currentTimeMillis
      LOGGER.info("Time elapsed " + (end - start) + " milliseconds")

      if (statusCode != 200) {
        LOGGER.error("HTTP status code is " + statusCode)
        ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1,
          proxyInfo.location, proxyInfo.from)
      } else {
        if (bytes > 0) {
          val speed = (bytes / ((end - start) / 1000.0)).toInt
          LOGGER.info("Speed is " + speed + " bytes/s")
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, speed,
            proxyInfo.location, proxyInfo.from)
        } else {
          LOGGER.error("HTTP status code is 200 but the proxy failed to retrieve HTML source code")
          if (proxyInfo.speed >= 0) {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1,
              proxyInfo.location, proxyInfo.from)
          } else {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1,
              proxyInfo.location, proxyInfo.from)
          }
        }
      }
    } catch {
      case e: IOException =>
        val end = System.currentTimeMillis
        if (e.isInstanceOf[ConnectTimeoutException] || e.isInstanceOf[SocketTimeoutException]) {
          LOGGER.info(e.getClass.getName + " : " + e.getMessage)
          LOGGER.info("Time elapsed " + (end - start) + " milliseconds")
        } else {
          LOGGER.error(e.getClass.getName + " : " + e.getMessage)
          LOGGER.error("Time elapsed " + (end - start) + " milliseconds")
        }
        if (proxyInfo.speed >= 0) {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1,
            proxyInfo.location, proxyInfo.from)
        } else {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1,
            proxyInfo.location, proxyInfo.from)
        }
    }
  }
}
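The ProxyChecker listing above imports ForkJoinTaskSupport and ForkJoinPool, but the method that actually uses them (the one that checks a whole batch of proxies in parallel) is not included in the snippet. The fragment below is a hypothetical sketch, not taken from the original source, of how such a method could be wired up inside the same object; the method name checkAll and the parallelism level of 30 are assumptions for illustration only.

  // Hypothetical sketch (not part of the original ProxyChecker source):
  // check a batch of proxies in parallel on a dedicated ForkJoinPool.
  def checkAll(proxies: List[ProxyInfo]): List[ProxyInfo] = {
    val parProxies = proxies.par
    // The parallelism level (30) is an assumed value, not from the project.
    parProxies.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(30))
    parProxies.map(check).toList
  }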
Example 4
Source File: GetSourceMetadataAction.scala From berilia with Apache License 2.0
package com.criteo.dev.cluster.source

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.{GlobalConfig, TableConfig}
import com.criteo.dev.cluster.copy.GetMetadataAction

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.{Failure, Success, Try}

// NOTE: this listing is truncated at the top. The class header below is a reconstruction
// (assumed, not from the original source) that provides the `config` and `node` members
// used inside apply; imports for helpers such as HDFSUtils are likewise not shown.
class GetSourceMetadataAction(config: GlobalConfig, node: Node) {

  def apply(
      tables: List[TableConfig],
      useLocalScheme: Boolean = config.source.isLocalScheme): List[Either[InvalidTable, FullTableInfo]] = {
    val conf = config.backCompat
    val getMetadata = new GetMetadataAction(config, conf, node)

    // configure parallel execution
    val parTables = tables.par
    parTables.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.table))

    val (validTables, invalidTables) = parTables
      .map { table =>
        val (tableName, spec) = (
          table.name,
          (table.name :: table.partitions.map(_.mkString("(", ",", ")")).mkString(" ") :: Nil).mkString(" ")
        )
        (tableName, spec, Try(getMetadata(spec, useLocalScheme)))
      }
      .toList
      .partition(_._3.isSuccess)

    val tableAndLocations = validTables
      .flatMap { case (_, _, Success(m)) =>
        if (m.partitions.size > 0) m.partitions.map(p => (m, p.location))
        else List((m, m.ddl.location.get))
      }

    tableAndLocations
      .zip(
        if (useLocalScheme) HDFSUtils.getFileSize(tableAndLocations.map(_._2))
        else HDFSUtils.getFileSize(tableAndLocations.map(_._2), node)
      )
      .groupBy { case ((m, _), _) => m }
      .foldLeft(List.empty[FullTableInfo]) { case (acc, (table, results)) =>
        FullTableInfo(
          table,
          TableHDFSInfo(
            table.database,
            table.ddl.table,
            results.map(_._2).sum,
            results.map(r => HDFSFileInfo(
              r._1._2,
              r._2
            )),
            table.partitions.size
          )
        ) :: acc
      }
      .map(Right(_)) ++ invalidTables.map { case (tableName, spec, Failure(e)) =>
        Left(InvalidTable(tableName, spec, e.getMessage))
      }
  }
}
Example 5
Source File: GetMetadataAction.scala From berilia with Apache License 2.0
package com.criteo.dev.cluster.copy

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.GlobalConfig
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

class GetMetadataAction(config: GlobalConfig,
                        conf: Map[String, String],
                        node: Node,
                        throttle: Boolean = true) {

  private val logger = LoggerFactory.getLogger(classOf[GetMetadataAction])

  def apply(dbTablePartSpec: String,
            useLocalScheme: Boolean = config.source.isLocalScheme): TableInfo = {
    // parse the configured source tables of form
    // "$db.$table (part1=$part1, part2=$part2) (part1=$part1, part2=$part3)"
    val regex = """(\S*)\.(\S*)\s*(.*)""".r
    dbTablePartSpec match {
      case regex(db, table, partSpec) => {
        // 1. Get the table metadata, like location, isPartitioned, and createStmt.
        val getTableMetadataAction = new GetTableMetadataAction(conf, node, useLocalScheme)
        val createTable = getTableMetadataAction(db, table)

        // 2. If partitioned, get the list of partitions.
        val partitionList: Array[String] =
          if (createTable.partitionedBy.length != 0) {
            if (partSpec.isEmpty) {
              ListPartitionAction(conf, node, useLocalScheme, db, table, None, throttle)
            } else {
              val parenRegex = """\((.*?)\)""".r
              val parPartSpecs = parenRegex.findAllIn(partSpec).toList.par
              parPartSpecs.tasksupport =
                new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.partition))
              parPartSpecs.flatMap(p =>
                ListPartitionAction(conf, node, useLocalScheme, db, table, Some(p), throttle)
              ).distinct.toArray
            }
          } else {
            Array.empty[String]
          }

        // 3. Get partitionSpec in model form.
        val partitionSpecList: Array[PartSpec] = partitionList.map(s => {
          CopyUtilities.getPartInfos(s)
        })

        // 4. Get partition locations as well
        val getPartitionAction = new GetPartitionMetadataAction(conf, node, useLocalScheme)
        val partitions = getPartitionAction(db, table, createTable, partitionSpecList)
        TableInfo(db, createTable.table, createTable, partitions)
      }
      case _ => throw new IllegalArgumentException(
        s"Cannot parse ${CopyConstants.sourceTables}: $dbTablePartSpec. " +
          "Make sure it is of form $db.$table $partition, where $partition is optional " +
          "and of form (part1='val1', part2='val2').")
    }
  }
}