scala.collection.mutable.ArrayBuffer Scala Examples
The following examples show how to use scala.collection.mutable.ArrayBuffer.
Each example is taken from an open-source project; the source file, project, and license are named in the heading above it.
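Before the project examples, here is a minimal, self-contained sketch of the ArrayBuffer operations that recur throughout them: appending single elements and collections, in-place update, clearing, and copying out. It uses only the standard library; the object name and sample values are illustrative.

```scala
import scala.collection.mutable.ArrayBuffer

object ArrayBufferBasics {
  def main(args: Array[String]): Unit = {
    // Start empty and grow in place.
    val buffer = ArrayBuffer.empty[String]
    buffer += "a"            // append one element
    buffer ++= Seq("b", "c") // append a whole collection
    buffer.append("d")       // equivalent to +=

    // Indexed read and in-place update are constant time.
    buffer(0) = "A"
    println(buffer.mkString(", ")) // A, b, c, d

    // Copy the accumulated result out, then reuse the buffer.
    val snapshot: Array[String] = buffer.toArray
    buffer.clear()
    println(s"${snapshot.length} elements copied, buffer now holds ${buffer.length}")
  }
}
```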
Example 1
Source File: IntegrationTest.scala From kmq with Apache License 2.0 | 6 votes |
```scala
package com.softwaremill.kmq.redelivery

import java.time.Duration
import java.util.Random

import akka.actor.ActorSystem
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerMessage, ProducerSettings, Subscriptions}
import akka.stream.ActorMaterializer
import akka.testkit.TestKit
import com.softwaremill.kmq._
import com.softwaremill.kmq.redelivery.infrastructure.KafkaSpec
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.scalatest.concurrent.Eventually
import org.scalatest.time.{Seconds, Span}
import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers}

import scala.collection.mutable.ArrayBuffer

class IntegrationTest extends TestKit(ActorSystem("test-system")) with FlatSpecLike with KafkaSpec
  with BeforeAndAfterAll with Eventually with Matchers {

  implicit val materializer = ActorMaterializer()
  import system.dispatcher

  "KMQ" should "resend message if not committed" in {
    val bootstrapServer = s"localhost:${testKafkaConfig.kafkaPort}"
    val kmqConfig = new KmqConfig("queue", "markers", "kmq_client", "kmq_redelivery",
      Duration.ofSeconds(1).toMillis, 1000)

    val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer)
      .withBootstrapServers(bootstrapServer)
      .withGroupId(kmqConfig.getMsgConsumerGroupId)
      .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

    val markerProducerSettings = ProducerSettings(system,
      new MarkerKey.MarkerKeySerializer(), new MarkerValue.MarkerValueSerializer())
      .withBootstrapServers(bootstrapServer)
      .withProperty(ProducerConfig.PARTITIONER_CLASS_CONFIG, classOf[ParititionFromMarkerKey].getName)
    val markerProducer = markerProducerSettings.createKafkaProducer()

    val random = new Random()

    lazy val processedMessages = ArrayBuffer[String]()
    lazy val receivedMessages = ArrayBuffer[String]()

    val control = Consumer.committableSource(consumerSettings, Subscriptions.topics(kmqConfig.getMsgTopic)) // 1. get messages from topic
      .map { msg =>
        ProducerMessage.Message(
          new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic,
            MarkerKey.fromRecord(msg.record), new StartMarker(kmqConfig.getMsgTimeoutMs)), msg)
      }
      .via(Producer.flow(markerProducerSettings, markerProducer)) // 2. write the "start" marker
      .map(_.message.passThrough)
      .mapAsync(1) { msg =>
        msg.committableOffset.commitScaladsl().map(_ => msg.record) // this should be batched
      }
      .map { msg =>
        receivedMessages += msg.value
        msg
      }
      .filter(_ => random.nextInt(5) != 0)
      .map { processedMessage =>
        processedMessages += processedMessage.value
        new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic,
          MarkerKey.fromRecord(processedMessage), EndMarker.INSTANCE)
      }
      .to(Producer.plainSink(markerProducerSettings, markerProducer)) // 5. write "end" markers
      .run()

    val redeliveryHook = RedeliveryTracker.start(new KafkaClients(bootstrapServer), kmqConfig)

    val messages = (0 to 20).map(_.toString)
    messages.foreach(msg => sendToKafka(kmqConfig.getMsgTopic, msg))

    eventually {
      receivedMessages.size should be > processedMessages.size
      processedMessages.sortBy(_.toInt).distinct shouldBe messages
    }(PatienceConfig(timeout = Span(15, Seconds)), implicitly)

    redeliveryHook.close()
    control.shutdown()
  }

  override def afterAll(): Unit = {
    super.afterAll()
    TestKit.shutdownActorSystem(system)
  }
}
```
Example 2
Source File: UndoSnackbarManager.scala From shadowsocksr-android with GNU General Public License v3.0 | 5 votes |
```scala
package com.github.shadowsocks.widget

import android.support.design.widget.Snackbar
import android.view.View
import com.github.shadowsocks.R

import scala.collection.mutable.ArrayBuffer

class UndoSnackbarManager[T](view: View, undo: Iterator[(Int, T)] => Unit,
                             commit: Iterator[(Int, T)] => Unit = null) {
  private val recycleBin = new ArrayBuffer[(Int, T)]
  private val removedCallback = new Snackbar.Callback {
    override def onDismissed(snackbar: Snackbar, event: Int) = {
      event match {
        case Snackbar.Callback.DISMISS_EVENT_SWIPE | Snackbar.Callback.DISMISS_EVENT_MANUAL |
             Snackbar.Callback.DISMISS_EVENT_TIMEOUT =>
          if (commit != null) commit(recycleBin.iterator)
          recycleBin.clear
        case _ =>
      }
      last = null
    }
  }
  private var last: Snackbar = _

  def remove(index: Int, item: T) = {
    recycleBin.append((index, item))
    val count = recycleBin.length
    last = Snackbar
      .make(view, view.getResources.getQuantityString(R.plurals.removed, count, count: Integer), Snackbar.LENGTH_LONG)
      .setCallback(removedCallback).setAction(R.string.undo, (_ => {
        undo(recycleBin.reverseIterator)
        recycleBin.clear
      }): View.OnClickListener)
    last.show
  }

  def flush = if (last != null) last.dismiss
}
```
Example 3
Source File: SinkRouteHandler.scala From ohara with Apache License 2.0 | 5 votes |
package oharastream.ohara.shabondi.sink import java.time.{Duration => JDuration} import java.util.concurrent.TimeUnit import akka.actor.ActorSystem import akka.http.scaladsl.model.{ContentTypes, HttpEntity, StatusCodes} import akka.http.scaladsl.server.{ExceptionHandler, Route} import com.typesafe.scalalogging.Logger import oharastream.ohara.common.data.Row import oharastream.ohara.common.util.Releasable import oharastream.ohara.shabondi.common.{JsonSupport, RouteHandler, ShabondiUtils} import org.apache.commons.lang3.StringUtils import scala.collection.mutable.ArrayBuffer import scala.compat.java8.DurationConverters._ import scala.concurrent.ExecutionContextExecutor import scala.concurrent.duration.Duration import spray.json.DefaultJsonProtocol._ import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ private[shabondi] object SinkRouteHandler { def apply(config: SinkConfig)(implicit actorSystem: ActorSystem) = new SinkRouteHandler(config) } private[shabondi] class SinkRouteHandler(config: SinkConfig)(implicit actorSystem: ActorSystem) extends RouteHandler { implicit private val contextExecutor: ExecutionContextExecutor = actorSystem.dispatcher private val log = Logger(classOf[SinkRouteHandler]) private[sink] val dataGroups = SinkDataGroups(config) def scheduleFreeIdleGroups(interval: JDuration, idleTime: JDuration): Unit = actorSystem.scheduler.scheduleWithFixedDelay(Duration(1, TimeUnit.SECONDS), interval.toScala) { () => { log.trace("scheduled free group, total group: {} ", dataGroups.size) dataGroups.freeIdleGroup(idleTime) } } private val exceptionHandler = ExceptionHandler { case ex: Throwable => log.error(ex.getMessage, ex) complete((StatusCodes.InternalServerError, ex.getMessage)) } private def fullyPollQueue(queue: RowQueue): Seq[Row] = { val buffer = ArrayBuffer.empty[Row] var item: Row = queue.poll() while (item != null) { buffer += item item = queue.poll() } buffer.toSeq } private def apiUrl = ShabondiUtils.apiUrl def route(): Route = handleExceptions(exceptionHandler) { path("groups" / Segment) { groupId => get { if (StringUtils.isAlphanumeric(groupId)) { val group = dataGroups.createIfAbsent(groupId) val result = fullyPollQueue(group.queue).map(row => JsonSupport.toRowData(row)) complete(result) } else { val entity = HttpEntity(ContentTypes.`text/plain(UTF-8)`, "Illegal group name, only accept alpha and numeric.") complete(StatusCodes.NotAcceptable -> entity) } } ~ { complete(StatusCodes.MethodNotAllowed -> s"Unsupported method, please reference: $apiUrl") } } ~ { complete(StatusCodes.NotFound -> s"Please reference: $apiUrl") } } override def close(): Unit = { Releasable.close(dataGroups) } }
Example 4
Source File: CSVConverter.scala From spark-snowflake with Apache License 2.0 | 5 votes |
```scala
package net.snowflake.spark.snowflake

import org.apache.spark.sql.types.StructType

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

object CSVConverter {
  private final val delimiter = '|'
  private final val quoteChar = '"'

  private[snowflake] def convert[T: ClassTag](
    partition: Iterator[String],
    resultSchema: StructType
  ): Iterator[T] = {
    val converter = Conversions.createRowConverter[T](resultSchema)
    partition.map(s => {
      val fields = ArrayBuffer.empty[String]
      var buff = new StringBuilder

      def addField(): Unit = {
        if (buff.isEmpty) fields.append(null)
        else {
          val field = buff.toString()
          buff = new StringBuilder
          fields.append(field)
        }
      }

      var escaped = false
      var index = 0
      while (index < s.length) {
        escaped = false
        if (s(index) == quoteChar) {
          index += 1
          while (index < s.length && !(escaped && s(index) == delimiter)) {
            if (escaped) {
              escaped = false
              buff.append(s(index))
            } else if (s(index) == quoteChar) escaped = true
            else buff.append(s(index))
            index += 1
          }
          addField()
        } else {
          while (index < s.length && s(index) != delimiter) {
            buff.append(s(index))
            index += 1
          }
          addField()
        }
        index += 1
      }
      addField()
      converter(fields.toArray)
    })
  }
}
```
Example 5
Source File: InterfaceTreeSpec.scala From daml with Apache License 2.0 | 5 votes |
// Copyright (c) 2020 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package com.daml.lf.codegen import com.daml.lf.data.ImmArray import com.daml.lf.data.ImmArray.ImmArraySeq import com.daml.lf.data.Ref.{DottedName, QualifiedName, PackageId} import com.daml.lf.iface.{DefDataType, Interface, InterfaceType, Record, Variant} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer class InterfaceTreeSpec extends FlatSpec with Matchers { behavior of "InterfaceTree.bfs" it should "traverse an empty tree" in { val interfaceTree = InterfaceTree(Map.empty, Interface(PackageId.assertFromString("packageid"), Map.empty)) interfaceTree.bfs(0)((x, _) => x + 1) shouldEqual 0 } it should "traverse a tree with n elements in bfs order" in { val qualifiedName1 = QualifiedName( DottedName.assertFromSegments(ImmArray("foo").toSeq), DottedName.assertFromSegments(ImmArray("bar").toSeq)) val record1 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq()))) val qualifiedName2 = QualifiedName( DottedName.assertFromSegments(ImmArray("foo").toSeq), DottedName.assertFromSegments(ImmArray("bar", "baz").toSeq)) val variant1 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Variant(ImmArraySeq()))) val qualifiedName3 = QualifiedName( DottedName.assertFromSegments(ImmArray("foo").toSeq), DottedName.assertFromSegments(ImmArray("qux").toSeq)) val record2 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq()))) val typeDecls = Map(qualifiedName1 -> record1, qualifiedName2 -> variant1, qualifiedName3 -> record2) val interface = new Interface(PackageId.assertFromString("packageId2"), typeDecls) val tree = InterfaceTree.fromInterface(interface) val result = tree.bfs(ArrayBuffer.empty[InterfaceType])((ab, n) => n match { case ModuleWithContext(interface @ _, modulesLineage @ _, name @ _, module @ _) => ab case TypeWithContext(interface @ _, modulesLineage @ _, typesLineage @ _, name @ _, typ) => ab ++= typ.typ.toList }) result should contain theSameElementsInOrderAs Seq(record1, record2, variant1) } behavior of "InterfaceTree.fromInterface" it should "permit standalone types with multi-component names" in { val bazQuux = QualifiedName( DottedName.assertFromSegments(ImmArray("foo", "bar").toSeq), DottedName.assertFromSegments(ImmArray("baz", "quux").toSeq) ) val record = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq()))) val typeDecls = Map(bazQuux -> record) val interface = new Interface(PackageId.assertFromString("pkgid"), typeDecls) val tree = InterfaceTree.fromInterface(interface) val result = tree.bfs(ArrayBuffer.empty[InterfaceType])((types, n) => n match { case _: ModuleWithContext => types case TypeWithContext(_, _, _, _, tpe) => types ++= tpe.typ.toList }) result.toList shouldBe List(record) } }
Example 6
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 7
Source File: KPLBasedKinesisTestUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.streaming.kinesis

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

private[kinesis] class KPLBasedKinesisTestUtils extends KinesisTestUtils {
  override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = {
    if (!aggregate) {
      new SimpleDataGenerator(kinesisClient)
    } else {
      new KPLDataGenerator(regionName)
    }
  }
}

private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator {

  private lazy val producer: KPLProducer = {
    val conf = new KinesisProducerConfiguration()
      .setRecordMaxBufferedTime(1000)
      .setMaxConnections(1)
      .setRegion(regionName)
      .setMetricsLevel("none")

    new KPLProducer(conf)
  }

  override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = {
    val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()
    data.foreach { num =>
      val str = num.toString
      val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8))
      val future = producer.addUserRecord(streamName, str, data)
      val kinesisCallBack = new FutureCallback[UserRecordResult]() {
        override def onFailure(t: Throwable): Unit = {} // do nothing
        override def onSuccess(result: UserRecordResult): Unit = {
          val shardId = result.getShardId
          val seqNumber = result.getSequenceNumber()
          val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId,
            new ArrayBuffer[(Int, String)]())
          sentSeqNumbers += ((num, seqNumber))
        }
      }
      Futures.addCallback(future, kinesisCallBack)
    }
    producer.flushSync()
    shardIdToSeqNumbers.toMap
  }
}
```
Example 8
Source File: Exchange.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
```
Example 9
Source File: subquery.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.sql.execution

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{BooleanType, DataType, StructType}

case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]()
    plan transformAllExpressions {
      case sub: ExecSubqueryExpression =>
        val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]())
        val sameResult = sameSchema.find(_.sameResult(sub.plan))
        if (sameResult.isDefined) {
          sub.withNewPlan(sameResult.get)
        } else {
          sameSchema += sub.plan
          sub
        }
    }
  }
}
```
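Examples 7, 8, and 9 above all rely on the same idiom: grouping values under a key with `mutable.HashMap.getOrElseUpdate(key, ArrayBuffer())` and appending in place. The sketch below isolates that idiom; the names and sample data are illustrative only, not taken from the projects above.

```scala
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object GroupingIdiom {
  def main(args: Array[String]): Unit = {
    // Map each key to a growable buffer of the values seen for it.
    val idsByUser = mutable.HashMap[String, ArrayBuffer[Int]]()

    val events = Seq("alice" -> 1, "bob" -> 2, "alice" -> 3)
    for ((user, id) <- events) {
      // Create the per-key buffer on first use, then append in place.
      idsByUser.getOrElseUpdate(user, ArrayBuffer[Int]()) += id
    }

    // Snapshot into an immutable Map once accumulation is done,
    // mirroring the shardIdToSeqNumbers.toMap call in Example 7.
    val grouped: Map[String, Seq[Int]] = idsByUser.map { case (k, v) => k -> v.toSeq }.toMap
    println(grouped) // e.g. Map(alice -> Seq(1, 3), bob -> Seq(2))
  }
}
```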
Example 10
Source File: ApplicationMasterArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.mutable.ArrayBuffer import org.apache.spark.util.{IntParam, MemoryParam} class ApplicationMasterArguments(val args: Array[String]) { var userJar: String = null var userClass: String = null var primaryPyFile: String = null var primaryRFile: String = null var userArgs: Seq[String] = Nil var propertiesFile: String = null parseArgs(args.toList) private def parseArgs(inputArgs: List[String]): Unit = { val userArgsBuffer = new ArrayBuffer[String]() var args = inputArgs while (!args.isEmpty) { // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0, // the properties with executor in their names are preferred. args match { case ("--jar") :: value :: tail => userJar = value args = tail case ("--class") :: value :: tail => userClass = value args = tail case ("--primary-py-file") :: value :: tail => primaryPyFile = value args = tail case ("--primary-r-file") :: value :: tail => primaryRFile = value args = tail case ("--arg") :: value :: tail => userArgsBuffer += value args = tail case ("--properties-file") :: value :: tail => propertiesFile = value args = tail case _ => printUsageAndExit(1, args) } } if (primaryPyFile != null && primaryRFile != null) { // scalastyle:off println System.err.println("Cannot have primary-py-file and primary-r-file at the same time") // scalastyle:on println System.exit(-1) } userArgs = userArgsBuffer.toList } def printUsageAndExit(exitCode: Int, unknownParam: Any = null) { // scalastyle:off println if (unknownParam != null) { System.err.println("Unknown/unsupported param " + unknownParam) } System.err.println(""" |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options] |Options: | --jar JAR_PATH Path to your application's JAR file | --class CLASS_NAME Name of your application's main class | --primary-py-file A main Python file | --primary-r-file A main R file | --arg ARG Argument to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. | --properties-file FILE Path to a custom Spark properties file. """.stripMargin) // scalastyle:on println System.exit(exitCode) } } object ApplicationMasterArguments { val DEFAULT_NUMBER_EXECUTORS = 2 }
Example 11
Source File: ClientArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.mutable.ArrayBuffer // TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware ! private[spark] class ClientArguments(args: Array[String]) { var userJar: String = null var userClass: String = null var primaryPyFile: String = null var primaryRFile: String = null var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]() parseArgs(args.toList) private def parseArgs(inputArgs: List[String]): Unit = { var args = inputArgs while (!args.isEmpty) { args match { case ("--jar") :: value :: tail => userJar = value args = tail case ("--class") :: value :: tail => userClass = value args = tail case ("--primary-py-file") :: value :: tail => primaryPyFile = value args = tail case ("--primary-r-file") :: value :: tail => primaryRFile = value args = tail case ("--arg") :: value :: tail => userArgs += value args = tail case Nil => case _ => throw new IllegalArgumentException(getUsageMessage(args)) } } if (primaryPyFile != null && primaryRFile != null) { throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" + " at the same time") } } private def getUsageMessage(unknownParam: List[String] = null): String = { val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else "" message + s""" |Usage: org.apache.spark.deploy.yarn.Client [options] |Options: | --jar JAR_PATH Path to your application's JAR file (required in yarn-cluster | mode) | --class CLASS_NAME Name of your application's main class (required) | --primary-py-file A main Python file | --primary-r-file A main R file | --arg ARG Argument to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. """.stripMargin } }
Example 12
Source File: YarnClientSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.scheduler.cluster

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.yarn.api.records.YarnApplicationState

import org.apache.spark.{SparkContext, SparkException}
import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.SparkAppHandle
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class YarnClientSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext)
  extends YarnSchedulerBackend(scheduler, sc)
  with Logging {

  private var client: Client = null
  private var monitorThread: MonitorThread = null

  override def stop() {
    assert(client != null, "Attempted to stop this scheduler before starting it!")
    if (monitorThread != null) {
      monitorThread.stopMonitor()
    }

    // Report a final state to the launcher if one is connected. This is needed since in client
    // mode this backend doesn't let the app monitor loop run to completion, so it does not report
    // the final state itself.
    //
    // Note: there's not enough information at this point to provide a better final state,
    // so assume the application was successful.
    client.reportLauncherState(SparkAppHandle.State.FINISHED)

    super.stop()
    YarnSparkHadoopUtil.get.stopCredentialUpdater()
    client.stop()
    logInfo("Stopped")
  }
}
```
Example 13
Source File: UnionDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to union is empty")
  require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.length == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.nonEmpty) {
      Some(ssc.sc.union(rdds))
    } else {
      None
    }
  }
}
```
Example 14
Source File: QueueInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
```
Example 15
Source File: LocalSparkCluster.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.deploy

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkConf
import org.apache.spark.deploy.master.Master
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.internal.Logging
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.util.Utils

    for (workerNum <- 1 to numWorkers) {
      val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker,
        memoryPerWorker, masters, null, Some(workerNum), _conf)
      workerRpcEnvs += workerEnv
    }

    masters
  }

  def stop() {
    logInfo("Shutting down local Spark cluster.")
    // Stop the workers before the master so they don't get upset that it disconnected
    workerRpcEnvs.foreach(_.shutdown())
    masterRpcEnvs.foreach(_.shutdown())
    workerRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.clear()
    workerRpcEnvs.clear()
  }
}
```
Example 16
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
```
Example 17
Source File: Schedulable.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark.scheduler

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

private[spark] trait Schedulable {
  var parent: Pool
  // child queues
  def schedulableQueue: ConcurrentLinkedQueue[Schedulable]
  def schedulingMode: SchedulingMode
  def weight: Int
  def minShare: Int
  def runningTasks: Int
  def priority: Int
  def stageId: Int
  def name: String

  def addSchedulable(schedulable: Schedulable): Unit
  def removeSchedulable(schedulable: Schedulable): Unit
  def getSchedulableByName(name: String): Schedulable
  def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit
  def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean
  def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager]
}
```
Example 18
Source File: ChunkedByteBufferOutputStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util.io import java.io.OutputStream import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.storage.StorageUtils private[this] var position = chunkSize private[this] var _size = 0 private[this] var closed: Boolean = false def size: Long = _size override def close(): Unit = { if (!closed) { super.close() closed = true } } override def write(b: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") allocateNewChunkIfNeeded() chunks(lastChunkIndex).put(b.toByte) position += 1 _size += 1 } override def write(bytes: Array[Byte], off: Int, len: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") var written = 0 while (written < len) { allocateNewChunkIfNeeded() val thisBatch = math.min(chunkSize - position, len - written) chunks(lastChunkIndex).put(bytes, written + off, thisBatch) written += thisBatch position += thisBatch } _size += len } @inline private def allocateNewChunkIfNeeded(): Unit = { if (position == chunkSize) { chunks += allocator(chunkSize) lastChunkIndex += 1 position = 0 } } def toChunkedByteBuffer: ChunkedByteBuffer = { require(closed, "cannot call toChunkedByteBuffer() unless close() has been called") require(!toChunkedByteBufferWasCalled, "toChunkedByteBuffer() can only be called once") toChunkedByteBufferWasCalled = true if (lastChunkIndex == -1) { new ChunkedByteBuffer(Array.empty[ByteBuffer]) } else { // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk. // An alternative would have been returning an array of ByteBuffers, with the last buffer // bounded to only the last chunk's position. However, given our use case in Spark (to put // the chunks in block manager), only limiting the view bound of the buffer would still // require the block manager to store the whole chunk. val ret = new Array[ByteBuffer](chunks.size) for (i <- 0 until chunks.size - 1) { ret(i) = chunks(i) ret(i).flip() } if (position == chunkSize) { ret(lastChunkIndex) = chunks(lastChunkIndex) ret(lastChunkIndex).flip() } else { ret(lastChunkIndex) = allocator(position) chunks(lastChunkIndex).flip() ret(lastChunkIndex).put(chunks(lastChunkIndex)) ret(lastChunkIndex).flip() StorageUtils.dispose(chunks(lastChunkIndex)) } new ChunkedByteBuffer(ret) } } }
Example 19
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( @transient var rdd1: RDD[_ <: Product2[K, V]], @transient var rdd2: RDD[_ <: Product2[K, W]], part: Partitioner) extends RDD[(K, V)](rdd1.context, Nil) { override def getDependencies: Seq[Dependency[_]] = { def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency[T1, T2, Any](rdd, part) } } Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.length) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => None case _ => Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split oneToOneDependency.rdd.iterator(dependencyPartition, context) .asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case shuffleDependency: ShuffleDependency[_, _, _] => val iter = SparkEnv.get.shuffleManager .getReader( shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } } // the first dep is rdd1; add all values to the map integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
Example 20
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 21
Source File: TaskContextImpl.scala From drizzle-spark with Apache License 2.0 | 5 votes |
```scala
package org.apache.spark

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.memory.TaskMemoryManager
import org.apache.spark.metrics.MetricsSystem
import org.apache.spark.metrics.source.Source
import org.apache.spark.util._

private[spark] class TaskContextImpl(
    val stageId: Int,
    val partitionId: Int,
    override val taskAttemptId: Long,
    override val attemptNumber: Int,
    var _taskMemoryManager: TaskMemoryManager,
    localProperties: Properties,
    @transient private val metricsSystem: MetricsSystem,
    // The default value is only used in tests.
    override val taskMetrics: TaskMetrics = TaskMetrics.empty,
    var batchId: Int = 0)
  extends TaskContext
  with Logging {

  private[spark] def markInterrupted(): Unit = {
    interrupted = true
  }

  override def isCompleted(): Boolean = completed

  override def isRunningLocally(): Boolean = false

  override def isInterrupted(): Boolean = interrupted

  override def getLocalProperty(key: String): String = localProperties.getProperty(key)

  override def getMetricsSources(sourceName: String): Seq[Source] =
    metricsSystem.getSourcesByName(sourceName)

  private[spark] override def registerAccumulator(a: AccumulatorV2[_, _]): Unit = {
    taskMetrics.registerAccumulator(a)
  }
}
```
Example 22
Source File: TimeStampedHashMapSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.util.Random import org.apache.spark.SparkFunSuite class TimeStampedHashMapSuite extends SparkFunSuite { // Test the testMap function - a Scala HashMap should obviously pass testMap(new mutable.HashMap[String, String]()) // Test TimeStampedHashMap basic functionality testMap(new TimeStampedHashMap[String, String]()) testMapThreadSafety(new TimeStampedHashMap[String, String]()) test("TimeStampedHashMap - clearing by timestamp") { // clearing by insertion time val map = new TimeStampedHashMap[String, String](updateTimeStampOnGet = false) map("k1") = "v1" assert(map("k1") === "v1") Thread.sleep(10) val threshTime = System.currentTimeMillis assert(map.getTimestamp("k1").isDefined) assert(map.getTimestamp("k1").get < threshTime) map.clearOldValues(threshTime) assert(map.get("k1") === None) // clearing by modification time val map1 = new TimeStampedHashMap[String, String](updateTimeStampOnGet = true) map1("k1") = "v1" map1("k2") = "v2" assert(map1("k1") === "v1") Thread.sleep(10) val threshTime1 = System.currentTimeMillis Thread.sleep(10) assert(map1("k2") === "v2") // access k2 to update its access time to > threshTime assert(map1.getTimestamp("k1").isDefined) assert(map1.getTimestamp("k1").get < threshTime1) assert(map1.getTimestamp("k2").isDefined) assert(map1.getTimestamp("k2").get >= threshTime1) map1.clearOldValues(threshTime1) // should only clear k1 assert(map1.get("k1") === None) assert(map1.get("k2").isDefined) } def testMapThreadSafety(hashMapConstructor: => mutable.Map[String, String]) { def newMap() = hashMapConstructor val name = newMap().getClass.getSimpleName val testMap = newMap() @volatile var error = false def getRandomKey(m: mutable.Map[String, String]): Option[String] = { val keys = testMap.keysIterator.toSeq if (keys.nonEmpty) { Some(keys(Random.nextInt(keys.size))) } else { None } } val threads = (1 to 25).map(i => new Thread() { override def run() { try { for (j <- 1 to 1000) { Random.nextInt(3) match { case 0 => testMap(Random.nextString(10)) = Random.nextDouble().toString // put case 1 => getRandomKey(testMap).map(testMap.get) // get case 2 => getRandomKey(testMap).map(testMap.remove) // remove } } } catch { case t: Throwable => error = true throw t } } }) test(name + " - threading safety test") { threads.foreach(_.start()) threads.foreach(_.join()) assert(!error) } } }
Example 23
Source File: Predict.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample} import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.utils.Engine import com.intel.analytics.bigdl.dataset.Sample import com.intel.analytics.bigdl.optim.LocalPredictor import org.apache.log4j.{Level, Logger} import scala.collection.mutable.ArrayBuffer object Predict { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).foreach { param => System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", (param.coreNumber.toString)) Engine.init val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val rawData = load(validationData, validationLabel) val iter = rawData.iterator val sampleIter = GreyImgToSample()( GreyImgNormalizer(trainMean, trainStd)( BytesToGreyImg(28, 28)(iter))) var samplesBuffer = ArrayBuffer[Sample[Float]]() while (sampleIter.hasNext) { val elem = sampleIter.next().clone() samplesBuffer += elem } val samples = samplesBuffer.toArray val model = Module.load[Float](param.model) val localPredictor = LocalPredictor(model) val result = localPredictor.predict(samples) val result_class = localPredictor.predictClass(samples) result_class.foreach(r => println(s"${r}")) } } }
Example 24
Source File: BatchSampler.scala From BigDL with Apache License 2.0 | 5 votes |
```scala
package com.intel.analytics.bigdl.transform.vision.image.label.roi

import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.transform.vision.image.util.{BboxUtil, BoundingBox}
import com.intel.analytics.bigdl.utils.RandomGenerator._

import scala.collection.mutable.ArrayBuffer

  def generateBatchSamples(label: RoiLabel, batchSamplers: Array[BatchSampler],
    sampledBoxes: ArrayBuffer[BoundingBox]): Unit = {
    sampledBoxes.clear()
    var i = 0
    val unitBox = BoundingBox(0, 0, 1, 1)
    while (i < batchSamplers.length) {
      batchSamplers(i).sample(unitBox, label, sampledBoxes)
      i += 1
    }
  }
}
```
Example 25
Source File: RandomSampler.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image.label.roi import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature} import com.intel.analytics.bigdl.transform.vision.image.augmentation.Crop import com.intel.analytics.bigdl.transform.vision.image.util.{BoundingBox} import com.intel.analytics.bigdl.utils.RandomGenerator._ import org.opencv.core.Mat import scala.collection.mutable.ArrayBuffer class RandomSampler extends Crop { // random cropping samplers val batchSamplers = Array( new BatchSampler(maxTrials = 1), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.1)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.3)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.5)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.7)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.9)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, maxOverlap = Some(1.0))) def generateRoi(feature: ImageFeature): BoundingBox = { val roiLabel = feature(ImageFeature.label).asInstanceOf[RoiLabel] val boxesBuffer = new ArrayBuffer[BoundingBox]() BatchSampler.generateBatchSamples(roiLabel, batchSamplers, boxesBuffer) // randomly pick up one as input data if (boxesBuffer.nonEmpty) { // Randomly pick a sampled bbox and crop the expand_datum. val index = (RNG.uniform(0, 1) * boxesBuffer.length).toInt boxesBuffer(index) } else { BoundingBox(0, 0, 1, 1) } } } object RandomSampler { def apply(): FeatureTransformer = { new RandomSampler() -> RoiProject() } }
Example 26
Source File: RoiTransformer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image.label.roi import com.intel.analytics.bigdl.transform.vision.image.util.{BboxUtil, BoundingBox} import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature} import scala.collection.mutable.ArrayBuffer case class RoiProject(needMeetCenterConstraint: Boolean = true) extends FeatureTransformer { val transformedAnnot = new ArrayBuffer[BoundingBox]() override def transformMat(feature: ImageFeature): Unit = { val imageBoundary = feature[BoundingBox](ImageFeature.boundingBox) if (!imageBoundary.normalized) { imageBoundary.scaleBox(1.0f / feature.getHeight(), 1f / feature.getWidth(), imageBoundary) } val target = feature[RoiLabel](ImageFeature.label) transformedAnnot.clear() // Transform the annotation according to bounding box. var i = 1 while (i <= target.size()) { val gtBoxes = BoundingBox(target.bboxes.valueAt(i, 1), target.bboxes.valueAt(i, 2), target.bboxes.valueAt(i, 3), target.bboxes.valueAt(i, 4)) if (!needMeetCenterConstraint || imageBoundary.meetEmitCenterConstraint(gtBoxes)) { val transformedBox = new BoundingBox() if (imageBoundary.projectBbox(gtBoxes, transformedBox)) { transformedBox.setLabel(target.classes.valueAt(1, i)) transformedBox.setDifficult(target.classes.valueAt(2, i)) transformedAnnot.append(transformedBox) } } i += 1 } // write the transformed annotation back to target target.bboxes.resize(transformedAnnot.length, 4) target.classes.resize(2, transformedAnnot.length) i = 1 while (i <= transformedAnnot.length) { target.bboxes.setValue(i, 1, transformedAnnot(i - 1).x1) target.bboxes.setValue(i, 2, transformedAnnot(i - 1).y1) target.bboxes.setValue(i, 3, transformedAnnot(i - 1).x2) target.bboxes.setValue(i, 4, transformedAnnot(i - 1).y2) target.classes.setValue(1, i, transformedAnnot(i - 1).label) target.classes.setValue(2, i, transformedAnnot(i - 1).difficult) i += 1 } } }
Example 27
Source File: Mean.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.tf.loaders import java.nio.ByteOrder import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.nn.Sequential import com.intel.analytics.bigdl.nn.tf.Mean import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.tf.Context import org.tensorflow.framework.{DataType, NodeDef} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Mean extends TensorflowOpsLoader { import Utils._ override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder , context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = { val attr = nodeDef.getAttrMap val dataType = getType(attr, "T") val squeeze = !getBoolean(attr, "keep_dims") val dt = dataType match { case DataType.DT_INT8 => "Int" case DataType.DT_INT16 => "Int" case DataType.DT_UINT8 => "Int" case DataType.DT_UINT16 => "Int" case DataType.DT_INT32 => "Int" case DataType.DT_INT64 => "Long" case DataType.DT_FLOAT => "Float" case DataType.DT_DOUBLE => "Double" case _ => throw new UnsupportedOperationException("Data Type: " + dataType + " is not Unsupported yet.") } new MeanLoadTF[T](dt, squeeze) } } class MeanLoadTF[T: ClassTag](val dataType: String, val squeeze: Boolean)(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) { override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = { val dims = tensorArrays(0).asInstanceOf[Tensor[Int]] val dim = ArrayBuffer[Int]() val mean = Sequential[T]() for (i <- 1 to dims.size(1)) { dim += dims.valueAt(i) + 1 } dataType match { case "Int" => dim.foreach(i => mean.add(Mean[T, Int](i, squeeze = squeeze))) case "Long" => dim.foreach(i => mean.add(Mean[T, Long](i, squeeze = squeeze))) case "Float" => dim.foreach(i => mean.add(Mean[T, Float](i, squeeze = squeeze))) case "Double" => dim.foreach(i => mean.add(Mean[T, Double](i, squeeze = squeeze))) case _ => throw new UnsupportedOperationException("Data Type: " + dataType + " is not Unsupported yet.") } mean } }
Example 28
Source File: Transpose.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.tf.loaders import java.nio.ByteOrder import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.nn.{Contiguous, Sequential, Transpose => TransposeLayer} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.tf.Context import org.tensorflow.framework.NodeDef import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Transpose extends TensorflowOpsLoader { import Utils._ override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder , context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = { new TransposeLoadTF[T]() } } object TransposeLoadTF { def permToPair(perm: Array[Int]): Array[(Int, Int)] = { val numToRank = perm.zipWithIndex.toMap val arr = perm.indices.toArray val pairs = ArrayBuffer[(Int, Int)]() def sort(arr: Array[Int], low: Int, high: Int): Unit = { var i = low var j = high val pivot = arr(low + (high - low)/2) while (i <= j) { while (arr(i) < pivot) i += 1 while (arr(j) > pivot) j -= 1 if (i <= j) { exchangeNumbers(arr, i, j) i += 1 j -= 1 } } if (low < j) sort(arr, low, j) if (i < high) sort(arr, i, high) } def exchangeNumbers(arr: Array[Int], i: Int, j: Int): Unit = { val temp = arr(i) arr(i) = arr(j) arr(j) = temp pairs += ((i, j)) } sort(arr.map(numToRank), 0, arr.length-1) pairs.filter(pair => pair._1 != pair._2).toArray } } class TransposeLoadTF[T: ClassTag]()(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) { import TransposeLoadTF._ override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = { val perm = tensorArrays(0).asInstanceOf[Tensor[Int]].storage().array() val paris = permToPair(perm) val layer = Sequential() layer.add(TransposeLayer[T](paris.map(x => (x._1 + 1, x._2 + 1)))) layer.add(Contiguous()) layer } }
Example 29
package com.intel.analytics.bigdl.utils.tf.loaders import java.nio.ByteOrder import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.nn.{Padding, Sequential} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.tf.{Context, TFUtils} import org.tensorflow.framework.NodeDef import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Pad extends TensorflowOpsLoader { import Utils._ override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder, context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = { new PadLoadTF[T]() } } class PadLoadTF[T: ClassTag]()(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) { override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = { val paddings = tensorArrays(0).asInstanceOf[Tensor[Int]] val pad = ArrayBuffer[Int]() val padding = Sequential[T]() for(dim <- 1 to paddings.size(1)) { if (paddings.valueAt(dim, 1) != 0 || paddings.valueAt(dim, 2) != 0 ) { if (paddings(Array(dim, 1)) != 0) { padding.add(Padding[T](dim, -paddings.valueAt(dim, 1), 4)) } if (paddings(Array(dim, 2)) != 0) { padding.add(Padding[T](dim, paddings.valueAt(dim, 2), 4)) } } } padding } }
Example 30
Source File: IRConverter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.intermediate import com.intel.analytics.bigdl.nn.Graph import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.nn.mkldnn._ import com.intel.analytics.bigdl.tensor.{FloatType, Tensor} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.{Module, utils} import com.intel.analytics.bigdl.utils.{Engine, MklBlas, MklDnn, Node} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag private[bigdl] class IRConverter[T: ClassTag](IRgraph: IRGraph[T])(implicit ev: TensorNumeric[T]) { private val allNodes = new ArrayBuffer[Node[IRElement[T]]] private val irInputs = IRgraph.inputs.toArray private val irOutputs = IRgraph.outputs.toArray init() private def init() : Unit = { getNodes(irInputs, allNodes) // reminder: some output nodes may not be searched from inputs irOutputs.foreach(node => { if (!allNodes.contains(node)) allNodes.append(node) }) } private def getNodes(inputs: Seq[Node[IRElement[T]]], nodesBuffer: ArrayBuffer[Node[IRElement[T]]]): Unit = { if (inputs.length == 0) return inputs.foreach(node => { if (!nodesBuffer.contains(node)) { nodesBuffer.append(node) getNodes(node.nextNodes, nodesBuffer) } }) } def toGraph() : Graph[T] = { if (utils.Engine.getEngineType() == MklBlas) { require(IRToBlas[T].convertingCheck(allNodes.toArray), "IR graph can not be converted to Blas layer") toBlasGraph() } else if (utils.Engine.getEngineType() == MklDnn) { require(ev.getType() == FloatType, "Mkldnn engine only supports float data") require(IRToDnn[Float].convertingCheck( allNodes.toArray.asInstanceOf[Array[Node[IRElement[Float]]]]), "IR graph can not be converted to Dnn layer") toDnnGraph() } else throw new UnsupportedOperationException( s"Only support engineType mkldnn/mklblas, but get ${Engine.getEngineType()}") } private def toDnnGraph(): Graph[T] = { val nodeMap = IRToDnn[Float].convert( allNodes.toArray.asInstanceOf[Array[Node[IRElement[Float]]]]) val inputs = irInputs.map( n => nodeMap.get(n.asInstanceOf[Node[IRElement[Float]]]).get) val outputs = irOutputs.map( n => nodeMap.get(n.asInstanceOf[Node[IRElement[Float]]]).get) // add input node for dnn graph val realInputs = inputs.map(n => { val node = new Node[Module[Float]](new InputWrapper()) n.from(node) node }) // add output node for graph val realOutputs = outputs.zipWithIndex.map { case (model: Node[Module[Float]], index: Int) => val node = if (model.element.isInstanceOf[BlasWrapper]) { model } else { model.add(new Node[Module[Float]](Output(IRgraph.outputFormats(index)))) } node } DnnGraph(realInputs, realOutputs, IRgraph.variables.asInstanceOf[Option[(Array[Tensor[Float]], Array[Tensor[Float]])]], IRgraph.generateBackward).asInstanceOf[Graph[T]] } private def toBlasGraph(): Graph[T] = { val nodeMap = IRToBlas[T].convert(allNodes.toArray) val inputs = irInputs.map(n => nodeMap.get(n).get) val outputs = irOutputs.map(n => nodeMap.get(n).get) Graph.dynamic(inputs, outputs, IRgraph.variables, IRgraph.generateBackward) } }
Example 31
Source File: FileReader.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.visualization.tensorboard import java.io.{BufferedInputStream} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.tensorflow.util.Event import scala.collection.mutable.ArrayBuffer import scala.util.matching.Regex private[bigdl] object FileReader { val fileNameRegex = """bigdl.tfevents.*""".r def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = { require(fs.isFile(file), s"FileReader: ${file} should be a file") val bis = new BufferedInputStream(fs.open(file)) val longBuffer = new Array[Byte](8) val crcBuffer = new Array[Byte](4) val bf = new ArrayBuffer[(Long, Float, Double)] while (bis.read(longBuffer) > 0) { val l = ByteBuffer.wrap(longBuffer.reverse).getLong() bis.read(crcBuffer) // TODO: checksum // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt() val eventBuffer = new Array[Byte](l.toInt) bis.read(eventBuffer) val e = Event.parseFrom(eventBuffer) if (e.getSummary.getValueCount == 1 && tag.equals(e.getSummary.getValue(0).getTag())) { bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime)) } bis.read(crcBuffer) // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt() } bis.close() bf.toArray.sortWith(_._1 < _._1) } }
Example 32
Source File: Permute.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.keras import com.intel.analytics.bigdl.nn.Transpose import com.intel.analytics.bigdl.nn.abstractnn.AbstractModule import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Shape import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Permute[T: ClassTag]( val dims: Array[Int], val inputShape: Shape = null)(implicit ev: TensorNumeric[T]) extends KerasLayer[Tensor[T], Tensor[T], T](KerasLayer.addBatch(inputShape)) { private def permToPair(perm: Array[Int]): Array[(Int, Int)] = { val numToRank = perm.zipWithIndex.toMap val arr = perm.indices.toArray val pairs = ArrayBuffer[(Int, Int)]() def sort(arr: Array[Int], low: Int, high: Int): Unit = { var i = low var j = high val pivot = arr(low + (high - low)/2) while (i <= j) { while (arr(i) < pivot) i += 1 while (arr(j) > pivot) j -= 1 if (i <= j) { exchangeNumbers(arr, i, j) i += 1 j -= 1 } } if (low < j) sort(arr, low, j) if (i < high) sort(arr, i, high) } def exchangeNumbers(arr: Array[Int], i: Int, j: Int): Unit = { val temp = arr(i) arr(i) = arr(j) arr(j) = temp pairs += ((i, j)) } sort(arr.map(numToRank), 0, arr.length-1) pairs.filter(pair => pair._1 != pair._2).toArray } override def computeOutputShape(inputShape: Shape): Shape = { val input = inputShape.toSingle().toArray val outputShape = input.clone() var i = 0 while (i < dims.length) { outputShape(i + 1) = input(dims(i)) i += 1 } Shape(outputShape) } override def doBuild(inputShape: Shape): AbstractModule[Tensor[T], Tensor[T], T] = { val swaps = permToPair(dims.map(x => x - 1)).map(pair => (pair._1 + 2, pair._2 + 2)) val layer = Transpose(swaps) layer.asInstanceOf[AbstractModule[Tensor[T], Tensor[T], T]] } } object Permute { def apply[@specialized(Float, Double) T: ClassTag]( dims: Array[Int], inputShape: Shape = null)(implicit ev: TensorNumeric[T]): Permute[T] = { new Permute[T](dims, inputShape) } }
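permToPair sorts the permutation and records every exchange it performs in an ArrayBuffer[(Int, Int)], so the buffer ends up holding exactly the transpositions that Transpose needs. A stripped-down sketch of that record-the-swaps idea; a selection sort is used here for brevity, whereas the layer uses a quicksort, but the bookkeeping is the same:

import scala.collection.mutable.ArrayBuffer

// Sort an array in place while logging every swap as an (i, j) pair.
def sortRecordingSwaps(arr: Array[Int]): Array[(Int, Int)] = {
  val swaps = ArrayBuffer[(Int, Int)]()
  def swap(i: Int, j: Int): Unit = {
    val tmp = arr(i); arr(i) = arr(j); arr(j) = tmp
    swaps += ((i, j))
  }
  var i = 0
  while (i < arr.length) {
    var minIdx = i
    var j = i + 1
    while (j < arr.length) {
      if (arr(j) < arr(minIdx)) minIdx = j
      j += 1
    }
    if (minIdx != i) swap(i, minIdx)
    i += 1
  }
  swaps.toArray
}

val transpositions = sortRecordingSwaps(Array(2, 0, 1))
// transpositions: Array((0,1), (1,2)); applying these swaps in order sorts the array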
Example 33
Source File: FrameManager.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn import java.util.concurrent.atomic.AtomicInteger import com.intel.analytics.bigdl.nn.Graph.ModuleNode import com.intel.analytics.bigdl.nn.tf.{Exit, MergeOps, NextIteration} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class Frame[T] private[FrameManager] ( val name: String, val parent: Option[Frame[T]] ) { // Sync all next iteration nodes execution private[bigdl] var barrier: AtomicInteger = new AtomicInteger(0) // User can use NextIteration to sync execution. This is a list of those types of nodes private[bigdl] val waitingNodes: ArrayBuffer[ModuleNode[T]] = new ArrayBuffer[ModuleNode[T]]() // Nodes should be refreshed in an iteration of the frame private[bigdl] val nodes: ArrayBuffer[ModuleNode[T]] = new ArrayBuffer[ModuleNode[T]]() }
Example 34
Source File: TimeDistributedCriterion.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn import com.intel.analytics.bigdl.nn.abstractnn.TensorCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Engine import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future import scala.reflect.ClassTag require(input.size(dimension) == target.size(dimension), s"target should have as many elements as input, " + s"input ${input.size(dimension)}, target ${target.size(dimension)}") gradInput.resizeAs(input).zero() val nstep = input.size(dimension) var i = 0 while (i < nstep) { val _i = i + 1 results(i) = Engine.model.invoke(() => { fInput = input.select(dimension, _i) fTarget = target.select(dimension, _i) _gradInput = gradInput.select(dimension, _i) _gradInput.copy(cells(_i - 1).updateGradInput(fInput, fTarget).toTensor[T]) if (sizeAverage) { _gradInput = _gradInput.div(ev.fromType[Int](nstep)) } }) i += 1 } Engine.model.sync(results) gradInput } override def canEqual(other: Any): Boolean = other.isInstanceOf[TimeDistributedCriterion[T]] } object TimeDistributedCriterion { def apply[@specialized(Float, Double) T: ClassTag]( critrn: TensorCriterion[T] = null, sizeAverage: Boolean = false, dimension: Int = 2) (implicit ev: TensorNumeric[T]) : TimeDistributedCriterion[T] = { new TimeDistributedCriterion[T](critrn, sizeAverage, dimension) } }
Example 35
Source File: ExpandSize.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn import com.intel.analytics.bigdl.nn.abstractnn.AbstractModule import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class ExpandSize[T: ClassTag](targetSizes: Array[Int]) (implicit ev: TensorNumeric[T]) extends AbstractModule[Tensor[T], Tensor[T], T] { override def updateOutput(input: Tensor[T]): Tensor[T] = { require(targetSizes.length == input.dim(), s"the number of dimensions provided must equal ${input.dim()}") val tensorDim = input.dim() val tensorStride = input.stride() val tensorSize = input.size() var i = 0 while (i < tensorDim) { if (targetSizes(i) != -1) { if (tensorSize(i) == 1) { tensorSize(i) = targetSizes(i) tensorStride(i) = 0 } else if (tensorSize(i) != targetSizes(i)) { throw new UnsupportedOperationException( "incorrect size: only supporting singleton expansion (size=1)") } } i += 1 } output.set(input.storage(), input.storageOffset(), tensorSize, tensorStride) output } override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = { val tensorDim = input.dim() val tensorSize = input.size() gradInput = Tensor[T](tensorSize) val expandDim = new ArrayBuffer[Int]() var i = 0 while (i < tensorDim) { if (targetSizes(i) != -1) { if (tensorSize(i) == 1 && targetSizes(i) != 1) { expandDim.append(i + 1) } } i += 1 } i = expandDim.size - 1 val sizes = gradOutput.size() var _gradOutput = gradOutput while (i >= 0) { var start = 1 sizes(expandDim(i) - 1) = 1 val _gradInput = Tensor[T](sizes) while (start <= gradOutput.size(expandDim(i))) { val x = _gradOutput.narrow(expandDim(i), start, 1) _gradInput.add(x) start += 1 } _gradOutput = _gradInput i -= 1 } gradInput = _gradOutput gradInput } override def toString: String = s"ExpandSize" } object ExpandSize { def apply[@specialized(Float, Double) T: ClassTag](targetSizes: Array[Int]) (implicit ev: TensorNumeric[T]) : ExpandSize[T] = { new ExpandSize[T](targetSizes) } }
Example 36
Source File: Utils.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.quantized import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity, TensorModule} import com.intel.analytics.bigdl.nn.tf.WithoutInput import com.intel.analytics.bigdl.nn.{Cell, Container, Graph, Input, TimeDistributed, Linear => NNLinear, SpatialConvolution => NNConv, SpatialDilatedConvolution => NNDilatedConv} import com.intel.analytics.bigdl.tensor.{QuantizedTensor, Tensor} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Node import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object Utils { type ModuleNode[R] = AbstractModule[Activity, Activity, R] type SeqNodes[R] = Seq[Node[ModuleNode[R]]] type ArrayNodes[R] = Array[Node[ModuleNode[R]]] type ANode[R] = Node[ModuleNode[R]] type AbsModule[R] = AbstractModule[Activity, Activity, R] def reorganizeParameters[T: ClassTag](parameters: Array[Tensor[T]])( implicit ev: TensorNumeric[T]): Tensor[T] = { var length = 0 for (i <- parameters.indices) { if (!parameters(i).isInstanceOf[QuantizedTensor[T]]) { length += parameters(i).nElement() } } val result = Tensor[T](length) var offset = 0 for (i <- parameters.indices) { val parameter = parameters(i) if (!parameter.isInstanceOf[QuantizedTensor[T]]) { val length = parameter.nElement() val (src, srcOffset) = (parameter.storage().array(), parameter.storageOffset() - 1) val (dst, dstOffset) = (result.storage().array(), offset) val (size, stride) = (parameter.size(), parameter.stride()) System.arraycopy(src, srcOffset, dst, dstOffset, length) parameter.set(result.storage(), offset + 1, size, stride) offset += length } } result } }
Example 37
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.nn.abstractnn.Activity import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Any[T: ClassTag](keepDim : Boolean = false, startFromZero : Boolean = false) (implicit ev: TensorNumeric[T]) extends Operation[Table, Tensor[Boolean], T] { output = Tensor[Boolean]() private var buffer = Tensor[Boolean]() override def updateOutput(input: Table): Tensor[Boolean] = { val data = input[Tensor[Boolean]](1) val indices = input[Tensor[Int]](2) require(indices.nDimension() == 1 || indices.isScalar, "indices must be 1D tensor or scala") output.resizeAs(data) buffer.resizeAs(data).copy(data) val reduceDims = new ArrayBuffer[Int]() val size = output.size() if (indices.isScalar) { val dim = if (indices.value() < 0) { data.nDimension() + indices.value() + 1 } else if (startFromZero) { indices.value() + 1 } else { indices.value() } if (size(dim - 1) != 1) { size(dim - 1) = 1 reduceDims += dim output.resize(size) buffer.reduce(dim, output, (a, b) => a || b) buffer.resizeAs(output).copy(output) } } else { var i = 1 while (i <= indices.size(1)) { val dim = if (indices.valueAt(i) < 0) { data.nDimension() + indices.valueAt(i) + 1 } else if (startFromZero) { indices.valueAt(i) + 1 } else { indices.valueAt(i) } if (size(dim - 1) != 1) { size(dim - 1) = 1 reduceDims += dim output.resize(size) buffer.reduce(dim, output, (a, b) => a || b) buffer.resizeAs(output).copy(output) } i += 1 } } if (!keepDim) { val sizeBuffer = new ArrayBuffer[Int]() var i = 1 while (i <= data.nDimension()) { if (!reduceDims.contains(i)) sizeBuffer.append(data.size(i)) i += 1 } output.resize(sizeBuffer.toArray) } output } override def clearState(): this.type = { super.clearState() buffer.set() this } } object Any { def apply[T: ClassTag](keepDim: Boolean = false, startFromZero : Boolean = false) (implicit ev: TensorNumeric[T]): Any[T] = new Any[T](keepDim, startFromZero) }
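When keepDim is false, the operation rebuilds the output shape by walking the input dimensions and appending only the ones that were not reduced to a fresh ArrayBuffer; the All operation in Example 42 uses the identical bookkeeping with && instead of ||. That shape logic in isolation, separated from the tensor code:

import scala.collection.mutable.ArrayBuffer

// Dimensions are 1-based here, matching the convention in the example.
def shapeAfterReduce(inputSize: Array[Int], reduceDims: Seq[Int]): Array[Int] = {
  val sizeBuffer = new ArrayBuffer[Int]()
  var i = 1
  while (i <= inputSize.length) {
    if (!reduceDims.contains(i)) sizeBuffer.append(inputSize(i - 1))
    i += 1
  }
  sizeBuffer.toArray
}

shapeAfterReduce(Array(2, 3, 4), reduceDims = Seq(2)) // Array(2, 4)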
Example 38
Source File: CategoricalColVocaList.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.HashFunc import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class CategoricalColVocaList[T: ClassTag]( val vocaList: Array[String], val strDelimiter: String = ",", val isSetDefault: Boolean = false, val numOovBuckets: Int = 0 ) (implicit ev: TensorNumeric[T]) extends Operation[Tensor[String], Tensor[Int], T]{ private val vocaLen = vocaList.length private val vocaMap = vocaList.zipWithIndex.toMap require(numOovBuckets >= 0, "numOovBuckets is a negative integer") require(!(isSetDefault && numOovBuckets != 0), "defaultValue and numOovBuckets are both specified") require(vocaLen > 0, "the vocabulary list is empty") require(vocaLen == vocaMap.size, "the vocabulary list contains duplicate keys") output = Tensor[Int]() override def updateOutput(input: Tensor[String]): Tensor[Int] = { input.squeeze() val rows = input.size(dim = 1) val cols = if (numOovBuckets==0) { if (isSetDefault) vocaLen + 1 else vocaLen } else { vocaLen + numOovBuckets } val shape = Array(rows, cols) val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[Int]() var i = 1 while (i <= rows) { var feaStrArr = input.valueAt(i).split(strDelimiter) if (!isSetDefault && numOovBuckets == 0) { feaStrArr = feaStrArr.filter(x => vocaMap.contains(x)) } var j = 0 while (j < feaStrArr.length) { val mapVal = numOovBuckets==0 match { case true => vocaMap.getOrElse(feaStrArr(j), vocaMap.size) case false => vocaMap.getOrElse(feaStrArr(j), HashFunc.stringHashBucket32(feaStrArr(j), numOovBuckets) + vocaLen) } indices0 += i-1 indices1 += j values += mapVal j += 1 } i += 1 } val indices = Array(indices0.toArray, indices1.toArray) output = Tensor.sparse(indices, values.toArray, shape) output } } object CategoricalColVocaList { def apply[T: ClassTag]( vocaList: Array[String], strDelimiter: String = ",", isSetDefault: Boolean = false, numOovBuckets: Int = 0 ) (implicit ev: TensorNumeric[T]): CategoricalColVocaList[T] = new CategoricalColVocaList[T]( vocaList = vocaList, strDelimiter = strDelimiter, isSetDefault = isSetDefault, numOovBuckets = numOovBuckets ) }
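The operation builds its sparse output by growing three parallel ArrayBuffers (row indices, column indices and values) and only materialising arrays at the end. The same pattern in isolation, with a plain Map standing in for the vocabulary lookup and a small case class standing in for BigDL's Tensor.sparse; both are assumptions of this sketch:

import scala.collection.mutable.ArrayBuffer

// Illustrative container for COO-style sparse data.
case class SparseCoo(indices: Array[Array[Int]], values: Array[Int], shape: Array[Int])

val vocaMap = Map("a" -> 0, "b" -> 1, "c" -> 2)
val rowsOfFeatures = Seq("a,c", "b", "c,a,b") // one comma-separated string per row

val indices0 = new ArrayBuffer[Int]() // row index of each non-zero entry
val indices1 = new ArrayBuffer[Int]() // column position within the row
val values = new ArrayBuffer[Int]()   // looked-up vocabulary id

rowsOfFeatures.zipWithIndex.foreach { case (row, i) =>
  row.split(",").zipWithIndex.foreach { case (feature, j) =>
    indices0 += i
    indices1 += j
    values += vocaMap(feature)
  }
}

val sparse = SparseCoo(
  indices = Array(indices0.toArray, indices1.toArray),
  values = values.toArray,
  shape = Array(rowsOfFeatures.length, vocaMap.size))
// sparse.values: Array(0, 2, 1, 2, 0, 1)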
Example 39
Source File: CategoricalColHashBucket.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.util.hashing.MurmurHash3 class CategoricalColHashBucket[T: ClassTag]( val hashBucketSize: Int, val strDelimiter: String = ",", val isSparse: Boolean = true )(implicit ev: TensorNumeric[T]) extends Operation[Tensor[String], Tensor[Int], T] { output = Tensor[Int]() override def updateOutput(input: Tensor[String]): Tensor[Int] = { val rows = input.size(dim = 1) val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[Int]() var i = 1 var max_fea_len = 0 while(i <= rows) { val feaStrArr = input.valueAt(i, 1).split(strDelimiter) max_fea_len = math.max(max_fea_len, feaStrArr.length) var j = 0 while(j < feaStrArr.length) { val hashVal = MurmurHash3.stringHash(feaStrArr(j)) % hashBucketSize match { case v if v < 0 => v + hashBucketSize case v => v } indices0 += i-1 indices1 += j values += hashVal j += 1 } i += 1 } val indices = Array(indices0.toArray, indices1.toArray) val shape = Array(rows, max_fea_len) output = isSparse match { case true => Tensor.sparse(indices, values.toArray, shape) case false => Tensor.dense(Tensor.sparse(indices, values.toArray, shape)) } output } } object CategoricalColHashBucket{ def apply[T: ClassTag]( hashBucketSize: Int, strDelimiter: String = ",", isSparse: Boolean = true) (implicit ev: TensorNumeric[T]) : CategoricalColHashBucket[T] = new CategoricalColHashBucket[T]( hashBucketSize = hashBucketSize, strDelimiter = strDelimiter, isSparse = isSparse ) }
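MurmurHash3.stringHash can return a negative value, so the example normalises the remainder back into [0, hashBucketSize) with a pattern match before appending it to the values buffer. That normalisation on its own:

import scala.util.hashing.MurmurHash3

def hashBucket(feature: String, hashBucketSize: Int): Int =
  MurmurHash3.stringHash(feature) % hashBucketSize match {
    case v if v < 0 => v + hashBucketSize // fold negative remainders back into range
    case v => v
  }

hashBucket("some-category", 100) // always in 0 until 100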
Example 40
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import com.intel.analytics.bigdl.nn.{Sum => SumLayer} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Sum[T: ClassTag, D: ClassTag](val keepDims: Boolean, val startFromZero: Boolean = false) (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]) extends Operation[Table, Tensor[D], T] { private val sum: SumLayer[D] = SumLayer[D](squeeze = !keepDims) output = Tensor[D]() override def updateOutput(input: Table): Tensor[D] = { val data = input[Tensor[D]](1) val dims = input[Tensor[Int]](2) output.resizeAs(data).copy(data) val sumDims = if (dims.isEmpty) { return output } else if (dims.isScalar) { Array(if (startFromZero) dims.value() + 1 else dims.value()) } else { require(dims.nDimension() == 1, s"Only accept 1D as dims, but now is ${dims.nDimension()}") val buffer = new ArrayBuffer[Int]() dims.apply1(a => { buffer.append(if (startFromZero) a + 1 else a) a }) buffer.toArray.sortWith(_ > _) } var i = 0 while(i < sumDims.length) { sum.changeSumDims(sumDims(i)) val tmp = sum.updateOutput(output) output.resizeAs(tmp).copy(tmp) i += 1 } output } override def getClassTagNumerics() : (Array[ClassTag[_]], Array[TensorNumeric[_]]) = { (Array[ClassTag[_]](scala.reflect.classTag[T], scala.reflect.classTag[D]), Array[TensorNumeric[_]](ev, ev2)) } } object Sum { def apply[T: ClassTag, D: ClassTag](keepDims: Boolean = false, startFromZero: Boolean = false) (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]): Sum[T, D] = new Sum(keepDims, startFromZero) }
Example 41
Source File: Kv2Tensor.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.nn.abstractnn.Activity import com.intel.analytics.bigdl.tensor._ import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Kv2Tensor[T: ClassTag, D: ClassTag]( val kvDelimiter: String, val itemDelimiter: String, val transType: Int )(implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]) extends Operation[Table, Tensor[D], T]{ output = Activity.allocate[Tensor[D], D]() override def updateOutput(input: Table): Tensor[D] = { val kvTensor = input[Tensor[String]](1) val feaLen = input[Tensor[Int]](2).value() val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[D]() val rows = kvTensor.size(dim = 1) val shape = Array(rows, feaLen) var i = 1 while(i<=rows) { val kvFeaString = kvTensor.select(1, i).valueAt(1) kvFeaString.split(kvDelimiter).foreach { kv => indices0 += i-1 indices1 += kv.split(itemDelimiter)(0).toInt ev2.getType() match { case DoubleType => values += kv.split(itemDelimiter)(1).toDouble.asInstanceOf[D] case FloatType => values += kv.split(itemDelimiter)(1).toFloat.asInstanceOf[D] case t => throw new NotImplementedError(s"$t is not supported") } } i += 1 } val indices = Array(indices0.toArray, indices1.toArray) val resTensor = transType match { case 0 => Tensor.dense(Tensor.sparse(indices, values.toArray, shape)) case 1 => Tensor.sparse(indices, values.toArray, shape) } output = resTensor output } override def getClassTagNumerics() : (Array[ClassTag[_]], Array[TensorNumeric[_]]) = { (Array[ClassTag[_]](scala.reflect.classTag[T], scala.reflect.classTag[D]), Array[TensorNumeric[_]](ev, ev2)) } } object Kv2Tensor{ def apply[T: ClassTag, D: ClassTag]( kvDelimiter: String = ",", itemDelimiter: String = ":", transType: Int = 0) (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]): Kv2Tensor[T, D] = new Kv2Tensor[T, D]( kvDelimiter = kvDelimiter, itemDelimiter = itemDelimiter, transType = transType ) }
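Each row arrives as a single "key:value,key:value" string; the operation splits it twice and pushes one (row, column, value) triple per pair into three ArrayBuffers. A minimal parser for that format, independent of the tensor types; the COO result is returned as plain arrays here:

import scala.collection.mutable.ArrayBuffer

// Parse rows of "col:value" pairs into parallel COO arrays.
def parseKvRows(rows: Seq[String],
                kvDelimiter: String = ",",
                itemDelimiter: String = ":"): (Array[Int], Array[Int], Array[Double]) = {
  val rowIdx = new ArrayBuffer[Int]()
  val colIdx = new ArrayBuffer[Int]()
  val values = new ArrayBuffer[Double]()
  rows.zipWithIndex.foreach { case (row, i) =>
    row.split(kvDelimiter).foreach { kv =>
      val Array(k, v) = kv.split(itemDelimiter)
      rowIdx += i
      colIdx += k.toInt
      values += v.toDouble
    }
  }
  (rowIdx.toArray, colIdx.toArray, values.toArray)
}

val (rowIdx, colIdx, vals) = parseKvRows(Seq("0:0.1,1:0.2", "1:0.3,3:0.5"))
// rowIdx: Array(0, 0, 1, 1), colIdx: Array(0, 1, 1, 3), vals: Array(0.1, 0.2, 0.3, 0.5)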
Example 42
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.nn.abstractnn.Activity import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class All[T: ClassTag](keepDim : Boolean = false, startFromZero : Boolean = false) (implicit ev: TensorNumeric[T]) extends Operation[Table, Tensor[Boolean], T] { output = Tensor[Boolean]() private var buffer = Tensor[Boolean]() override def updateOutput(input: Table): Tensor[Boolean] = { val data = input[Tensor[Boolean]](1) val indices = input[Tensor[Int]](2) require(indices.nDimension() == 1 || indices.isScalar, "indices must be 1D tensor or scala") output.resizeAs(data) buffer.resizeAs(data).copy(data) val reduceDims = new ArrayBuffer[Int]() val size = output.size() if (indices.isScalar) { val dim = if (indices.value() < 0) { data.nDimension() + indices.value() + 1 } else if (startFromZero) { indices.value() + 1 } else { indices.value() } if (size(dim - 1) != 1) { size(dim - 1) = 1 reduceDims += dim output.resize(size) buffer.reduce(dim, output, (a, b) => a && b) buffer.resizeAs(output).copy(output) } } else { var i = 1 while (i <= indices.size(1)) { val dim = if (indices.valueAt(i) < 0) { data.nDimension() + indices.valueAt(i) + 1 } else if (startFromZero) { indices.valueAt(i) + 1 } else { indices.valueAt(i) } if (size(dim - 1) != 1) { size(dim - 1) = 1 reduceDims += dim output.resize(size) buffer.reduce(dim, output, (a, b) => a && b) buffer.resizeAs(output).copy(output) } i += 1 } } if (!keepDim) { val sizeBuffer = new ArrayBuffer[Int]() var i = 1 while (i <= data.nDimension()) { if (!reduceDims.contains(i)) sizeBuffer.append(data.size(i)) i += 1 } output.resize(sizeBuffer.toArray) } output } override def clearState(): this.type = { super.clearState() buffer.set() this } } object All { def apply[T: ClassTag](keepDim: Boolean = false, startFromZero : Boolean = false) (implicit ev: TensorNumeric[T]): All[T] = new All[T](keepDim, startFromZero) }
Example 43
Source File: ParallelTable.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn import com.intel.analytics.bigdl.nn.Graph.ModuleNode import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @SerialVersionUID(- 1197848941394786045L) class ParallelTable[T: ClassTag] (implicit ev: TensorNumeric[T]) extends DynamicContainer[Table, Table, T] { override def updateOutput(input: Table): Table = { var i = 0 while (i < input.length()) { output.update(i + 1, modules(i).forward(input(i + 1))) i += 1 } output } override def updateGradInput(input: Table, gradOutput: Table): Table = { var i = 0 while (i < input.length()) { gradInput.update(i + 1, modules(i).updateGradInput(input(i + 1), gradOutput(i + 1))) i += 1 } gradInput } override def accGradParameters(input: Table, gradOutput: Table): Unit = { var i = 0 while (i < input.length()) { modules(i).accGradParameters(input(i + 1), gradOutput(i + 1)) i += 1 } } override def backward(input: Table, gradOutput: Table): Table = { val before = System.nanoTime() var i = 0 while (i < input.length()) { gradInput.update(i + 1, modules(i).backward(input(i + 1), gradOutput(i + 1))) i += 1 } backwardTime += System.nanoTime() - before gradInput } override def getEndNodes(startNodes: Array[ModuleNode[T]]): Array[ModuleNode[T]] = { val outputs = ArrayBuffer[ModuleNode[T]]() var outputTuple: Array[ModuleNode[T]] = null require(startNodes.length == modules.length, s"ParallelTable: " + s"startNodes length ${startNodes.length} is more than modules length ${modules.length}") for (i <- 0 to modules.size - 1) { outputTuple = modules(i).getEndNodes(Array(startNodes(i))) outputs ++= outputTuple } outputs.toArray } override def toString: String = { val tab = "\t" val line = "\n" val next = " |`-> " val lastNext = " `-> " val ext = " | " val extlast = " " val last = " ... -> " var str = "nn.ParallelTable" str = str + " {" + line + tab + "input" var i = 1 while (i <= modules.length) { if (i == modules.length) { str = str + line + tab + lastNext + "(" + i + "): " + modules(i-1).toString.replace(line, line + tab + extlast) } else { str = str + line + tab + next + "(" + i + "): " + modules(i-1).toString.replace(line, line + tab + ext) } i += 1 } str = str + line + tab + last + "output" str = str + line + "}" str } } object ParallelTable { def apply[@specialized(Float, Double) T: ClassTag]() (implicit ev: TensorNumeric[T]) : ParallelTable[T] = { new ParallelTable[T]() } }
Example 44
Source File: MultiCriterion.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn import com.intel.analytics.bigdl.nn.abstractnn.{Activity, AbstractCriterion} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.T import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @SerialVersionUID(- 8679064077837483164L) class MultiCriterion[@specialized(Float, Double) T: ClassTag] (implicit ev: TensorNumeric[T]) extends AbstractCriterion[Activity, Activity, T] { private val weights = new ArrayBuffer[Double] private val criterions = T() def add(criterion: AbstractCriterion[Activity, Activity, T], weight: Double = 1): Unit = { criterions.insert(criterions.length() + 1, criterion) weights.append(weight) } override def updateOutput(input: Activity, target: Activity): T = { var i = 1 while (i <= criterions.length) { output = ev.plus(output, ev.times(ev.fromType(weights(i-1)), criterions[AbstractCriterion[Activity, Activity, T]](i).updateOutput(input, target))) i +=1 } output } override def updateGradInput(input: Activity, target: Activity): Activity = { gradInput = Utils.recursiveResizeAs[T](gradInput, input) Utils.recursiveFill[T](gradInput, 0) var i = 1 while (i <= criterions.length) { Utils.recursiveAdd(gradInput, weights(i - 1), criterions[AbstractCriterion[Activity, Activity, T]](i).updateGradInput(input, target)) i += 1 } gradInput } override def canEqual(other: Any): Boolean = other.isInstanceOf[MultiCriterion[T]] override def equals(other: Any): Boolean = other match { case that: MultiCriterion[T] => super.equals(that) && (that canEqual this) && weights == that.weights case _ => false } override def hashCode(): Int = { def getHashCode(a: Any): Int = if (a == null) 0 else a.hashCode() val state = Seq(super.hashCode(), weights) state.map(getHashCode).foldLeft(0)((a, b) => 31 * a + b) } override def toString(): String = { s"nn.MultiCriterion" } } object MultiCriterion { def apply[@specialized(Float, Double) T: ClassTag]() (implicit ev: TensorNumeric[T]) : MultiCriterion[T] = { new MultiCriterion[T]() } }
Example 45
Source File: Metrics.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.google.common.util.concurrent.AtomicDouble import org.apache.spark.{Accumulable, Accumulator, SparkContext} import scala.collection.mutable.{ArrayBuffer, Map} class Metrics extends Serializable { private val localMetricsMap: Map[String, LocalMetricsEntry] = Map() private val aggregateDistributeMetricsMap: Map[String, AggregateDistributeMetricsEntry] = Map() private val distributeMetricsMap: Map[String, DistributeMetricsEntry] = Map() def add(name: String, value: Double): this.type = { require(localMetricsMap.contains(name) || aggregateDistributeMetricsMap.contains(name) || distributeMetricsMap.contains(name)) if (localMetricsMap.contains(name)) { localMetricsMap(name).value.addAndGet(value) } if (aggregateDistributeMetricsMap.contains(name)) { aggregateDistributeMetricsMap(name).value += value } if (distributeMetricsMap.contains(name)) { distributeMetricsMap(name).value += value } this } def set(name: String, value: Double, parallel: Int = 1): this.type = { require(!aggregateDistributeMetricsMap.contains(name), "duplicated distribute metric") require(!distributeMetricsMap.contains(name), "duplicated distribute metric2") if (localMetricsMap.contains(name)) { localMetricsMap(name).value.set(value) localMetricsMap(name).parallel = parallel } else { localMetricsMap(name) = LocalMetricsEntry(new AtomicDouble(value), parallel) } this } def set(name: String, value: Double, sc: SparkContext, parallel: Int): this.type = { require(!localMetricsMap.contains(name), "duplicated local metric") if (aggregateDistributeMetricsMap.contains(name)) { aggregateDistributeMetricsMap(name).value.setValue(value) aggregateDistributeMetricsMap(name).parallel = parallel } else { aggregateDistributeMetricsMap(name) = AggregateDistributeMetricsEntry(sc.accumulator(value, name), parallel) } this } def set(name: String, value: ArrayBuffer[Double], sc: SparkContext): this.type = { require(!localMetricsMap.contains(name), "duplicated local metric") require(!aggregateDistributeMetricsMap.contains(name), "duplicated distribute metric") if (distributeMetricsMap.contains(name)) { distributeMetricsMap(name).value.setValue(value) } else { distributeMetricsMap(name) = DistributeMetricsEntry(sc.accumulableCollection(value)) } this } def get(name: String): (Double, Int) = { require(localMetricsMap.contains(name) || aggregateDistributeMetricsMap.contains(name)) if (localMetricsMap.contains(name)) { (localMetricsMap(name).value.get(), localMetricsMap(name).parallel) } else { (aggregateDistributeMetricsMap(name).value.value, aggregateDistributeMetricsMap(name).parallel) } } def get(name: String, number: Int): Array[Double] = { require(distributeMetricsMap.contains(name)) distributeMetricsMap(name).value.value.toArray.dropRight(number) } def summary(unit: String = "s", scale: Double = 1e9): String = { "========== Metrics Summary ==========\n" + localMetricsMap.map( entry => s"${entry._1} : ${entry._2.value.get() / entry._2.parallel / scale} $unit\n") .mkString("") + aggregateDistributeMetricsMap.map( entry => s"${entry._1} : ${entry._2.value.value / entry._2.parallel / scale} $unit\n") .mkString("") + distributeMetricsMap.map { entry => s"${entry._1} : ${entry._2.value.value.map(_ / scale).mkString(" ")} \n" }.mkString("") + "=====================================" } } private case class LocalMetricsEntry(value: AtomicDouble, var parallel: Int) private case class AggregateDistributeMetricsEntry(value: Accumulator[Double], var parallel: Int) private case class DistributeMetricsEntry(value: Accumulable[ArrayBuffer[Double], Double])
Example 46
Source File: BatchSamplerSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image.label.roi import com.intel.analytics.bigdl.tensor.{Storage, Tensor} import com.intel.analytics.bigdl.transform.vision.image.util.BoundingBox import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer class BatchSamplerSpec extends FlatSpec with Matchers { "batch sampler with no change" should "work properly" in { val sampler = new BatchSampler(maxTrials = 1) val unitBox = BoundingBox(0, 0, 1, 1) val boxes = Tensor(Storage(Array(0.582296, 0.334719, 0.673582, 0.52183, 0.596127, 0.282744, 0.670816, 0.449064, 0.936376, 0.627859, 0.961272, 0.733888, 0.896266, 0.640333, 0.923928, 0.740125).map(x => x.toFloat))).resize(4, 4) val classes = Tensor[Float](4).randn() val target = RoiLabel(classes, boxes) val sampledBoxes = new ArrayBuffer[BoundingBox]() sampler.sample(unitBox, target, sampledBoxes) sampledBoxes.length should be(1) sampledBoxes(0) should be(unitBox) } "satisfySampleConstraint with minOverlap 0.1" should "work properly" in { val boxes = Tensor(Storage(Array(0.418, 0.396396, 0.55, 0.666667, 0.438, 0.321321, 0.546, 0.561562, 0.93, 0.81982, 0.966, 0.972973, 0.872, 0.837838, 0.912, 0.981982).map(x => x.toFloat))).resize(4, 4) val classes = Tensor[Float](4).randn() val target = RoiLabel(classes, boxes) val sampledBox = BoundingBox(0.114741f, 0.248062f, 0.633665f, 0.763736f) val sampler = new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.1)) sampler.satisfySampleConstraint(sampledBox, target) should be(true) } "satisfySampleConstraint with minOverlap 0.3" should "work properly" in { val boxes = Tensor(Storage(Array(0.418, 0.396396, 0.55, 0.666667, 0.438, 0.321321, 0.546, 0.561562, 0.93, 0.81982, 0.966, 0.972973, 0.872, 0.837838, 0.912, 0.981982).map(x => x.toFloat))).resize(4, 4) val classes = Tensor[Float](4).randn() val target = RoiLabel(classes, boxes) val sampledBox = BoundingBox(0.266885f, 0.416113f, 0.678256f, 0.67208f) val sampler = new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.3)) sampler.satisfySampleConstraint(sampledBox, target) should be(true) } "batch samplers" should "work properly" in { val boxes = Tensor(Storage(Array(0.418, 0.396396, 0.55, 0.666667, 0.438, 0.321321, 0.546, 0.561562, 0.93, 0.81982, 0.966, 0.972973, 0.872, 0.837838, 0.912, 0.981982).map(x => x.toFloat))).resize(4, 4) val classes = Tensor[Float](4).randn() val target = RoiLabel(classes, boxes) val sampledBoxes = new ArrayBuffer[BoundingBox]() val batchSamplers = Array( new BatchSampler(maxTrials = 1), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.1)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.3)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.5)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.7)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, minOverlap = Some(0.9)), new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2, maxOverlap = Some(1.0))) BatchSampler.generateBatchSamples(target, batchSamplers, sampledBoxes) sampledBoxes.foreach(box => { println(box) }) } }
Example 47
Source File: BigDLSpecHelper.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils import java.io.{File => JFile} import org.apache.log4j.Logger import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer abstract class BigDLSpecHelper extends FlatSpec with Matchers with BeforeAndAfter { protected val logger = Logger.getLogger(getClass) private val tmpFiles : ArrayBuffer[JFile] = new ArrayBuffer[JFile]() protected def createTmpFile(): JFile = { val file = java.io.File.createTempFile("UnitTest", "BigDLSpecBase") logger.info(s"created file $file") tmpFiles.append(file) file } protected def getFileFolder(path: String): String = { path.substring(0, path.lastIndexOf(JFile.separator)) } protected def getFileName(path: String): String = { path.substring(path.lastIndexOf(JFile.separator) + 1) } def doAfter(): Unit = {} def doBefore(): Unit = {} before { doBefore() } after { doAfter() tmpFiles.foreach(f => { if (f.exists()) { require(f.isFile, "cannot clean folder") f.delete() logger.info(s"deleted file $f") } }) } }
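The helper keeps every temp file it hands out in an ArrayBuffer[JFile] so the after hook can sweep them up. The same bookkeeping pattern outside of ScalaTest; file creation is plain java.io here, nothing BigDL-specific:

import java.io.{File => JFile}
import scala.collection.mutable.ArrayBuffer

class TmpFileTracker {
  private val tmpFiles: ArrayBuffer[JFile] = new ArrayBuffer[JFile]()

  def createTmpFile(): JFile = {
    val file = JFile.createTempFile("UnitTest", "Example")
    tmpFiles.append(file) // remember it so it can be deleted later
    file
  }

  def cleanup(): Unit = {
    tmpFiles.foreach(f => if (f.exists()) f.delete())
    tmpFiles.clear()
  }
}

val tracker = new TmpFileTracker
val tmpFile = tracker.createTmpFile()
tracker.cleanup() // tmpFile is deleted and the buffer is emptied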
Example 48
Source File: Kv2TensorSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.ops import com.intel.analytics.bigdl.tensor.{DenseType, SparseType, Tensor} import com.intel.analytics.bigdl.utils.serializer.ModuleSerializationTest import com.intel.analytics.bigdl.utils.{T, Table} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer import scala.util.Random class Kv2TensorSpec extends FlatSpec with Matchers { protected def randDoubles(length: Int, lp: Double = 0.0, up: Double = 1.0): Array[Double] = { (1 to length).map(_ => lp + (up - lp) * Random.nextDouble()).toArray } protected def randKVMap(size: Int, numActive: Int, lp: Double = 0.0, up: Double = 1.0): Map[Int, Double] = { require(numActive <= size) val keys = Random.shuffle((0 until size).toList).take(numActive) val values = randDoubles(numActive, lp, up) keys.zip(values).toMap } val batchLen = 3 val numActive = Array(2, 3, 5) val feaLen = 8 val originData = new ArrayBuffer[String]() val originArr = new ArrayBuffer[Table]() val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[Double]() for (i <- 0 until batchLen) { val kvMap = randKVMap(feaLen, numActive(i)) val kvStr = kvMap.map(data => s"${data._1}:${data._2}").mkString(",") originData += kvStr originArr += T(kvStr) indices0 ++= ArrayBuffer.fill(numActive(i))(i) val kvArr = kvMap.toArray indices1 ++= kvArr.map(kv => kv._1) values ++= kvArr.map(kv => kv._2) } val originTable = T.array(originArr.toArray) val indices = Array(indices0.toArray, indices1.toArray) val shape = Array(batchLen, feaLen) "Kv2Tensor operation kvString to SparseTensor" should "work correctly" in { val input = T( Tensor[String](originTable), Tensor[Int](Array(feaLen), shape = Array[Int]()) ) val expectOutput = Tensor.sparse[Double]( indices = indices, values = values.toArray, shape = shape ) val output = Kv2Tensor[Double, Double](transType = 1) .forward(input) output should be(expectOutput) } "Kv2Tensor operation kvString to DenseTensor" should "work correctly" in { val input = T( Tensor[String](originTable), Tensor[Int](Array(feaLen), shape = Array[Int]()) ) val expectOutput = Tensor.dense(Tensor.sparse[Double]( indices = indices, values = values.toArray, shape = shape )) val output = Kv2Tensor[Double, Double](transType = 0) .forward(input) output should be(expectOutput) } } class Kv2TensorSerialTest extends ModuleSerializationTest { override def test(): Unit = { val kv2tensor = Kv2Tensor[Float, Float]( kvDelimiter = ",", itemDelimiter = ":", transType = 0 ).setName("kv2tensor") val input = T( Tensor[String]( T(T("0:0.1,1:0.2"), T("1:0.3,3:0.5"), T("2:0.15,4:0.25"))), Tensor[Int](Array(5), shape = Array[Int]()) ) runSerializationTest(kv2tensor, input) } }
Example 49
Source File: RMSpropSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer // @com.intel.analytics.bigdl.tags.Parallel @com.intel.analytics.bigdl.tags.Serial class RMSpropSpec extends FlatSpec with Matchers { val start = System.currentTimeMillis() "RMSprop" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val config = T("learningRate" -> 5e-4) val optm = new RMSprop[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x, config) if ((i - 1) % 1000 == 0) { fx += result._2(0) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } val spend = System.currentTimeMillis() - start println("Time Cost: " + spend + "ms") (fx.last < 1e-4) should be(true) x(Array(1)) should be(1.0 +- 0.01) x(Array(2)) should be(1.0 +- 0.01) } }
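This spec, and the Adagrad, LBFGS, Adadelta, Adamax and Adam specs that follow, all share the same measurement pattern: run the optimizer in a loop, append the loss to an ArrayBuffer[Double] every 1000 iterations, then assert on fx.last. The pattern in isolation, with a toy quadratic and hand-written gradient descent standing in for TestUtils.rosenBrock and BigDL's optimizers:

import scala.collection.mutable.ArrayBuffer

// Toy objective f(x) = (x - 3)^2, minimised by plain gradient descent.
var x = 0.0
val learningRate = 0.1
val fx = new ArrayBuffer[Double]

for (i <- 1 to 10001) {
  val loss = (x - 3.0) * (x - 3.0)
  val grad = 2.0 * (x - 3.0)
  x -= learningRate * grad
  if ((i - 1) % 1000 == 0) fx += loss // sample the loss curve sparsely
}

assert(fx.head > fx.last) // the recorded curve is decreasing
assert(fx.last < 1e-9)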
Example 50
Source File: AdagradSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.utils.{TestUtils, T} import org.scalatest.{FlatSpec, Matchers} import com.intel.analytics.bigdl.tensor.Tensor import scala.collection.mutable.ArrayBuffer @com.intel.analytics.bigdl.tags.Parallel class AdagradSpec extends FlatSpec with Matchers { "adagrad" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val config = T("learningRate" -> 1e-1) val optm = new Adagrad[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x, config) if ((i - 1) % 1000 == 0) { fx += (result._2(0)) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } (fx.last < 1e-9) should be(true) x(Array(1)) should be(1.0 +- 0.01) x(Array(2)) should be(1.0 +- 0.01) } }
Example 51
Source File: LBFGSSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.utils.{TestUtils, T} import org.scalatest.{FlatSpec, Matchers} import com.intel.analytics.bigdl.tensor.Tensor import scala.collection.mutable.ArrayBuffer @com.intel.analytics.bigdl.tags.Parallel class LBFGSSpec extends FlatSpec with Matchers { "torchLBFGS in regular batch test" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val optm = new LBFGS[Double] val result = optm.optimize(TestUtils.rosenBrock, x, T("maxIter" -> 100, "learningRate" -> 1e-1)) val fx = result._2 println() println("Rosenbrock test") println() println(s"x = $x") println("fx = ") for (i <- 1 to fx.length) { println(s"$i ${fx(i - 1)}") } println() println() fx.last < 1e-6 should be(true) } "torchLBFGS in stochastic test" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val optm = new LBFGS[Double] val fx = new ArrayBuffer[Double]() val config = T("maxIter" -> 1, "learningRate" -> 1e-1) for (i <- 1 to 100) { val result = optm.optimize(TestUtils.rosenBrock, x, config) fx.append(result._2(0)) } println() println("Rosenbrock test") println() println(s"x = $x") println("fx = ") for (i <- 1 to fx.length) { println(s"$i ${fx(i - 1)}") } println() println() fx.last < 1e-6 should be(true) } }
Example 52
Source File: AdadeltaSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer @com.intel.analytics.bigdl.tags.Parallel class AdadeltaSpec extends FlatSpec with Matchers { val start = System.currentTimeMillis() "adadelta" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val config = T("Epsilon" -> 1e-10) val optm = new Adadelta[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x, config) if ((i - 1) % 1000 == 0) { fx += result._2(0) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } val spend = System.currentTimeMillis() - start println("Time Cost: " + spend + "ms") (fx.last < 1e-4) should be(true) x(Array(1)) should be(1.0 +- 0.02) x(Array(2)) should be(1.0 +- 0.02) } }
Example 53
Source File: AdamaxSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer @com.intel.analytics.bigdl.tags.Parallel class AdamaxSpec extends FlatSpec with Matchers { val start = System.currentTimeMillis() "adamax" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val config = T() val optm = new Adamax[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x, config) if ((i - 1) % 1000 == 0) { fx += result._2(0) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } val spend = System.currentTimeMillis() - start println("Time Cost: " + spend + "ms") (fx.last < 1e-9) should be(true) x(Array(1)) should be(1.0 +- 0.01) x(Array(2)) should be(1.0 +- 0.01) } }
Example 54
Source File: AdamSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Linear, Sequential} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, RandomGenerator, T, TestUtils} import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer import scala.util.Random @com.intel.analytics.bigdl.tags.Parallel class AdamSpec extends FlatSpec with Matchers with BeforeAndAfter { before { System.setProperty("bigdl.localMode", "true") System.setProperty("spark.master", "local[2]") Engine.init } after { System.clearProperty("bigdl.localMode") System.clearProperty("spark.master") } val start = System.currentTimeMillis() "adam" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val config = T("learningRate" -> 0.002) val optm = new Adam[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x, config) if ((i - 1) % 1000 == 0) { fx += result._2(0) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } val spend = System.currentTimeMillis() - start println("Time Cost: " + spend + "ms") (fx.last < 1e-9) should be(true) x(Array(1)) should be(1.0 +- 0.01) x(Array(2)) should be(1.0 +- 0.01) } "ParallelAdam" should "perform well on rosenbrock function" in { val x = Tensor[Double](2).fill(0) val optm = new ParallelAdam[Double](learningRate = 0.002, parallelNum = 2) var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { val result = optm.optimize(TestUtils.rosenBrock, x) if ((i - 1) % 1000 == 0) { fx += result._2(0) } } println(s"x is \n$x") println("fx is") for (i <- 1 to fx.length) { println(s"${(i - 1) * 1000 + 1}, ${fx(i - 1)}") } val spend = System.currentTimeMillis() - start println("Time Cost: " + spend + "ms") (fx.last < 1e-9) should be(true) x(Array(1)) should be(1.0 +- 0.01) x(Array(2)) should be(1.0 +- 0.01) } }
Example 55
Source File: TrimmedIndependentPixelEvaluator.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.sampling.face.evaluators import scalismo.color.{RGB, RGBA} import scalismo.faces.image.{ImageBuffer, PixelImage, PixelImageDomain} import scalismo.sampling.DistributionEvaluator import scalismo.sampling.evaluators.PairEvaluator import scala.collection.mutable.ArrayBuffer def visualize(values: IndexedSeq[(Double, Int, Int)], domain: PixelImageDomain, callBack: PixelImage[Option[Double]] => Unit): Unit = { val buffer = ImageBuffer.makeConstantBuffer[Option[Double]](domain.width, domain.height, None) values.foreach { case (lh: Double, x: Int, y: Int) => buffer(x, y) = Some(lh) } callBack(buffer.toImage) } var transparencySum = 0.0 var values = ArrayBuffer[(Double, Int, Int)]() var x: Int = 0 while (x < reference.width) { var y: Int = 0 while (y < reference.height) { val smp = sample(x, y) if (smp.a > 1e-4f) { val ref = reference(x, y).toRGB val fg: Double = pixelEvaluator.logValue(ref, smp.toRGB) val bg: Double = bgEvaluator.logValue(ref) val entry = (fg - bg, x, y) values += entry } transparencySum += smp.a y += 1 } x += 1 } val nCount = math.floor(values.length.toFloat * alphaClamped).toInt if (transparencySum > 0 && nCount > 0) { //was something rendered on the image? val data = values.toIndexedSeq.sortBy { case (d: Double, x: Int, y: Int) => d } var sumTrimmed: Double = 0.0 for (i <- 0 until nCount) { sumTrimmed += data(data.size - 1 - i)._1 } if (visualizationCallback.isDefined) visualize(data.slice(data.size - 1 - nCount, data.size - 1), reference.domain, visualizationCallback.get) sumTrimmed } else { // nothing was rendered on the image! Double.NegativeInfinity } } override def toString: String = { val builder = new StringBuilder(128) builder ++= "TrimmedIndependentPixelEvaluator(" builder ++= pixelEvaluator.toString builder ++= "/" builder ++= bgEvaluator.toString builder ++= s"alpha=$alphaClamped" builder ++= ")" builder.mkString } } object TrimmedIndependentPixelEvaluator { def apply(pixelEvaluator: PairEvaluator[RGB], bgEvaluator: DistributionEvaluator[RGB], alpha: Double) = new TrimmedIndependentPixelEvaluator(pixelEvaluator, bgEvaluator, alpha, None) def apply(pixelEvaluator: PairEvaluator[RGB], bgEvaluator: DistributionEvaluator[RGB], alpha: Double, visualisationCallback: PixelImage[Option[Double]] => Unit) = new TrimmedIndependentPixelEvaluator(pixelEvaluator, bgEvaluator, alpha, Some(visualisationCallback)) }
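The evaluator keeps one (logLikelihoodRatio, x, y) triple per rendered pixel, sorts them, and sums only the best alpha fraction, which makes the likelihood robust to pixels the model cannot explain. The trimming step in isolation; the per-pixel values are fabricated here, whereas the real ones come from the foreground and background evaluators:

import scala.collection.mutable.ArrayBuffer

// Fake per-pixel log-likelihood ratios with their pixel coordinates.
val values = ArrayBuffer[(Double, Int, Int)](
  (-5.0, 0, 0), (1.5, 1, 0), (2.0, 0, 1), (-0.5, 1, 1), (3.0, 2, 1))

val alpha = 0.6                                      // keep the best 60% of pixels
val nCount = math.floor(values.length * alpha).toInt // 3 pixels survive the trim

val data = values.toIndexedSeq.sortBy { case (d, _, _) => d } // ascending by ratio
var sumTrimmed = 0.0
for (i <- 0 until nCount) {
  sumTrimmed += data(data.size - 1 - i)._1 // accumulate from the high end
}
// sumTrimmed = 3.0 + 2.0 + 1.5 = 6.5; the two poorly explained pixels are ignored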
Example 56
Source File: MorphologicalFilter.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.image.filter import scalismo.faces.image.AccessMode._ import scalismo.faces.image._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag def perPixel(x: Int, y: Int): A = { var kx = 0 var kernelPixels = new ArrayBuffer[A](width * height) while (kx < width) { val ix = x + kx - width / 2 var ky = 0 while (ky < height) { val iy = y + ky - height / 2 if (structuringElement(kx, ky)) kernelPixels += image(ix, iy) ky += 1 } kx += 1 } if (kernelPixels.nonEmpty) windowFilter(kernelPixels) else image(x, y) } if(width <= 0 || height <= 0) image else PixelImage(image.width, image.height, perPixel, Strict()) } } object MorphologicalFilter { def boxElement(size: Int): PixelImage[Boolean] = PixelImage.view(size, size, (x, y) => x >= 0 && x < size && y >= 0 && y < size) }
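For every output pixel the filter gathers the neighbourhood values selected by the structuring element into an ArrayBuffer that is pre-sized with new ArrayBuffer[A](width * height), then hands that window to windowFilter (for example min for erosion, max for dilation). A plain version of the gather step; representing the image and structuring element as functions is an assumption of this sketch:

import scala.collection.mutable.ArrayBuffer

val width = 3
val height = 3
val image: (Int, Int) => Int = (x, y) => x + 10 * y            // stand-in image access
val structuringElement: (Int, Int) => Boolean = (_, _) => true // full 3x3 box

def gatherWindow(x: Int, y: Int): ArrayBuffer[Int] = {
  // Pre-size the buffer to the maximum window size to avoid regrowth.
  val kernelPixels = new ArrayBuffer[Int](width * height)
  var kx = 0
  while (kx < width) {
    var ky = 0
    while (ky < height) {
      if (structuringElement(kx, ky))
        kernelPixels += image(x + kx - width / 2, y + ky - height / 2)
      ky += 1
    }
    kx += 1
  }
  kernelPixels
}

val eroded = gatherWindow(5, 5).min // erosion keeps the minimum over the window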
Example 57
Source File: ImmutableSelection.scala From hacktoberfest-scala-algorithms with GNU General Public License v3.0 | 5 votes |
package io.github.sentenza.hacktoberfest.algos import scala.collection.mutable.ArrayBuffer import scala.math.Ordered def quickSelect(list: List[Int], idx: Int): Option[Int] = { if (idx < 0 || list.size <= idx) return None list match { case Nil => None case pivot :: rest => { val (smaller, larger) = rest partition (_ <= pivot) val pivotIdx = smaller.size idx.compare(pivotIdx) match { case needleInSmaller if needleInSmaller < 0 => quickSelect(smaller, idx) case needleIsPivot if needleIsPivot == 0 => Some(pivot) case needleInLarger if needleInLarger > 0 => quickSelect(larger, idx - pivotIdx - 1) } } } } }
Example 58
Source File: RocksEdgeFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.rocks import com.typesafe.config.Config import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.Label import org.apache.s2graph.core.storage.{SKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.{HBaseType, VertexId} import org.rocksdb.RocksDB import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} class RocksEdgeFetcher(val graph: S2GraphLike, val config: Config, val db: RocksDB, val vdb: RocksDB, val serDe: StorageSerDe, val io: StorageIO) extends EdgeFetcher { import RocksStorage._ override def fetches(queryRequests: Seq[QueryRequest], prevStepEdges: Map[VertexId, Seq[EdgeWithScore]])(implicit ec: ExecutionContext): Future[Seq[StepResult]] = { val futures = for { queryRequest <- queryRequests } yield { val parentEdges = prevStepEdges.getOrElse(queryRequest.vertex.id, Nil) val edge = graph.elementBuilder.toRequestEdge(queryRequest, parentEdges) val rpc = buildRequest(graph, serDe, queryRequest, edge) fetchKeyValues(vdb, db, rpc).map { kvs => val queryParam = queryRequest.queryParam val stepResult = io.toEdges(kvs, queryRequest, queryRequest.prevStepScore, false, parentEdges) val edgeWithScores = stepResult.edgeWithScores.filter { case edgeWithScore => val edge = edgeWithScore.edge val duration = queryParam.durationOpt.getOrElse((Long.MinValue, Long.MaxValue)) edge.ts >= duration._1 && edge.ts < duration._2 } stepResult.copy(edgeWithScores = edgeWithScores) } } Future.sequence(futures) } override def fetchEdgesAll()(implicit ec: ExecutionContext) = { val edges = new ArrayBuffer[S2EdgeLike]() Label.findAll().groupBy(_.hbaseTableName).toSeq.foreach { case (hTableName, labels) => val distinctLabels = labels.toSet val iter = db.newIterator() try { iter.seekToFirst() while (iter.isValid) { val kv = SKeyValue(table, iter.key(), SKeyValue.EdgeCf, qualifier, iter.value, System.currentTimeMillis()) serDe.indexEdgeDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(Seq(kv), None) .filter(e => distinctLabels(e.innerLabel) && e.getDirection() == "out" && !e.isDegree) .foreach { edge => edges += edge } iter.next() } } finally { iter.close() } } Future.successful(edges) } }
Example 59
Source File: RocksVertexFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.core.storage.rocks import com.typesafe.config.Config import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.ServiceColumn import org.apache.s2graph.core.storage.rocks.RocksStorage.{qualifier, table} import org.apache.s2graph.core.storage.{SKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.HBaseType import org.rocksdb.RocksDB import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} class RocksVertexFetcher(val graph: S2GraphLike, val config: Config, val db: RocksDB, val vdb: RocksDB, val serDe: StorageSerDe, val io: StorageIO) extends VertexFetcher { private def fetchKeyValues(queryRequest: QueryRequest, vertex: S2VertexLike)(implicit ec: ExecutionContext): Future[Seq[SKeyValue]] = { val rpc = RocksStorage.buildRequest(queryRequest, vertex) RocksStorage.fetchKeyValues(vdb, db, rpc) } override def fetchVertices(vertexQueryParam: VertexQueryParam)(implicit ec: ExecutionContext): Future[Seq[S2VertexLike]] = { def fromResult(kvs: Seq[SKeyValue], version: String): Seq[S2VertexLike] = { if (kvs.isEmpty) Nil else serDe.vertexDeserializer(version).fromKeyValues(kvs, None).toSeq.filter(vertexQueryParam.where.get.filter) } val vertices = vertexQueryParam.vertexIds.map(vId => graph.elementBuilder.newVertex(vId)) val futures = vertices.map { vertex => val queryParam = QueryParam.Empty val q = Query.toQuery(Seq(vertex), Seq(queryParam)) val queryRequest = QueryRequest(q, stepIdx = -1, vertex, queryParam) fetchKeyValues(queryRequest, vertex).map { kvs => fromResult(kvs, vertex.serviceColumn.schemaVersion) } recoverWith { case ex: Throwable => Future.successful(Nil) } } Future.sequence(futures).map(_.flatten) } override def fetchVerticesAll()(implicit ec: ExecutionContext) = { import scala.collection.mutable val vertices = new ArrayBuffer[S2VertexLike]() ServiceColumn.findAll().groupBy(_.service.hTableName).toSeq.foreach { case (hTableName, columns) => val distinctColumns = columns.toSet val iter = vdb.newIterator() val buffer = mutable.ListBuffer.empty[SKeyValue] var oldVertexIdBytes = Array.empty[Byte] var minusPos = 0 try { iter.seekToFirst() while (iter.isValid) { val row = iter.key() if (!Bytes.equals(oldVertexIdBytes, 0, oldVertexIdBytes.length - minusPos, row, 0, row.length - 1)) { if (buffer.nonEmpty) serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None) .filter(v => distinctColumns(v.serviceColumn)) .foreach { vertex => vertices += vertex } oldVertexIdBytes = row minusPos = 1 buffer.clear() } val kv = SKeyValue(table, iter.key(), SKeyValue.VertexCf, qualifier, iter.value(), System.currentTimeMillis()) buffer += kv iter.next() } if (buffer.nonEmpty) serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None) .filter(v => distinctColumns(v.serviceColumn)) .foreach { vertex => vertices += vertex } } finally { iter.close() } } Future.successful(vertices) } }
Example 60
Source File: BytesUtilV1.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.core.v1 import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil} import org.apache.s2graph.counter.models.Counter.ItemType import org.apache.s2graph.counter.util.Hashes import scala.collection.mutable.ArrayBuffer object BytesUtilV1 extends BytesUtil { // ExactKey: [hash(2b)][policy(4b)][item(variable)] val BUCKET_BYTE_SIZE = Bytes.SIZEOF_SHORT val POLICY_ID_SIZE = Bytes.SIZEOF_INT val INTERVAL_SIZE = Bytes.SIZEOF_BYTE val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE override def getRowKeyPrefix(id: Int): Array[Byte] = { Bytes.toBytes(id) } override def toBytes(key: ExactKeyTrait): Array[Byte] = { val buff = new ArrayBuffer[Byte] // hash key (2 byte) buff ++= Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE) buff ++= getRowKeyPrefix(key.policyId) buff ++= { key.itemType match { case ItemType.INT => Bytes.toBytes(key.itemKey.toInt) case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong) case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey) } } buff.toArray } override def toBytes(eq: ExactQualifier): Array[Byte] = { toBytes(eq.tq) ++ eq.dimension.getBytes } override def toBytes(tq: TimedQualifier): Array[Byte] = { Bytes.toBytes(tq.q.toString) ++ Bytes.toBytes(tq.ts) } override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = { // qualifier: interval, ts, dimension 순서 val tq = toTimedQualifier(bytes) val dimension = Bytes.toString(bytes, TIMED_QUALIFIER_SIZE, bytes.length - TIMED_QUALIFIER_SIZE) ExactQualifier(tq, dimension) } override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = { val interval = Bytes.toString(bytes, 0, INTERVAL_SIZE) val ts = Bytes.toLong(bytes, INTERVAL_SIZE) TimedQualifier(IntervalUnit.withName(interval), ts) } }
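toBytes assembles the row key by appending byte slices to an ArrayBuffer[Byte] with ++= and converting to an Array once at the end, which keeps the concatenation readable. The same idea using only the JDK; intToBytes below is a small stand-in written for this sketch in place of HBase's Bytes.toBytes:

import java.nio.ByteBuffer
import scala.collection.mutable.ArrayBuffer

// Stand-in for org.apache.hadoop.hbase.util.Bytes.toBytes(Int).
def intToBytes(i: Int): Array[Byte] = ByteBuffer.allocate(4).putInt(i).array()

def buildRowKey(hash: Int, policyId: Int, itemKey: String): Array[Byte] = {
  val buff = new ArrayBuffer[Byte]
  buff ++= intToBytes(hash).take(2)  // 2-byte hash prefix, as in BytesUtilV1
  buff ++= intToBytes(policyId)      // 4-byte policy id
  buff ++= itemKey.getBytes("UTF-8") // variable-length item key
  buff.toArray
}

val rowKey = buildRowKey(hash = 0x12345678, policyId = 42, itemKey = "item-1")
// rowKey.length == 2 + 4 + "item-1".length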
Example 61
Source File: BytesUtilV2.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.core.v2 import org.apache.hadoop.hbase.util._ import org.apache.s2graph.counter import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil} import org.apache.s2graph.counter.models.Counter.ItemType import org.apache.s2graph.counter.util.Hashes import scala.collection.mutable.ArrayBuffer object BytesUtilV2 extends BytesUtil { // ExactKey: [hash(1b)][version(1b)][policy(4b)][item(variable)] val BUCKET_BYTE_SIZE = Bytes.SIZEOF_BYTE val VERSION_BYTE_SIZE = Bytes.SIZEOF_BYTE val POLICY_ID_SIZE = Bytes.SIZEOF_INT val INTERVAL_SIZE = Bytes.SIZEOF_BYTE val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE override def getRowKeyPrefix(id: Int): Array[Byte] = { Array(counter.VERSION_2) ++ Bytes.toBytes(id) } override def toBytes(key: ExactKeyTrait): Array[Byte] = { val buff = new ArrayBuffer[Byte] // hash byte buff ++= Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE) // row key prefix // version + policy id buff ++= getRowKeyPrefix(key.policyId) buff ++= { key.itemType match { case ItemType.INT => Bytes.toBytes(key.itemKey.toInt) case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong) case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey) } } buff.toArray } override def toBytes(eq: ExactQualifier): Array[Byte] = { val len = eq.dimKeyValues.map { case (k, v) => k.length + 2 + v.length + 2 }.sum val pbr = new SimplePositionedMutableByteRange(len) for { v <- ExactQualifier.makeSortedDimension(eq.dimKeyValues) } { OrderedBytes.encodeString(pbr, v, Order.ASCENDING) } toBytes(eq.tq) ++ pbr.getBytes } override def toBytes(tq: TimedQualifier): Array[Byte] = { val pbr = new SimplePositionedMutableByteRange(INTERVAL_SIZE + 2 + TIMESTAMP_SIZE + 1) OrderedBytes.encodeString(pbr, tq.q.toString, Order.ASCENDING) OrderedBytes.encodeInt64(pbr, tq.ts, Order.DESCENDING) pbr.getBytes } private def decodeString(pbr: PositionedByteRange): Stream[String] = { if (pbr.getRemaining > 0) { Stream.cons(OrderedBytes.decodeString(pbr), decodeString(pbr)) } else { Stream.empty } } override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = { val pbr = new SimplePositionedByteRange(bytes) ExactQualifier(toTimedQualifier(pbr), { val seqStr = decodeString(pbr).toSeq val (keys, values) = seqStr.splitAt(seqStr.length / 2) keys.zip(values).toMap }) } override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = { val pbr = new SimplePositionedByteRange(bytes) toTimedQualifier(pbr) } def toTimedQualifier(pbr: PositionedByteRange): TimedQualifier = { TimedQualifier(IntervalUnit.withName(OrderedBytes.decodeString(pbr)), OrderedBytes.decodeInt64(pbr)) } }
Example 62
Source File: AccountStorage.scala From matcher with MIT License | 5 votes |
package com.wavesplatform.dex.db import java.io.{File, FileInputStream, FileOutputStream} import java.nio.file.Files import java.util.Base64 import cats.syntax.either._ import com.google.common.primitives.{Bytes, Ints} import com.wavesplatform.dex.crypto.Enigma import com.wavesplatform.dex.db.AccountStorage.Settings.EncryptedFile import com.wavesplatform.dex.domain.account.KeyPair import com.wavesplatform.dex.domain.bytes.ByteStr import com.wavesplatform.dex.domain.crypto import net.ceedubs.ficus.readers.ValueReader import scala.collection.mutable.ArrayBuffer case class AccountStorage(keyPair: KeyPair) object AccountStorage { sealed trait Settings object Settings { case class InMem(seed: ByteStr) extends Settings case class EncryptedFile(path: File, password: String) extends Settings implicit val valueReader: ValueReader[Settings] = ValueReader.relative[Settings] { config => config.getString("type") match { case "in-mem" => InMem(Base64.getDecoder.decode(config.getString("in-mem.seed-in-base64"))) case "encrypted-file" => EncryptedFile( path = new File(config.getString("encrypted-file.path")), password = config.getString("encrypted-file.password") ) case x => throw new IllegalArgumentException(s"The type of account storage '$x' is unknown. Please update your settings.") } } } def load(settings: Settings): Either[String, AccountStorage] = settings match { case Settings.InMem(seed) => Right(AccountStorage(KeyPair(seed))) case Settings.EncryptedFile(file, password) => if (file.isFile) { val encryptedSeedBytes = readFile(file) val key = Enigma.prepareDefaultKey(password) val decryptedBytes = Enigma.decrypt(key, encryptedSeedBytes) AccountStorage(KeyPair(decryptedBytes)).asRight } else s"A file '${file.getAbsolutePath}' doesn't exist".asLeft } def save(seed: ByteStr, to: EncryptedFile): Unit = { Files.createDirectories(to.path.getParentFile.toPath) val key = Enigma.prepareDefaultKey(to.password) val encryptedSeedBytes = Enigma.encrypt(key, seed.arr) writeFile(to.path, encryptedSeedBytes) } def getAccountSeed(baseSeed: ByteStr, nonce: Int): ByteStr = ByteStr(crypto.secureHash(Bytes.concat(Ints.toByteArray(nonce), baseSeed))) def readFile(file: File): Array[Byte] = { val reader = new FileInputStream(file) try { val buff = new Array[Byte](1024) val r = new ArrayBuffer[Byte] while (reader.available() > 0) { val read = reader.read(buff) if (read > 0) { r.appendAll(buff.iterator.take(read)) } } r.toArray } finally { reader.close() } } def writeFile(file: File, bytes: Array[Byte]): Unit = { val writer = new FileOutputStream(file, false) try writer.write(bytes) finally writer.close() } }
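The readFile helper above grows an ArrayBuffer[Byte] chunk by chunk rather than preallocating a buffer for the whole file. A minimal standalone sketch of the same accumulation pattern, using only the JDK and the Scala standard library (ReadAllBytes and the sample input are illustrative, and the loop reads until end-of-stream instead of polling available()):

import java.io.{ByteArrayInputStream, InputStream}
import scala.collection.mutable.ArrayBuffer

object ReadAllBytes {
  // Accumulate a stream of unknown length into an ArrayBuffer[Byte].
  def readAll(in: InputStream): Array[Byte] = {
    val chunk = new Array[Byte](1024)
    val acc = new ArrayBuffer[Byte]()
    var read = in.read(chunk)
    while (read > 0) {
      acc.appendAll(chunk.iterator.take(read)) // copy only the bytes actually read
      read = in.read(chunk)
    }
    acc.toArray
  }

  def main(args: Array[String]): Unit = {
    val bytes = readAll(new ByteArrayInputStream("hello ArrayBuffer".getBytes("UTF-8")))
    println(new String(bytes, "UTF-8")) // prints: hello ArrayBuffer
  }
}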
Example 63
Source File: WordSpliter.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.nlp import cn.piflow._ import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import com.huaban.analysis.jieba.JiebaSegmenter.SegMode import com.huaban.analysis.jieba._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer class WordSpliter extends ConfigurableStop { val authorEmail: String = "[email protected]" val description: String = "Word segmentation" val inportList: List[String] = List(Port.AnyPort.toString) val outportList: List[String] = List(Port.DefaultPort.toString) var path:String = _ val jiebaSegmenter = new JiebaSegmenter() var tokenARR:ArrayBuffer[String]=ArrayBuffer() def segmenter(str:String): Unit ={ var strVar = str //delete symbol strVar = strVar.replaceAll( "[\\p{P}+~$`^=|<>~`$^+=|<>¥×+\\s]" , ""); val tokens = jiebaSegmenter.process(strVar,SegMode.SEARCH).asScala for (token: SegToken <- tokens){ tokenARR += token.word } } def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session: SparkSession = pec.get[SparkSession]() //read val strDF = session.read.text(path) //segmenter segmenter(strDF.head().getString(0)) //write df val rows: List[Row] = tokenARR.map(each => { var arr:Array[String]=Array(each) val row: Row = Row.fromSeq(arr) row }).toList val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows) val schema: StructType = StructType(Array( StructField("words",StringType) )) val df: DataFrame = session.createDataFrame(rowRDD,schema) out.write(df) } def initialize(ctx: ProcessContext): Unit = { } def setProperties(map : Map[String, Any]) = { path = MapUtil.get(map,"path").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val path = new PropertyDescriptor().name("path").displayName("path").description("The path of text file").defaultValue("").required(true) descriptor = path :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/nlp/NLP.png") } override def getGroup(): List[String] = { List(StopGroup.Alg_NLPGroup.toString) } }
Example 64
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable{

  // The tag you want to parse. If you want to open an array field, write it like this: links_name (MasterField_ChildField)
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {

    var openArrField: String = ""
    var ArrSchame: String = ""

    var tagARR: Array[String] = tag.split(",")
    var tagNew: String = ""

    for (tt <- tagARR) {
      if (tt.indexOf("_") > -1) {
        // the tag contains a "_" separator
        val openField: Array[String] = tt.split("_")
        openArrField = openField(0)
        ArrSchame += (openField(1) + ",")
      } else {
        tagNew += (tt + ",")
      }
    }
    tagNew += openArrField
    ArrSchame = ArrSchame.substring(0, ArrSchame.length - 1)

    tagARR = tagNew.split(",")
    var FinalDF: DataFrame = df

    // if the user selected fields to return
    var strings: Seq[Column] = tagNew.split(",").toSeq.map(p => new Column(p))

    if (tag.length > 0) {
      val df00 = FinalDF.select(strings: _*)
      FinalDF = df00
    }

    // if the user selected an array field to open and provided its schema
    if (openArrField.length > 0 && ArrSchame.length > 0) {

      val schames: Array[String] = ArrSchame.split(",")

      var selARR: ArrayBuffer[String] = ArrayBuffer() // collects the already-opened fields
      // iterate over the array and wrap each field into a Column object
      var coARR: ArrayBuffer[Column] = ArrayBuffer() // used by select on the opened fields
      val sss = tagNew.split(",") // used by toDF after the field is opened
      var co: Column = null
      for (each <- tagARR) {
        if (each == openArrField) {
          co = explode(FinalDF(openArrField))
          for (x <- schames) {
            selARR += (openArrField + "." + x)
          }
        } else {
          selARR += each
          co = FinalDF(each)
        }
        coARR += co
      }
      println("###################")
      selARR.foreach(println(_))
      var selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q))

      var df01: DataFrame = FinalDF.select(coARR: _*).toDF(sss: _*)
      FinalDF = df01.select(selSEQ: _*)
    }
    FinalDF
  }
}
Example 65
Source File: BufferListener.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding

import Binding.{PatchedEvent, ChangedEvent, PatchedListener, ChangedListener}
import com.thoughtworks.binding.Binding.{PatchedEvent, ChangedEvent, PatchedListener, ChangedListener}

import scala.collection.mutable.ArrayBuffer

final class BufferListener extends ArrayBuffer[Any] {
  val listener = new ChangedListener[Seq[Any]] with PatchedListener[Any] {
    override def changed(event: ChangedEvent[Seq[Any]]): Unit = {
      BufferListener.this += event
    }

    override def patched(event: PatchedEvent[Any]): Unit = {
      BufferListener.this += event
    }
  }
}
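BufferListener is itself the buffer: each change or patch event is appended to this, so a test can assert on the recorded sequence afterwards. A standalone sketch of that recorder-extends-ArrayBuffer idea, with a hypothetical EventRecorder and plain strings in place of Binding.scala's event types:

import scala.collection.mutable.ArrayBuffer

// A recorder that is itself the buffer it records into.
final class EventRecorder extends ArrayBuffer[String] {
  def onChanged(description: String): Unit = this += s"changed: $description"
  def onPatched(description: String): Unit = this += s"patched: $description"
}

object EventRecorderDemo extends App {
  val recorder = new EventRecorder
  recorder.onChanged("size 0 -> 1")
  recorder.onPatched("index 0 replaced")
  recorder.foreach(println) // prints the two recorded events in order
  assert(recorder.length == 2)
}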
Example 66
Source File: FlatMapRemove.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding.regression import com.thoughtworks.binding.Binding._ import com.thoughtworks.binding._ import org.scalatest.freespec.AnyFreeSpec import org.scalatest.matchers.should.Matchers import scala.collection.mutable.ArrayBuffer final class FlatMapRemove extends AnyFreeSpec with Matchers { "removed source of a flatMap" in { val data = Vars.empty[Either[String, String]] val left = for { s <- data if s.isLeft } yield s val events = ArrayBuffer.empty[String] val autoPrint = Binding { if (left.length.bind > 0) { events += "has left" } else { events += "does not has left" } } assert(events.forall(_ == "does not has left")) autoPrint.watch() assert(events.forall(_ == "does not has left")) data.value += Right("1") assert(events.forall(_ == "does not has left")) data.value += Right("2") assert(events.forall(_ == "does not has left")) data.value += Right("3") assert(events.forall(_ == "does not has left")) data.value(1) = Left("left 2") assert(events.last == "has left") data.value --= Seq(Left("left 2")) assert(events.last == "does not has left") } }
Example 67
Source File: InsertThenClear.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding.regression import com.thoughtworks.binding.Binding._ import com.thoughtworks.binding._ import org.scalatest.freespec.AnyFreeSpec import org.scalatest.matchers.should.Matchers import scala.collection.mutable.ArrayBuffer final class InsertThenClear extends AnyFreeSpec with Matchers { "insert then clear" in { val items = Vars(1 to 10: _*) val mapped = items.map(-_) mapped.watch() assert(mapped.get sameElements Seq(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10)) items.value.insertAll(3, 100 to 103) assert(mapped.get sameElements Seq(-1, -2, -3, -100, -101, -102, -103, -4, -5, -6, -7, -8, -9, -10)) items.value.clear() assert(mapped.get sameElements Seq.empty) } }
Example 68
Source File: ProxyMessageHandler.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.stub import java.net.InetSocketAddress import java.nio.ByteBuffer import java.nio.channels._ import com.basho.riak.client.core.RiakMessage import com.basho.riak.client.core.util.HostAndPort import shaded.com.basho.riak.protobuf.RiakKvPB import shaded.com.basho.riak.protobuf.RiakMessageCodes._ import shaded.com.google.protobuf.ByteString import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer class ProxyMessageHandler(hostAndPort: HostAndPort) extends RiakMessageHandler { private final val riakAddress = new InetSocketAddress(hostAndPort.getHost, hostAndPort.getPort) override def handle(context: ClientHandler.Context, input: RiakMessage): Iterable[RiakMessage] = input.getCode match { // coverage plan received from real Riak node must be modified to replace real node's host and port with proxy case MSG_CoverageReq => forwardAndTransform(context, input) { output => val resp = RiakKvPB.RpbCoverageResp.parseFrom(output.getData) val modified = RiakKvPB.RpbCoverageResp.newBuilder(resp) .clearEntries() .addAllEntries(resp.getEntriesList.map { ce => val ceBuilder = RiakKvPB.RpbCoverageEntry.newBuilder(ce) if (ce.getIp.toStringUtf8 == hostAndPort.getHost && ce.getPort == hostAndPort.getPort) { val localAddress = context.channel.asInstanceOf[NetworkChannel] .getLocalAddress.asInstanceOf[InetSocketAddress] ceBuilder.setIp(ByteString.copyFromUtf8(localAddress.getHostString)) ceBuilder.setPort(localAddress.getPort) } ceBuilder.build() }).build() new RiakMessage(output.getCode, modified.toByteArray) } case _ => forwardMessage(context, input) } private def forwardMessage(context: ClientHandler.Context, input: RiakMessage): Iterable[RiakMessage] = { def readRiakResponse(channel: SocketChannel, out: List[RiakMessage] = Nil): Iterable[RiakMessage] = out match { case _ if !isDoneReceived(out, input) => readRiakResponse(channel, out ++ readSocket(channel)) case _ => out } val channel = SocketChannel.open(riakAddress) try { // forward request to real Riak node assert(channel.write(RiakMessageEncoder.encode(input)) > 0) // read response for forwarded request from real Riak node readRiakResponse(channel) } finally { channel.close() } } private def readSocket(channel: SocketChannel): Iterable[RiakMessage] = { var accumulator = ByteBuffer.allocateDirect(0) var out = ArrayBuffer[RiakMessage]() while (out.isEmpty || accumulator.hasRemaining) { // try to parse riak message from bytes in accumulator buffer RiakMessageEncoder.decode(accumulator) match { case Some(x) => accumulator = accumulator.slice() out += x case None => // read next chunk of data from channel and add it into accumulator val in = ByteBuffer.allocateDirect(1024) // scalastyle:ignore channel.read(in) accumulator = ByteBuffer .allocate(accumulator.rewind().limit() + in.flip().limit()) .put(accumulator) .put(in) accumulator.rewind() in.clear() } } out } private def isDoneReceived(out: Iterable[RiakMessage], input: RiakMessage): Boolean = input.getCode match { case MSG_IndexReq => out.foldLeft[Boolean](false)((a, m) => a || RiakKvPB.RpbIndexResp.parseFrom(m.getData).getDone) case _ => out.nonEmpty } private def forwardAndTransform(context: ClientHandler.Context, input: RiakMessage )(transform: RiakMessage => RiakMessage ): Iterable[RiakMessage] = forwardMessage(context, input).map(transform(_)) override def onRespond(input: RiakMessage, output: Iterable[RiakMessage]): Unit = {} }
Example 69
Source File: QueryBucketKeys.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query

import com.basho.riak.client.core.query.Location
import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{BucketDef, ReadConf}

import scala.collection.mutable.ArrayBuffer

private case class QueryBucketKeys(bucket: BucketDef,
                                   readConf: ReadConf,
                                   riakConnector: RiakConnector,
                                   keys: Iterable[String]
                                  ) extends QuerySubsetOfKeys[String] {

  override def locationsByKeys(keys: Iterator[String]): (Boolean, Iterable[Location]) = {
    val dataBuffer = new ArrayBuffer[Location](readConf.fetchSize)
    val ns = bucket.asNamespace()

    keys.forall(k => {
      dataBuffer += new Location(ns, k)
      dataBuffer.size < readConf.fetchSize
    })
    false -> dataBuffer
  }
}
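locationsByKeys leans on a small trick: Iterator.forall both consumes keys and stops as soon as the ArrayBuffer reaches readConf.fetchSize, because forall short-circuits on the first false. A standalone sketch of that pattern (takeChunk and the sample data are illustrative):

import scala.collection.mutable.ArrayBuffer

object ChunkedIteration {
  // Drain at most `fetchSize` elements from `keys` into a buffer.
  // `forall` stops pulling from the iterator once the predicate returns false,
  // i.e. once the buffer is full; the remaining keys stay in the iterator.
  def takeChunk[A](keys: Iterator[A], fetchSize: Int): ArrayBuffer[A] = {
    val buffer = new ArrayBuffer[A](fetchSize)
    keys.forall { k =>
      buffer += k
      buffer.size < fetchSize
    }
    buffer
  }

  def main(args: Array[String]): Unit = {
    val keys = (1 to 10).iterator
    println(takeChunk(keys, 4)) // ArrayBuffer(1, 2, 3, 4)
    println(takeChunk(keys, 4)) // ArrayBuffer(5, 6, 7, 8) -- the iterator keeps its position
  }
}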
Example 70
Source File: Query2iKeys.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query import com.basho.riak.client.core.operations.CoveragePlanOperation.Response.CoverageEntry import com.basho.riak.client.core.query.Location import com.basho.riak.spark.rdd.connector.RiakConnector import com.basho.riak.spark.rdd.{BucketDef, ReadConf} import scala.collection.mutable.ArrayBuffer private case class Query2iKeys[K](bucket: BucketDef, readConf:ReadConf, riakConnector: RiakConnector, index: String, keys: Iterable[K] ) extends QuerySubsetOfKeys[K] { private var query2iKey: Option[Query2iKeySingleOrRange[K]] = None private var tokenNext: Option[Either[String, CoverageEntry]] = None // By default there should be an empty Serializable Iterator private var _iterator: Iterator[Location] = ArrayBuffer.empty[Location].iterator private def chunkIsCollected(chunk: Iterable[Location]) = chunk.size >= readConf.fetchSize // scalastyle:off cyclomatic.complexity override def locationsByKeys(keys: Iterator[K]): (Boolean, Iterable[Location]) = { val dataBuffer = new ArrayBuffer[Location](readConf.fetchSize) while ((keys.hasNext || _iterator.hasNext || tokenNext.isDefined) && !chunkIsCollected(dataBuffer)){ // Previously gathered results should be returned at first, if any _iterator forall ( location => { dataBuffer += location !chunkIsCollected(dataBuffer) }) if(!chunkIsCollected(dataBuffer)) tokenNext match { case Some(next) => // Fetch the next results page from the previously executed 2i query, if any assert(query2iKey.isDefined) val r = query2iKey.get.nextLocationChunk(tokenNext) tokenNext = r._1 _iterator = r._2.iterator case None if keys.hasNext => // query data for the first/next key assert(_iterator.isEmpty && tokenNext.isEmpty) val key = keys.next() query2iKey = Some(new Query2iKeySingleOrRange[K](bucket, readConf, riakConnector, index, key)) val r = query2iKey.get.nextLocationChunk(tokenNext) tokenNext = r._1 _iterator = r._2.iterator case _ => // There is nothing to do } } tokenNext.isDefined -> dataBuffer } // scalastyle:on cyclomatic.complexity }
Example 71
Source File: Partitioner.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import java.net.InetAddress import com.lucidworks.spark.rdd.SolrRDD import com.lucidworks.spark.util.SolrSupport import org.apache.solr.client.solrj.SolrQuery import org.apache.spark.Partition import scala.collection.mutable.ArrayBuffer // Is there a need to override {@code Partitioner.scala} and define our own partition id's object SolrPartitioner { def getShardPartitions(shards: List[SolrShard], query: SolrQuery) : Array[Partition] = { shards.zipWithIndex.map{ case (shard, i) => // Chose any of the replicas as the active shard to query SelectSolrRDDPartition(i, "*", shard, query, SolrRDD.randomReplica(shard))}.toArray } def getSplitPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { var splitPartitions = ArrayBuffer.empty[SelectSolrRDDPartition] var counter = 0 shards.foreach(shard => { val splits = SolrSupport.getShardSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += SelectSolrRDDPartition(counter, "*", shard, split.query, split.replica) counter = counter + 1 }) }) splitPartitions.toArray } // Workaround for SOLR-10490. TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery): Array[Partition] = { shards.zipWithIndex.map{ case (shard, i) => // Chose any of the replicas as the active shard to query ExportHandlerPartition(i, shard, query, SolrRDD.randomReplica(shard), 0, 0)}.toArray } // Workaround for SOLR-10490. TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { val splitPartitions = ArrayBuffer.empty[ExportHandlerPartition] var counter = 0 shards.foreach(shard => { // Form a continuous iterator list so that we can pick different replicas for different partitions in round-robin mode val splits = SolrSupport.getExportHandlerSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += ExportHandlerPartition(counter, shard, split.query, split.replica, split.numWorkers, split.workerId) counter = counter+1 }) }) splitPartitions.toArray } } case class SolrShard(shardName: String, replicas: List[SolrReplica]) case class SolrReplica( replicaNumber: Int, replicaName: String, replicaUrl: String, replicaHostName: String, locations: Array[InetAddress]) { def getHostAndPort(): String = {replicaHostName.substring(0, replicaHostName.indexOf('_'))} override def toString(): String = { return s"SolrReplica(${replicaNumber}) ${replicaName}: url=${replicaUrl}, hostName=${replicaHostName}, locations="+locations.mkString(",") } }
Example 72
Source File: GranularBigVector.scala From glint with MIT License | 5 votes |
package glint.models.client.granular

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{ExecutionContext, Future}
import scala.reflect.ClassTag

import glint.models.client.BigVector

  override def push(keys: Array[Long], values: Array[V])
                   (implicit ec: ExecutionContext): Future[Boolean] = {
    var i = 0
    val ab = new ArrayBuffer[Future[Boolean]](keys.length / maximumMessageSize)
    while (i < keys.length) {
      val end = Math.min(keys.length, i + maximumMessageSize)
      val future = underlying.push(keys.slice(i, end), values.slice(i, end))
      ab.append(future)
      i += maximumMessageSize
    }
    Future.sequence(ab.toIterator).transform(x => x.forall(y => y), err => err)
  }
}
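The push above splits a large request into slices of at most maximumMessageSize, issues one future per slice, and folds the results back into a single Future[Boolean]. A runnable sketch of the slicing pattern with the underlying BigVector call replaced by a stub (pushSlice and maxMessageSize are illustrative stand-ins):

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration._
import ExecutionContext.Implicits.global

object ChunkedPush {
  val maxMessageSize = 3

  // Stand-in for underlying.push: pretend every slice succeeds.
  def pushSlice(keys: Array[Long], values: Array[Double]): Future[Boolean] =
    Future { keys.length == values.length }

  def push(keys: Array[Long], values: Array[Double]): Future[Boolean] = {
    val parts = new ArrayBuffer[Future[Boolean]](keys.length / maxMessageSize + 1)
    var i = 0
    while (i < keys.length) {
      val end = math.min(keys.length, i + maxMessageSize)
      parts += pushSlice(keys.slice(i, end), values.slice(i, end))
      i += maxMessageSize
    }
    // One future per slice; succeed only if every slice succeeded.
    Future.sequence(parts).map(_.forall(identity))
  }

  def main(args: Array[String]): Unit = {
    val ok = Await.result(push(Array(1L, 2L, 3L, 4L, 5L), Array(0.1, 0.2, 0.3, 0.4, 0.5)), 5.seconds)
    println(s"all slices pushed: $ok") // all slices pushed: true
  }
}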
Example 73
Source File: GranularBigMatrix.scala From glint with MIT License | 5 votes |
package glint.models.client.granular import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} import scala.reflect.ClassTag import breeze.linalg.Vector import glint.models.client.BigMatrix override def pull(rows: Array[Long], cols: Array[Int])(implicit ec: ExecutionContext): Future[Array[V]] = { if (rows.length <= maximumMessageSize) { underlying.pull(rows, cols) } else { var i = 0 val ab = new ArrayBuffer[Future[Array[V]]](rows.length / maximumMessageSize) while (i < rows.length) { val end = Math.min(rows.length, i + maximumMessageSize) val future = underlying.pull(rows.slice(i, end), cols.slice(i, end)) ab.append(future) i += maximumMessageSize } Future.sequence(ab.toIterator).map { case arrayOfValues => val finalValues = new ArrayBuffer[V](rows.length) arrayOfValues.foreach(x => finalValues.appendAll(x)) finalValues.toArray } } } }
Example 74
Source File: HiveQLProcessBuilder.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager.hive.process import java.nio.file.Paths import com.webank.wedatasphere.linkis.common.conf.Configuration import com.webank.wedatasphere.linkis.enginemanager.conf.EnvConfiguration.{DEFAULT_JAVA_OPTS, JAVA_HOME, engineGCLogPath} import com.webank.wedatasphere.linkis.enginemanager.hive.conf.HiveEngineConfiguration import com.webank.wedatasphere.linkis.enginemanager.impl.UserEngineResource import com.webank.wedatasphere.linkis.enginemanager.process.JavaProcessEngineBuilder import com.webank.wedatasphere.linkis.enginemanager.{AbstractEngineCreator, EngineResource} import com.webank.wedatasphere.linkis.protocol.engine.RequestEngine import org.apache.commons.lang.StringUtils import org.slf4j.LoggerFactory import scala.collection.mutable.ArrayBuffer override protected def classpathCheck(jarOrFiles: Array[String]): Unit = { for(jarOrFile <- jarOrFiles){ checkJarOrFile(jarOrFile) } } //todo Check the jar of the classpath(对classpath的jar进行检查) private def checkJarOrFile(jarOrFile:String):Unit = { } override def build(engineRequest: EngineResource, request: RequestEngine): Unit = { this.request = request userEngineResource = engineRequest.asInstanceOf[UserEngineResource] val javaHome = JAVA_HOME.getValue(request.properties) if(StringUtils.isEmpty(javaHome)) { warn("We cannot find the java home, use java to run storage repl web server.") commandLine += "java" } else { commandLine += Paths.get(javaHome, "bin/java").toAbsolutePath.toFile.getAbsolutePath } if (request.properties.containsKey(HiveEngineConfiguration.HIVE_CLIENT_MEMORY.key)){ val settingClientMemory = request.properties.get(HiveEngineConfiguration.HIVE_CLIENT_MEMORY.key) if (!settingClientMemory.toLowerCase().endsWith("g")){ request.properties.put(HiveEngineConfiguration.HIVE_CLIENT_MEMORY.key, settingClientMemory + "g") } //request.properties.put(HiveEngineConfiguration.HIVE_CLIENT_MEMORY.key, request.properties.get(HiveEngineConfiguration.HIVE_CLIENT_MEMORY.key)+"g") } val clientMemory = HiveEngineConfiguration.HIVE_CLIENT_MEMORY.getValue(request.properties).toString if (clientMemory.toLowerCase().endsWith("g")){ commandLine += ("-Xmx" + clientMemory.toLowerCase()) commandLine += ("-Xms" + clientMemory.toLowerCase()) }else{ commandLine += ("-Xmx" + clientMemory + "g") commandLine += ("-Xms" + clientMemory + "g") } val javaOPTS = getExtractJavaOpts val alias = getAlias(request) if(StringUtils.isNotEmpty(DEFAULT_JAVA_OPTS.getValue)) DEFAULT_JAVA_OPTS.getValue.format(engineGCLogPath(port, userEngineResource.getUser, alias)).split("\\s+").foreach(commandLine += _) if(StringUtils.isNotEmpty(javaOPTS)) javaOPTS.split("\\s+").foreach(commandLine += _) //engineLogJavaOpts(port, alias).trim.split(" ").foreach(commandLine += _) if(Configuration.IS_TEST_MODE.getValue) { val port = AbstractEngineCreator.getNewPort info(s"$toString open debug mode with port $port.") commandLine += s"-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$port" } var classpath = getClasspath(request.properties, getExtractClasspath) classpath = classpath ++ request.properties.get("jars").split(",") classpathCheck(classpath) commandLine += "-Djava.library.path=/appcom/Install/hadoop/lib/native" commandLine += "-cp" commandLine += classpath.mkString(":") commandLine += "com.webank.wedatasphere.linkis.engine.DataWorkCloudEngineApplication" } // override def build(engineRequest: EngineResource, request: RequestEngine): Unit = { // import scala.collection.JavaConversions._ // request.properties foreach {case (k, v) => 
LOG.info(s"request key is $k, value is $v")} // this.request = request // super.build(engineRequest, request) // // } override protected val addApacheConfigPath: Boolean = true }
Example 75
Source File: JDBCSQLCodeParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.entrance.executer import com.webank.wedatasphere.linkis.entrance.conf.JDBCConfiguration import org.apache.commons.lang.StringUtils import scala.collection.mutable.ArrayBuffer object JDBCSQLCodeParser { val separator = ";" val defaultLimit: Int = JDBCConfiguration.ENGINE_DEFAULT_LIMIT.getValue def parse(code: String): Array[String] = { val codeBuffer = new ArrayBuffer[String]() def appendStatement(sqlStatement: String): Unit = { codeBuffer.append(sqlStatement) } if (StringUtils.contains(code, separator)) { StringUtils.split(code, ";").foreach { case s if StringUtils.isBlank(s) => case s if isSelectCmdNoLimit(s) => appendStatement(s + " limit " + defaultLimit); case s => appendStatement(s); } } else { code match { case s if StringUtils.isBlank(s) => case s if isSelectCmdNoLimit(s) => appendStatement(s + " limit " + defaultLimit); case s => appendStatement(s); } } codeBuffer.toArray } def isSelectCmdNoLimit(cmd: String): Boolean = { var code = cmd.trim if (!cmd.split("\\s+")(0).equalsIgnoreCase("select")) return false if (code.contains("limit")) code = code.substring(code.lastIndexOf("limit")).trim else if (code.contains("LIMIT")) code = code.substring(code.lastIndexOf("LIMIT")).trim.toLowerCase else return true val hasLimit = code.matches("limit\\s+\\d+\\s*;?") if (hasLimit) { if (code.indexOf(";") > 0) code = code.substring(5, code.length - 1).trim else code = code.substring(5).trim val limitNum = code.toInt if (limitNum > defaultLimit) throw new IllegalArgumentException("We at most allowed to limit " + defaultLimit + ", but your SQL has been over the max rows.") } !hasLimit } }
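JDBCSQLCodeParser splits the script on ';', drops blank statements, and appends a default LIMIT to any bare SELECT before it reaches the engine. A simplified standalone sketch of that split-and-rewrite step (the isBareSelect check here is deliberately cruder than isSelectCmdNoLimit above):

import scala.collection.mutable.ArrayBuffer

object SqlSplitter {
  val defaultLimit = 5000

  // Very rough check: a SELECT with no LIMIT anywhere in it.
  private def isBareSelect(sql: String): Boolean = {
    val lower = sql.trim.toLowerCase
    lower.startsWith("select") && !lower.contains("limit")
  }

  def parse(code: String): Array[String] = {
    val statements = new ArrayBuffer[String]()
    code.split(";").map(_.trim).foreach {
      case s if s.isEmpty       => // drop blank statements
      case s if isBareSelect(s) => statements += s"$s limit $defaultLimit"
      case s                    => statements += s
    }
    statements.toArray
  }

  def main(args: Array[String]): Unit = {
    parse("select * from t1; select * from t2 limit 10;  ").foreach(println)
    // select * from t1 limit 5000
    // select * from t2 limit 10
  }
}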
Example 76
Source File: PythonEngineExecutor.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.executors import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.PythonSession import com.webank.wedatasphere.linkis.engine.exception.EngineException import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineExecutorContext} import com.webank.wedatasphere.linkis.engine.rs.RsOutputStream import com.webank.wedatasphere.linkis.protocol.engine.JobProgressInfo import com.webank.wedatasphere.linkis.resourcemanager.{LoadInstanceResource, Resource} import com.webank.wedatasphere.linkis.rpc.Sender import com.webank.wedatasphere.linkis.scheduler.executer._ import org.apache.commons.io.IOUtils import scala.collection.mutable.ArrayBuffer class PythonEngineExecutor(outputPrintLimit: Int) extends EngineExecutor(outputPrintLimit, false) with SingleTaskOperateSupport with SingleTaskInfoSupport with Logging { override def getName: String = Sender.getThisServiceInstance.getInstance private val lineOutputStream = new RsOutputStream private[executors] var engineExecutorContext: EngineExecutorContext = _ override def getActualUsedResources: Resource = { new LoadInstanceResource(Runtime.getRuntime.totalMemory() - Runtime.getRuntime.freeMemory(), 2, 1) } private val pySession = new PythonSession override protected def executeLine(engineExecutorContext: EngineExecutorContext, code: String): ExecuteResponse = { if(engineExecutorContext != this.engineExecutorContext){ this.engineExecutorContext = engineExecutorContext pySession.setEngineExecutorContext(engineExecutorContext) //lineOutputStream.reset(engineExecutorContext) info("Python executor reset new engineExecutorContext!") } engineExecutorContext.appendStdout(s"$getName >> ${code.trim}") pySession.execute(code) //lineOutputStream.flush() SuccessExecuteResponse() } override protected def executeCompletely(engineExecutorContext: EngineExecutorContext, code: String, completedLine: String): ExecuteResponse = { val newcode = completedLine + code info("newcode is " + newcode) executeLine(engineExecutorContext, newcode) } override def kill(): Boolean = true override def pause(): Boolean = true override def resume(): Boolean = true override def progress(): Float = { if (this.engineExecutorContext != null){ this.engineExecutorContext.getCurrentParagraph / this.engineExecutorContext.getTotalParagraph.asInstanceOf[Float] }else 0.0f } override def getProgressInfo: Array[JobProgressInfo] = { val jobProgressInfos = new ArrayBuffer[JobProgressInfo]() jobProgressInfos.toArray Array.empty } override def log(): String = "" override def close(): Unit = { IOUtils.closeQuietly(lineOutputStream) var isKill:Boolean = false try { pySession.close isKill = true; } catch { case e: Throwable => throw new EngineException(60004, "Engine shutdown exception(引擎关闭异常)") } } }
Example 77
Source File: SparkPostExecutionHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import com.webank.wedatasphere.linkis.scheduler.executer.ExecuteResponse import scala.collection.mutable.ArrayBuffer trait SparkPostExecutionHook { def hookName:String def callPostExecutionHook(engineExecutorContext: EngineExecutorContext, executeResponse: ExecuteResponse, code: String): Unit } object SparkPostExecutionHook extends Logging{ private val postHooks = ArrayBuffer[SparkPostExecutionHook]() def register(postExecutionHook: SparkPostExecutionHook):Unit = { info(s"Get a postExecutionHook of ${postExecutionHook.hookName} register") postHooks.append(postExecutionHook) } def getSparkPostExecutionHooks():Array[SparkPostExecutionHook] = { postHooks.toArray } }
Example 78
Source File: SparkPreExecutionHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import scala.collection.mutable.ArrayBuffer trait SparkPreExecutionHook { def hookName:String def callPreExecutionHook(engineExecutorContext: EngineExecutorContext, code: String): String } object SparkPreExecutionHook extends Logging{ private val preHooks = ArrayBuffer[SparkPreExecutionHook]() def register(preExecutionHook: SparkPreExecutionHook):Unit = { info(s"Get a preExecutionHook of ${preExecutionHook.hookName} register") preHooks.append(preExecutionHook) } def getSparkPreExecutionHooks():Array[SparkPreExecutionHook] = { preHooks.toArray } }
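Both hook companions above use the same registry pattern: a private ArrayBuffer collects implementations at registration time, and readers only ever get an Array snapshot. A standalone sketch of that pattern with a hypothetical HookRegistry and ExecutionHook trait:

import scala.collection.mutable.ArrayBuffer

trait ExecutionHook {
  def hookName: String
  def beforeExecute(code: String): String
}

object HookRegistry {
  private val hooks = ArrayBuffer[ExecutionHook]()

  // Registration appends to the buffer; readers only ever see a snapshot copy.
  def register(hook: ExecutionHook): Unit = hooks.append(hook)

  def all: Array[ExecutionHook] = hooks.toArray
}

object HookRegistryDemo extends App {
  HookRegistry.register(new ExecutionHook {
    val hookName = "trim"
    def beforeExecute(code: String): String = code.trim
  })
  val rewritten = HookRegistry.all.foldLeft("  select 1  ")((c, h) => h.beforeExecute(c))
  println(s"'$rewritten'") // 'select 1'
}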
Example 79
Source File: SparkSqlExtension.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension import java.util.concurrent._ import com.webank.wedatasphere.linkis.common.conf.CommonVars import com.webank.wedatasphere.linkis.common.utils.{Logging, Utils} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ abstract class SparkSqlExtension extends Logging{ private val maxPoolSize = CommonVars("wds.linkis.dws.ujes.spark.extension.max.pool",5).getValue private val executor = new ThreadPoolExecutor(2, maxPoolSize, 2, TimeUnit.SECONDS, new LinkedBlockingQueue[Runnable](), new ThreadFactory { override def newThread(r: Runnable): Thread = { val thread = new Thread(r) thread.setDaemon(true) thread } }) final def afterExecutingSQL(sqlContext: SQLContext,command: String,dataFrame: DataFrame,timeout:Long,sqlStartTime:Long):Unit = { try { val thread = new Runnable { override def run(): Unit = extensionRule(sqlContext,command,dataFrame.queryExecution,sqlStartTime) } val future = executor.submit(thread) Utils.waitUntil(future.isDone,timeout milliseconds) } catch { case e: Throwable => info("Failed to execute SparkSqlExtension: ", e) } } protected def extensionRule(sqlContext: SQLContext,command: String,queryExecution: QueryExecution,sqlStartTime:Long):Unit } object SparkSqlExtension extends Logging { private val extensions = ArrayBuffer[SparkSqlExtension]() def register(sqlExtension: SparkSqlExtension):Unit = { info("Get a sqlExtension register") extensions.append(sqlExtension) } def getSparkSqlExtensions():Array[SparkSqlExtension] = { extensions.toArray } }
Example 80
Source File: CSTableParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.cs import java.util.regex.Pattern import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.cs.client.service.CSTableService import com.webank.wedatasphere.linkis.cs.common.entity.metadata.CSTable import com.webank.wedatasphere.linkis.cs.common.utils.CSCommonUtils import com.webank.wedatasphere.linkis.engine.exception.ExecuteError import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import org.apache.commons.lang.StringUtils import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.csv.DolphinToSpark import scala.collection.mutable.ArrayBuffer def getCSTable(csTempTable:String, contextIDValueStr: String, nodeNameStr: String):CSTable = { CSTableService.getInstance().getUpstreamSuitableTable(contextIDValueStr, nodeNameStr, csTempTable) } def registerTempTable(csTable: CSTable):Unit = { val spark = SparkSession.builder().enableHiveSupport().getOrCreate() info(s"Start to create tempView to sparkSession viewName(${csTable.getName}) location(${csTable.getLocation})") DolphinToSpark.createTempView(spark, csTable.getName, csTable.getLocation, true) info(s"Finished to create tempView to sparkSession viewName(${csTable.getName}) location(${csTable.getLocation})") } }
Example 81
Source File: LogContainer.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.spark.common import scala.collection.Iterable import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer class LogContainer(val logSize: Int) { private final val logs = new Array[String](logSize) private var flag, tail = 0 def putLog(log: String): Unit = { logs.synchronized { val index = (tail + 1) % logSize if(index == flag) { flag = (flag + 1) % logSize } logs(tail) = log tail = index } } def putLogs(logs: Iterable[String]) = synchronized { logs.foreach(putLog) } def reset() = synchronized { flag = 0 tail = 0 } def getLogs: List[String] = { logs.synchronized { if(flag == tail) { return List.empty[String] } val _logs = ArrayBuffer[String]() val _tail = if(flag > tail) tail + logSize else tail for (index <- flag until _tail) { val _index = index % logSize _logs += logs(_index) } flag = tail _logs.toList } } def size = { if(flag == tail) 0 else if(flag > tail) tail + logSize - flag else tail - flag } def getLogList: java.util.List[String] = getLogs }
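LogContainer is a fixed-size ring buffer: putLog advances tail, starts overwriting the oldest entry once the ring wraps, and getLogs drains everything between flag and tail into an ArrayBuffer before resetting the window. A small usage sketch, runnable together with the LogContainer class above (the expected outputs in the comments assume the default toString of Scala collections):

import com.webank.wedatasphere.linkis.engine.spark.common.LogContainer

object LogContainerDemo extends App {
  val container = new LogContainer(4) // a ring of 4 slots keeps the last 3 lines
  container.putLogs(Seq("line 1", "line 2", "line 3", "line 4", "line 5"))
  println(container.size)    // 3 -- the oldest lines were overwritten
  println(container.getLogs) // List(line 3, line 4, line 5)
  println(container.getLogs) // List() -- getLogs drains the window
}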
Example 82
Source File: SparkConfiguration.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager.configuration import com.webank.wedatasphere.linkis.common.conf.{CommonVars, Configuration} import com.webank.wedatasphere.linkis.common.utils.{ClassUtils, Logging} import com.webank.wedatasphere.linkis.engine.factory.SparkEngineExecutorFactory import com.webank.wedatasphere.linkis.enginemanager.AbstractEngineCreator import scala.collection.mutable.ArrayBuffer object SparkConfiguration extends Logging { val SPARK_MAX_PARALLELISM_USERS = CommonVars[Int]("wds.linkis.engine.spark.user.parallelism", 100) val SPARK_USER_MAX_WAITING_SIZE = CommonVars[Int]("wds.linkis.engine.spark.user.waiting.max", 100) val SPARK_SESSION_HOOK = CommonVars[String]("wds.linkis.engine.spark.session.hook", "") val SPARK_LANGUAGE_REPL_INIT_TIME = CommonVars[String]("wds.linkis.engine.spark.language-repl.init.time", new String("30s")) val SPARK_ALLOW_REQUEST_ALL_YARN_MEMORY = CommonVars[String]("wds.linkis.engine.spark.allow.all-memory.when.queue", new String("60g")) val SPARK_ALLOW_REQUEST_ALL_YARN_CORES = CommonVars[Int]("wds.linkis.engine.spark.allow.all-cores.when.queue", 30) val SPARK_USER_MAX_ALLOCATE_SESSIONS = CommonVars[Int]("wds.linkis.engine.spark.user.sessions.max", 5) val SPARK_USER_MAX_ALLOCATE_YARN_MEMORY = CommonVars[String]("wds.linkis.engine.spark.user.yarn.memory.max", new String("100g")) val SPARK_USER_MAX_ALLOCATE_YARN_CORES = CommonVars[Int]("wds.linkis.engine.spark.user.cores.max", 50) val SPARK_USER_MAX_ALLOCATE_DRIVER_MEMORY = CommonVars[String]("wds.linkis.engine.spark.user.driver.memory.max", new String("15g")) val SPARK_USER_MAX_ALLOCATE_DRIVER_CORES = SPARK_USER_MAX_ALLOCATE_SESSIONS val SPARK_USER_MAX_RESOURCE_IN_QUEUE = CommonVars[Float]("wds.linkis.engine.spark.user.queue.resources.max", 0.6f) val SPARK_DANGER_QUEUE_USED_CAPACITY = CommonVars[Float]("wds.linkis.engine.spark.danger.queue.used", 0.2f) val SPARK_DANGER_QUEUE_USER_ALLOCATE_SESSION = CommonVars[Int]("wds.linkis.engine.spark.danger.user.sessions.max", 2) val SPARK_WARN_QUEUE_USED_CAPACITY = CommonVars[Float]("wds.linkis.engine.spark.warning.queue.used", 0.5f) val SPARK_WARN_QUEUE_USER_ALLOCATE_SESSION = CommonVars[Int]("wds.linkis.engine.spark.warning.user.sessions.max", 3) val PROXY_USER = CommonVars[String]("spark.proxy.user", "${UM}") val SPARK_CLIENT_MODE = "client" val SPARK_CLUSTER_MODE = "cluster" val SPARK_DEPLOY_MODE = CommonVars[String]("spark.submit.deployMode", SPARK_CLIENT_MODE) val SPARK_APPLICATION_JARS = CommonVars[String]("spark.application.jars", "", "User-defined jars, separated by English, must be uploaded to HDFS first, and must be full path to HDFS.(用户自定义jar包,多个以英文,隔开,必须先上传到HDFS,且需为HDFS全路径。)") val SPARK_EXTRA_JARS = CommonVars[String]("spark.jars", "", "Additional jar package, Driver and Executor take effect(额外的jar包,Driver和Executor生效)") val MAPRED_OUTPUT_COMPRESS = CommonVars[String]("mapred.output.compress", "true", "Whether the map output is compressed(map输出结果是否压缩)") val MAPRED_OUTPUT_COMPRESSION_CODEC = CommonVars[String]("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec", "Map output compression method(map输出结果压缩方式)") val SPARK_MASTER = CommonVars[String]("spark.master", "yarn", "Default master(默认master)") val SPARK_OUTPUTDIR = CommonVars[String]("spark.outputDir", "/home/georgeqiao", "Default output path(默认输出路径)") val DWC_SPARK_USEHIVECONTEXT = CommonVars[Boolean]("wds.linkis.spark.useHiveContext", true) val ENGINE_JAR = CommonVars[String]("wds.linkis.enginemanager.core.jar", 
ClassUtils.jarOfClass(classOf[SparkEngineExecutorFactory]).head) val SPARK_DRIVER_CLASSPATH = CommonVars[String]("wds.linkis.spark.driver.conf.mainjar", "") val SPARK_DRIVER_EXTRA_JAVA_OPTIONS = CommonVars[String]("spark.driver.extraJavaOptions", "\"-Dwds.linkis.configuration=linkis-engine.properties " + getJavaRemotePort + "\"") val DEFAULT_JAVA_OPTS = CommonVars[String]("wds.linkis.engine.javaOpts.default", "-server -XX:+UseG1GC -XX:MaxPermSize=250m -XX:PermSize=128m " + "-Xloggc:%s -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Dwds.linkis.configuration=linkis-engine.properties") val SPARK_ML_BUCKET_FIELDS = CommonVars[String]("wds.linkis.engine.spark.ml.bucketFields", "age[0,18,30,60,100]") val SPARK_SUBMIT_CMD = CommonVars[String]("wds.linkis.engine.spark.submit.cmd", "spark-submit") private var Ports: ArrayBuffer[Int] = _ def getJavaRemotePort = { if (Configuration.IS_TEST_MODE.getValue) { val r = new scala.util.Random() val port = 1024 + r.nextInt((65536 - 1024) + 1) info(s"open debug mode with port $port.") s"-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$port" } else { "" } } private def getAvailablePort: Int = synchronized { var port = AbstractEngineCreator.getNewPort info("Get new port " + port) if (Ports == null) { info("Get inInitPorts is null ") Ports = ArrayBuffer(0, 1) info("Current ports is " + Ports.toList.toString()) } while (Ports.contains(port)) { if (AbstractEngineCreator != null) { port = AbstractEngineCreator.getNewPort } } Ports += port port } }
Example 83
Source File: CSResourceParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.cs import java.util import java.util.regex.Pattern import com.webank.wedatasphere.linkis.cs.client.service.CSResourceService import com.webank.wedatasphere.linkis.engine.PropertiesExecuteRequest import org.apache.commons.lang.StringUtils import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer class CSResourceParser { private val pb = Pattern.compile("cs://[^\\s\"]+[$\\s]{0,1}", Pattern.CASE_INSENSITIVE) private val PREFIX = "cs://" private def getPreFixResourceNames(code: String): Array[String] = { val bmlResourceNames = new ArrayBuffer[String]() val mb = pb.matcher(code) while (mb.find) bmlResourceNames.append(mb.group.trim) bmlResourceNames.toArray } def parse(executeRequest: PropertiesExecuteRequest, code: String, contextIDValueStr: String, nodeNameStr: String): String = { //TODO getBMLResource peaceWong val bmlResourceList = CSResourceService.getInstance().getUpstreamBMLResource(contextIDValueStr, nodeNameStr) val parsedResources = new util.ArrayList[util.Map[String, Object]]() val preFixResourceNames = getPreFixResourceNames(code) val preFixNames = new ArrayBuffer[String]() val parsedNames = new ArrayBuffer[String]() preFixResourceNames.foreach { preFixResourceName => val resourceName = preFixResourceName.replace(PREFIX, "").trim val bmlResourceOption = bmlResourceList.find(_.getDownloadedFileName.equals(resourceName)) if (bmlResourceOption.isDefined) { val bmlResource = bmlResourceOption.get val map = new util.HashMap[String, Object]() map.put("resourceId", bmlResource.getResourceId) map.put("version", bmlResource.getVersion) map.put("fileName", resourceName) parsedResources.add(map) preFixNames.append(preFixResourceName) parsedNames.append(resourceName) } } executeRequest.properties.put("resources", parsedResources) StringUtils.replaceEach(code, preFixNames.toArray, parsedNames.toArray) } }
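getPreFixResourceNames walks the code with a java.util.regex.Matcher and appends every cs:// reference into an ArrayBuffer before the rewrite step. A standalone sketch of that scan (the regex here is a slightly simplified version of the original pattern, and the sample script is illustrative):

import java.util.regex.Pattern
import scala.collection.mutable.ArrayBuffer

object ResourceRefScanner {
  private val refPattern = Pattern.compile("cs://[^\\s\"]+", Pattern.CASE_INSENSITIVE)

  // Collect every cs:// reference appearing in a script, in order of appearance.
  def scan(code: String): Array[String] = {
    val refs = new ArrayBuffer[String]()
    val matcher = refPattern.matcher(code)
    while (matcher.find()) refs.append(matcher.group.trim)
    refs.toArray
  }

  def main(args: Array[String]): Unit = {
    val script = """df = spark.read.csv("cs://input.csv"); save(df, "cs://output.parquet")"""
    scan(script).foreach(println)
    // cs://input.csv
    // cs://output.parquet
  }
}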
Example 84
Source File: RsOutputStream.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.rs import java.io.OutputStream import com.webank.wedatasphere.linkis.common.io.resultset.ResultSetWriter import com.webank.wedatasphere.linkis.common.io.{MetaData, Record} import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import com.webank.wedatasphere.linkis.storage.LineRecord import scala.collection.mutable.ArrayBuffer class RsOutputStream extends OutputStream with Logging{ private val line = ArrayBuffer[Byte]() private var isReady = false private var writer: ResultSetWriter[_ <: MetaData, _ <: Record] = _ override def write(b: Int) = if(isReady) synchronized { if(writer != null) { if (b == '\n') { val outStr = new String(line.toArray,"UTF-8") writer.addRecord(new LineRecord(outStr)) //info("output line:" + outStr) line.clear() } else line += b.toByte }else{ warn("writer is null") } } def reset(engineExecutorContext: EngineExecutorContext) = { writer = engineExecutorContext.createDefaultResultSetWriter() writer.addMetaData(null) } def ready() = isReady = true override def flush(): Unit = if(writer != null && line.nonEmpty) { val outStr = new String(line.toArray,"UTF-8") writer.addRecord(new LineRecord(outStr)) //info("flush line:" + outStr) line.clear() } override def toString = if(writer != null) writer.toString() else null override def close() = if(writer != null) { flush() writer.close() writer = null } }
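RsOutputStream buffers single bytes in an ArrayBuffer[Byte] and turns each completed line into a result-set record. A standalone sketch of the same line-buffering OutputStream, with the result-set writer replaced by a plain callback (LineBufferingOutputStream and onLine are illustrative):

import java.io.OutputStream
import scala.collection.mutable.ArrayBuffer

// Buffers bytes until a newline, then hands the completed line to `onLine`.
class LineBufferingOutputStream(onLine: String => Unit) extends OutputStream {
  private val line = ArrayBuffer[Byte]()

  override def write(b: Int): Unit = {
    if (b == '\n') flush()
    else line += b.toByte
  }

  override def flush(): Unit = if (line.nonEmpty) {
    onLine(new String(line.toArray, "UTF-8"))
    line.clear()
  }

  override def close(): Unit = flush()
}

object LineBufferingDemo extends App {
  val out = new LineBufferingOutputStream(l => println(s"record: $l"))
  out.write("first line\nsecond line\n".getBytes("UTF-8"))
  out.close()
  // record: first line
  // record: second line
}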
Example 85
Source File: CodeGeneratorEngineHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute.hook import java.io.File import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineHook} import com.webank.wedatasphere.linkis.scheduler.executer.{ExecuteRequest, RunTypeExecuteRequest} import com.webank.wedatasphere.linkis.server.JMap import org.apache.commons.io.FileUtils import org.apache.commons.lang.StringUtils import scala.collection.mutable.ArrayBuffer @Deprecated //changed to UdfLoadEngineHook abstract class CodeGeneratorEngineHook extends EngineHook with Logging{ self => val udfPathProp = "udf.paths" protected var creator: String = _ protected var user: String = _ protected var initSpecialCode: String = _ protected val runType: String protected def acceptCodeType(line: String): Boolean protected def generateCode(): Array[String] = { val codeBuffer = new ArrayBuffer[String] val statementBuffer = new ArrayBuffer[String] var accept = true initSpecialCode.split("\n").foreach{ case "" => case l if l.startsWith("%") => if(acceptCodeType(l)){ accept = true codeBuffer.append(statementBuffer.mkString("\n")) statementBuffer.clear() }else{ accept = false } case l if accept => statementBuffer.append(l) case _ => } if(statementBuffer.nonEmpty) codeBuffer.append(statementBuffer.mkString("\n")) codeBuffer.toArray } override def beforeCreateEngine(params: JMap[String, String]): JMap[String, String] = { creator = params.get("creator") user = params.get("user") initSpecialCode = StringUtils.split(params.get(udfPathProp), ",").map(readFile).mkString("\n") params } override def afterCreatedEngine(executor: EngineExecutor): Unit = { generateCode().foreach { case "" => case c: String => info("Submit udf registration to engine, code: " + c) executor.execute(new ExecuteRequest with RunTypeExecuteRequest{ override val code: String = c override val runType: String = self.runType }) info("executed code: " + c) } } protected def readFile(path: String): String = { info("read file: " + path) val file = new File(path) if(file.exists()){ FileUtils.readFileToString(file) } else { info("udf file: [" + path + "] doesn't exist, ignore it.") "" } } } @Deprecated class SqlCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "sql" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%sql") } } @Deprecated class PythonCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "python" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%python") } } @Deprecated class ScalaCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "scala" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%scala") } }
Example 86
Source File: AbstractEngineCreator.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager import java.net.ServerSocket import com.webank.wedatasphere.linkis.common.conf.DWCArgumentsParser import com.webank.wedatasphere.linkis.common.utils.Utils import com.webank.wedatasphere.linkis.enginemanager.conf.EngineManagerConfiguration import com.webank.wedatasphere.linkis.enginemanager.exception.EngineManagerErrorException import com.webank.wedatasphere.linkis.enginemanager.impl.UserTimeoutEngineResource import com.webank.wedatasphere.linkis.enginemanager.process.{CommonProcessEngine, ProcessEngine, ProcessEngineBuilder} import com.webank.wedatasphere.linkis.protocol.engine.{EngineCallback, RequestEngine} import com.webank.wedatasphere.linkis.rpc.Sender import com.webank.wedatasphere.linkis.server.{JMap, toScalaMap} import org.apache.commons.io.IOUtils import scala.collection.mutable.ArrayBuffer abstract class AbstractEngineCreator extends EngineCreator { private val inInitPorts = ArrayBuffer[Int]() private def getAvailablePort: Int = synchronized { var port = AbstractEngineCreator.getNewPort while(inInitPorts.contains(port)) port = AbstractEngineCreator.getNewPort inInitPorts += port port } def removePort(port: Int): Unit = inInitPorts -= port protected def createProcessEngineBuilder(): ProcessEngineBuilder protected def getExtractSpringConfigs(requestEngine: RequestEngine): JMap[String, String] = { val springConf = new JMap[String, String] requestEngine.properties.keysIterator.filter(_.startsWith("spring.")).foreach(key => springConf.put(key.substring(7), requestEngine.properties.get(key))) springConf } protected def createEngine(processEngineBuilder:ProcessEngineBuilder,parser:DWCArgumentsParser):ProcessEngine={ processEngineBuilder.getEngineResource match { case timeout: UserTimeoutEngineResource => new CommonProcessEngine(processEngineBuilder, parser, timeout.getTimeout) case _ => new CommonProcessEngine(processEngineBuilder, parser) } } override def create(ticketId: String, engineRequest: EngineResource, request: RequestEngine): Engine = { val port = getAvailablePort val processEngineBuilder = createProcessEngineBuilder() processEngineBuilder.setPort(port) processEngineBuilder.build(engineRequest, request) val parser = new DWCArgumentsParser var springConf = Map("spring.application.name" -> EngineManagerConfiguration.ENGINE_SPRING_APPLICATION_NAME.getValue, "server.port" -> port.toString, "spring.profiles.active" -> "engine", "logging.config" -> "classpath:log4j2-engine.xml", "eureka.client.serviceUrl.defaultZone" -> EngineManagerReceiver.getSpringConf("eureka.client.serviceUrl.defaultZone")) springConf = springConf ++: getExtractSpringConfigs(request).toMap parser.setSpringConf(springConf) var dwcConf = Map("ticketId" -> ticketId, "creator" -> request.creator, "user" -> request.user) ++: EngineCallback.callbackToMap(EngineCallback(Sender.getThisServiceInstance.getApplicationName, Sender.getThisServiceInstance.getInstance)) if(request.properties.exists{case (k, v) => k.contains(" ") || (v != null && v.contains(" "))}) throw new EngineManagerErrorException(30000, "Startup parameters contain spaces!(启动参数中包含空格!)") dwcConf = dwcConf ++: request.properties.toMap parser.setDWCConf(dwcConf) val engine = createEngine(processEngineBuilder,parser) engine.setTicketId(ticketId) engine.setPort(port) engine match { case commonEngine: CommonProcessEngine => commonEngine.setUser(request.user) case _ => } engine } } object AbstractEngineCreator { private[enginemanager] def getNewPort: Int = { val socket = new ServerSocket(0) 
Utils.tryFinally(socket.getLocalPort)(IOUtils.closeQuietly(socket)) } }
Example 87
Source File: ScalaDDLCreator.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.metadata.ddl

import com.webank.wedatasphere.linkis.common.utils.Logging
import com.webank.wedatasphere.linkis.metadata.conf.MdqConfiguration
import com.webank.wedatasphere.linkis.metadata.domain.mdq.bo.{MdqTableBO, MdqTableFieldsInfoBO}
import com.webank.wedatasphere.linkis.metadata.exception.MdqIllegalParamException
import org.apache.commons.lang.StringUtils

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

object ScalaDDLCreator extends DDLCreator with SQLConst with Logging{

  override def createDDL(tableInfo: MdqTableBO, user: String): String = {
    logger.info(s"begin to generate ddl for user $user using ScalaDDLCreator")
    val dbName = tableInfo.getTableBaseInfo.getBase.getDatabase
    val tableName = tableInfo.getTableBaseInfo.getBase.getName
    val fields = tableInfo.getTableFieldsInfo
    val createTableCode = new StringBuilder
    createTableCode.append(SPARK_SQL).append(LEFT_PARENTHESES).append(MARKS).append(CREATE_TABLE)
    createTableCode.append(dbName).append(".").append(tableName)
    createTableCode.append(LEFT_PARENTHESES)
    val partitions = new ArrayBuffer[MdqTableFieldsInfoBO]()
    val fieldsArray = new ArrayBuffer[String]()
    fields foreach { field =>
      if (field.getPartitionField != null && field.getPartitionField == true) partitions += field
      else {
        val name = field.getName
        val _type = field.getType
        val desc = field.getComment
        if (StringUtils.isNotEmpty(desc)) {
          fieldsArray += (name + SPACE + _type + SPACE + COMMENT + SPACE + SINGLE_MARK + desc + SINGLE_MARK)
        } else {
          fieldsArray += (name + SPACE + _type)
        }
      }
    }
    createTableCode.append(fieldsArray.mkString(COMMA)).append(RIGHT_PARENTHESES).append(SPACE)
    if (partitions.nonEmpty) {
      val partitionArr = new ArrayBuffer[String]()
      partitions foreach { p =>
        val name = p.getName
        val _type = p.getType
        if (StringUtils.isEmpty(name) || StringUtils.isEmpty(_type)) throw MdqIllegalParamException("partition name or type is null")
        partitionArr += (name + SPACE + _type)
      }
      createTableCode.append(PARTITIONED_BY).append(LEFT_PARENTHESES).append(partitionArr.mkString(COMMA)).
        append(RIGHT_PARENTHESES).append(SPACE)
    }
    // if this is a partitioned table but no partition field was given, partition by ds by default
    if (partitions.isEmpty && tableInfo.getTableBaseInfo.getBase.getPartitionTable) {
      val partition = MdqConfiguration.DEFAULT_PARTITION_NAME.getValue
      val _type = "string"
      createTableCode.append(PARTITIONED_BY).append(LEFT_PARENTHESES).append(partition).append(SPACE).append(_type).
        append(RIGHT_PARENTHESES).append(SPACE)
    }
    createTableCode.append(STORED_AS).append(SPACE).append(MdqConfiguration.DEFAULT_STORED_TYPE.getValue).append(SPACE)
    createTableCode.append(MARKS)
    createTableCode.append(RIGHT_PARENTHESES)
    val finalCode = createTableCode.toString()
    logger.info(s"End to create ddl code, code is $finalCode")
    finalCode
  }

  def main(args: Array[String]): Unit = {
    val filePath = "E:\\data\\json\\data.json"
    val json = scala.io.Source.fromFile(filePath).mkString
    println(json)
    // val obj = new Gson().fromJson(json, classOf[MdqTableVO])
    //val sql = createDDL(obj, "hadoop")
    //println(System.currentTimeMillis())
    //println(sql)
  }
}
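ScalaDDLCreator collects column definitions and partition columns into separate ArrayBuffers and joins them with mkString when assembling the statement. A much-reduced standalone sketch of that assembly step (the field triples and the STORED AS clause are illustrative, not the Linkis API):

import scala.collection.mutable.ArrayBuffer

object SimpleDDL {
  // (name, type, isPartition) triples in, a CREATE TABLE statement out.
  def createTable(db: String, table: String, fields: Seq[(String, String, Boolean)]): String = {
    val columns = new ArrayBuffer[String]()
    val partitions = new ArrayBuffer[String]()
    fields.foreach { case (name, dataType, isPartition) =>
      if (isPartition) partitions += s"$name $dataType"
      else columns += s"$name $dataType"
    }
    val ddl = new StringBuilder
    ddl.append(s"CREATE TABLE $db.$table (").append(columns.mkString(", ")).append(")")
    if (partitions.nonEmpty) ddl.append(" PARTITIONED BY (").append(partitions.mkString(", ")).append(")")
    ddl.append(" STORED AS ORC")
    ddl.toString
  }

  def main(args: Array[String]): Unit = {
    println(createTable("dwh", "events", Seq(("id", "bigint", false), ("name", "string", false), ("ds", "string", true))))
    // CREATE TABLE dwh.events (id bigint, name string) PARTITIONED BY (ds string) STORED AS ORC
  }
}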
Example 88
Source File: RMEventConsumer.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.resourcemanager.schedule import java.util.concurrent.{ExecutorService, Future} import com.webank.wedatasphere.linkis.common.utils.Utils import com.webank.wedatasphere.linkis.resourcemanager.event.RMEvent import com.webank.wedatasphere.linkis.resourcemanager.event.metric.{MetricRMEvent, MetricRMEventExecutor} import com.webank.wedatasphere.linkis.resourcemanager.event.notify.{NotifyRMEvent, NotifyRMEventExecutor} import com.webank.wedatasphere.linkis.scheduler.SchedulerContext import com.webank.wedatasphere.linkis.scheduler.queue._ import scala.collection.mutable.ArrayBuffer class RMEventConsumer(schedulerContext: SchedulerContext, executeService: ExecutorService) extends Consumer(schedulerContext, executeService) { private var queue: ConsumeQueue = _ private var group: Group = _ private var maxRunningJobsNum = 1000 //Not put(暂未放) private val runningJobs = new Array[SchedulerEvent](maxRunningJobsNum) private val executorManager = schedulerContext.getOrCreateExecutorManager private var rmConsumerListener : RMConsumerListener = _ var future: Future[_] = _ def this(schedulerContext: SchedulerContext, executeService: ExecutorService, group: Group) = { this(schedulerContext, executeService) this.group = group maxRunningJobsNum = group.getMaximumCapacity } def start():Unit = future = executeService.submit(this) def setRmConsumerListener(rmConsumerListener: RMConsumerListener): Unit ={ this.rmConsumerListener = rmConsumerListener } override def setConsumeQueue(consumeQueue: ConsumeQueue) = { queue = consumeQueue } override def getConsumeQueue = queue override def getGroup = group override def setGroup(group: Group) = { this.group = group } override def getRunningEvents = getEvents(_.isRunning) private def getEvents(op: SchedulerEvent => Boolean): Array[SchedulerEvent] = { val result = ArrayBuffer[SchedulerEvent]() runningJobs.filter(_ != null).filter(x => op(x)).foreach(result += _) result.toArray } override def run() = { Thread.currentThread().setName(s"${toString}Thread") info(s"$toString thread started!") while (!terminate) { Utils.tryAndError(loop()) Utils.tryQuietly(Thread.sleep(10)) } info(s"$toString thread stopped!") } def loop(): Unit = { var event = queue.take() while (event.turnToScheduled() != true) { event = queue.take() } if(rmConsumerListener != null){rmConsumerListener.beforeEventExecute(this,event.asInstanceOf[RMEvent])} Utils.tryAndError({ val executor = executorManager.askExecutor(event) if (executor.isDefined) { event match { case x: MetricRMEvent =>{ Utils.tryQuietly(executor.get.asInstanceOf[MetricRMEventExecutor].execute(new EventJob(x))) } case y: NotifyRMEvent =>{ Utils.tryQuietly(executor.get.asInstanceOf[NotifyRMEventExecutor].execute(new EventJob(y))) } } } }) if(rmConsumerListener != null){rmConsumerListener.afterEventExecute(this,event.asInstanceOf[RMEvent])} } override def shutdown() = { future.cancel(true) super.shutdown() } }
Example 89
Source File: StorageScriptFsReader.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.storage.script.reader import java.io._ import com.webank.wedatasphere.linkis.common.io.{FsPath, MetaData, Record} import com.webank.wedatasphere.linkis.storage.script._ import com.webank.wedatasphere.linkis.storage.utils.StorageUtils import org.apache.commons.io.IOUtils import scala.collection.mutable.ArrayBuffer def isMetadata(line: String, prefix: String, prefixConf: String): Boolean = { val regex = ("\\s*" + prefix + "\\s*(.+)\\s*" + "=" + "\\s*(.+)\\s*").r line match { case regex(_, _) => true case _ => { val split: Array[String] = line.split("=") if (split.size != 2) return false if (split(0).split(" ").filter(_ != "").size != 4) return false if (!split(0).split(" ").filter(_ != "")(0).equals(prefixConf)) return false true } } } }
Example 90
Source File: ResultSetWriter.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.storage.resultset import com.webank.wedatasphere.linkis.common.io.resultset.{ResultSet, ResultSetWriter} import com.webank.wedatasphere.linkis.common.io.{FsPath, MetaData, Record} import scala.collection.mutable.ArrayBuffer object ResultSetWriter { def getResultSetWriter[K <: MetaData, V <: Record](resultSet: ResultSet[K,V], maxCacheSize: Long, storePath: FsPath):ResultSetWriter[K, V] = new StorageResultSetWriter[K, V](resultSet, maxCacheSize, storePath) def getResultSetWriter[K <: MetaData, V <: Record](resultSet: ResultSet[K,V], maxCacheSize: Long, storePath: FsPath, proxyUser:String):ResultSetWriter[K, V] ={ val writer = new StorageResultSetWriter[K, V](resultSet, maxCacheSize, storePath) writer.setProxyUser(proxyUser) writer } def getRecordByWriter(writer: ResultSetWriter[_ <:MetaData,_ <:Record],limit:Long): Array[Record] ={ val res = writer.toString getRecordByRes(res,limit) } def getRecordByRes(res: String,limit:Long): Array[Record] ={ val reader = ResultSetReader.getResultSetReader(res) var count = 0 val records = new ArrayBuffer[Record]() reader.getMetaData while (reader.hasNext && count < limit){ records += reader.getRecord count = count + 1 } records.toArray } def getLastRecordByRes(res: String):Record = { val reader = ResultSetReader.getResultSetReader(res) reader.getMetaData while (reader.hasNext ){ reader.getRecord } reader.getRecord } }
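The getRecordByRes method above uses ArrayBuffer as a bounded accumulator: keep appending records while the reader has more and a counter stays below the limit. A minimal sketch of that loop follows, with a plain Iterator standing in for ResultSetReader; the takeUpTo name is an assumption for illustration.

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

object LimitedReadSketch {
  // Drain at most `limit` elements from an iterator into an ArrayBuffer,
  // mirroring the counting loop in getRecordByRes.
  def takeUpTo[A: ClassTag](it: Iterator[A], limit: Long): Array[A] = {
    val records = ArrayBuffer[A]()
    var count = 0L
    while (it.hasNext && count < limit) {
      records += it.next()
      count += 1
    }
    records.toArray
  }

  def main(args: Array[String]): Unit =
    println(takeUpTo(Iterator("r1", "r2", "r3", "r4"), 2).mkString(", "))  // r1, r2
}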
Example 91
Source File: StorageResultSetReader.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.storage.resultset import java.io.{ByteArrayInputStream, IOException, InputStream} import com.webank.wedatasphere.linkis.common.io.resultset.{ResultSet, ResultSetReader} import com.webank.wedatasphere.linkis.common.io.{MetaData, Record} import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.storage.domain.Dolphin import com.webank.wedatasphere.linkis.storage.exception.StorageWarnException import com.webank.wedatasphere.linkis.storage.utils.StorageUtils import scala.collection.mutable.ArrayBuffer def readLine(): Array[Byte] = { var rowLen = 0 try rowLen = Dolphin.readInt(inputStream) catch { case t:StorageWarnException => info(s"Read finished(读取完毕)") ; return null case t: Throwable => throw t } val rowBuffer = ArrayBuffer[Byte]() var len = 0 //Read the entire line, except for the data of the line length(读取整行,除了行长的数据) while (rowLen > 0 && len >= 0) { if (rowLen > READ_CACHE) len = StorageUtils.readBytes(inputStream,bytes, READ_CACHE) else len = StorageUtils.readBytes(inputStream,bytes, rowLen) if (len > 0) { rowLen -= len rowBuffer ++= bytes.slice(0, len) } } rowCount = rowCount + 1 rowBuffer.toArray } @scala.throws[IOException] override def getRecord: Record = { if (metaData == null) throw new IOException("Must read metadata first(必须先读取metadata)") if (row == null) throw new IOException("Can't get the value of the field, maybe the IO stream has been read or has been closed!(拿不到字段的值,也许IO流已读取完毕或已被关闭!)") row } @scala.throws[IOException] override def getMetaData: MetaData = { if(metaData == null) init() metaData = deserializer.createMetaData(readLine()) metaData } @scala.throws[IOException] override def skip(recordNum: Int): Int = { if(recordNum < 0 ) return -1 if(metaData == null) getMetaData for(i <- recordNum until (0, -1)){ try inputStream.skip(Dolphin.readInt(inputStream)) catch { case t: Throwable => return -1} } recordNum } @scala.throws[IOException] override def getPosition: Long = rowCount @scala.throws[IOException] override def hasNext: Boolean = { if(metaData == null) getMetaData val line = readLine() if(line == null) return false row = deserializer.createRecord(line) if(row == null) return false true } @scala.throws[IOException] override def available: Long = inputStream.available() override def close(): Unit = inputStream.close() }
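readLine above accumulates a row by appending byte-array slices to an ArrayBuffer[Byte] until the declared row length has been consumed. The sketch below reproduces just that accumulation loop over a plain java.io.InputStream; the Dolphin length prefix and the StorageUtils helpers are omitted, and the readFully name and cacheSize default are assumptions made for the example.

import java.io.{ByteArrayInputStream, InputStream}
import scala.collection.mutable.ArrayBuffer

object ChunkedReadSketch {
  // Read exactly `rowLen` bytes from a stream in cache-sized chunks, appending
  // each chunk slice to an ArrayBuffer, as readLine does.
  def readFully(in: InputStream, rowLen: Int, cacheSize: Int = 4): Array[Byte] = {
    val bytes = new Array[Byte](cacheSize)
    val rowBuffer = ArrayBuffer[Byte]()
    var remaining = rowLen
    var len = 0
    while (remaining > 0 && len >= 0) {
      len = in.read(bytes, 0, math.min(cacheSize, remaining))
      if (len > 0) {
        remaining -= len
        rowBuffer ++= bytes.slice(0, len)
      }
    }
    rowBuffer.toArray
  }

  def main(args: Array[String]): Unit = {
    val data = "hello world".getBytes("UTF-8")
    println(new String(readFully(new ByteArrayInputStream(data), 5), "UTF-8"))  // hello
  }
}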
Example 92
Source File: TableResultDeserializer.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.storage.resultset.table import com.webank.wedatasphere.linkis.common.io.resultset.ResultDeserializer import com.webank.wedatasphere.linkis.storage.domain.{Column, DataType, Dolphin} import com.webank.wedatasphere.linkis.storage.exception.StorageErrorException import scala.collection.mutable.ArrayBuffer override def createRecord(bytes: Array[Byte]): TableRecord = { val colByteLen = Dolphin.getString(bytes, 0, Dolphin.INT_LEN).toInt val colString = Dolphin.getString(bytes, Dolphin.INT_LEN, colByteLen) val colArray = if(colString.endsWith(Dolphin.COL_SPLIT)) colString.substring(0, colString.length -1).split(Dolphin.COL_SPLIT) else colString.split(Dolphin.COL_SPLIT) var index = Dolphin.INT_LEN + colByteLen val data = colArray.indices.map { i => val len = colArray(i).toInt val res = Dolphin.getString(bytes, index, len) index += len if(i >= metaData.columns.length) res else toValue(metaData.columns(i).dataType,res) }.toArray new TableRecord(data) } }
Example 93
Source File: RetryHandler.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.utils import com.webank.wedatasphere.linkis.common.exception.{DWCRetryException, FatalException} import org.apache.commons.lang.{ClassUtils => CommonClassUtils} import scala.collection.mutable.ArrayBuffer trait RetryHandler extends Logging { private var retryNum = 2 private var period = 100l private var maxPeriod = 1000l private val retryExceptions = ArrayBuffer[Class[_ <: Throwable]]() def setRetryNum(retryNum: Int): Unit = this.retryNum = retryNum def getRetryNum: Int = retryNum def setRetryPeriod(retryPeriod: Long): Unit = this.period = retryPeriod def getRetryPeriod: Long = period def setRetryMaxPeriod(retryMaxPeriod: Long): Unit = this.maxPeriod = retryMaxPeriod def getRetryMaxPeriod: Long = maxPeriod def addRetryException(t: Class[_ <: Throwable]): Unit = retryExceptions += t def getRetryExceptions = retryExceptions.toArray def exceptionCanRetry(t: Throwable): Boolean = !t.isInstanceOf[FatalException] && retryExceptions.exists(c => CommonClassUtils.isAssignable(t.getClass, c)) def nextInterval(attempt: Int): Long = { val interval = (this.period.toDouble * Math.pow(1.5D, (attempt - 1).toDouble)).toLong if (interval > this.maxPeriod) this.maxPeriod else interval } def retry[T](op: => T, retryName: String): T = { if(retryExceptions.isEmpty || retryNum <= 1) return op var retry = 0 var result = null.asInstanceOf[T] while(retry < retryNum && result == null) result = Utils.tryCatch(op) { t => retry += 1 if(retry >= retryNum) throw t else if(exceptionCanRetry(t)) { val retryInterval = nextInterval(retry) info(retryName + s" failed with ${t.getClass.getName}, wait ${ByteTimeUtils.msDurationToString(retryInterval)} for next retry. Retried $retry++ ...") Utils.tryQuietly(Thread.sleep(retryInterval)) null.asInstanceOf[T] } else throw t } result } }
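RetryHandler keeps the registered retryable exception classes in an ArrayBuffer and consults it on every failure before backing off exponentially. A condensed, runnable sketch of the same idea follows; it drops the Logging/Utils plumbing of the original, and the RetrySketch object, the retry signature and its defaults are illustrative rather than the project's API.

import scala.collection.mutable.ArrayBuffer

object RetrySketch {
  // Exception classes registered in an ArrayBuffer decide which failures are retried.
  private val retryExceptions = ArrayBuffer[Class[_ <: Throwable]]()
  def addRetryException(t: Class[_ <: Throwable]): Unit = retryExceptions += t

  private def canRetry(t: Throwable): Boolean =
    retryExceptions.exists(_.isAssignableFrom(t.getClass))

  // Retry `op` up to retryNum times with exponential backoff.
  def retry[T](op: => T, retryNum: Int = 3, periodMs: Long = 100L): T = {
    var attempt = 0
    var result: Option[T] = None
    while (result.isEmpty) {
      try result = Some(op)
      catch {
        case t: Throwable if canRetry(t) && attempt < retryNum - 1 =>
          attempt += 1
          Thread.sleep((periodMs * math.pow(1.5, (attempt - 1).toDouble)).toLong)
      }
    }
    result.get
  }

  def main(args: Array[String]): Unit = {
    addRetryException(classOf[IllegalArgumentException])
    var calls = 0
    val res = retry { calls += 1; if (calls < 3) throw new IllegalArgumentException("flaky") else "ok" }
    println(s"$res after $calls calls")  // ok after 3 calls
  }
}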
Example 94
Source File: ShutdownUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.utils import sun.misc.{Signal, SignalHandler} import scala.collection.mutable.ArrayBuffer object ShutdownUtils { private val shutdownRunners = ArrayBuffer[ShutdownRunner]() def addShutdownHook(runnable: Runnable): Unit = addShutdownHook(Int.MaxValue, runnable) def addShutdownHook(order: Int, runnable: Runnable): Unit = shutdownRunners synchronized shutdownRunners += new DefaultShutdownRunner(order, runnable) def addShutdownHook(hook: => Unit): Unit = addShutdownHook(Int.MaxValue, hook) def addShutdownHook(order: Int, hook: => Unit): Unit = shutdownRunners synchronized shutdownRunners += new FunctionShutdownRunner(order, hook) def addShutdownHook(shutdownRunner: ShutdownRunner): Unit = shutdownRunners synchronized shutdownRunners += shutdownRunner private val signals = Array("TERM", "HUP", "INT").map(new Signal(_)) private val signalHandler = new SignalHandler { override def handle(signal: Signal): Unit = { val hooks = shutdownRunners.sortBy(_.order).toArray.map{ case m: DefaultShutdownRunner => Utils.defaultScheduler.execute(m) m case m => val runnable = new DefaultShutdownRunner(m.order, m) Utils.defaultScheduler.execute(runnable) runnable } val startTime = System.currentTimeMillis ShutdownUtils synchronized { while(System.currentTimeMillis - startTime < 30000 && hooks.exists(!_.isCompleted)) ShutdownUtils.wait(3000) } System.exit(0) } } signals.foreach(Signal.handle(_, signalHandler)) } trait ShutdownRunner extends Runnable { val order: Int } class DefaultShutdownRunner(override val order: Int, runnable: Runnable) extends ShutdownRunner { private var completed = false override def run(): Unit = Utils.tryFinally(runnable.run()){ completed = true ShutdownUtils synchronized ShutdownUtils.notify() } def isCompleted = completed } class FunctionShutdownRunner(override val order: Int, hook: => Unit) extends ShutdownRunner { override def run(): Unit = hook }
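ShutdownUtils gathers shutdown hooks into an ArrayBuffer and runs them sorted by their order field when a signal arrives. The sketch below keeps only the buffer-and-sort part, without signal handling or the scheduler; the OrderedHooksSketch and Hook names are hypothetical.

import scala.collection.mutable.ArrayBuffer

object OrderedHooksSketch {
  // Hooks are registered into an ArrayBuffer and executed sorted by order.
  final case class Hook(order: Int, run: () => Unit)
  private val hooks = ArrayBuffer[Hook]()

  def addHook(order: Int, body: => Unit): Unit =
    hooks.synchronized { hooks += Hook(order, () => body) }

  def runAll(): Unit = hooks.sortBy(_.order).foreach(_.run())

  def main(args: Array[String]): Unit = {
    addHook(10, println("close connections"))
    addHook(1, println("flush buffers"))
    runAll()  // flush buffers, then close connections
  }
}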
Example 95
Source File: DWCArgumentsParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.conf import org.apache.commons.lang.StringUtils import scala.collection.{JavaConversions, mutable} import scala.collection.mutable.ArrayBuffer object DWCArgumentsParser { protected val DWC_CONF = "--dwc-conf" protected val SPRING_CONF = "--spring-conf" private var dwcOptionMap = Map.empty[String, String] private[linkis] def setDWCOptionMap(dwcOptionMap: Map[String, String]) = this.dwcOptionMap = dwcOptionMap def getDWCOptionMap = dwcOptionMap def parse(args: Array[String]): DWCArgumentsParser = { val keyValueRegex = "([^=]+)=(.+)".r var i = 0 val optionParser = new DWCArgumentsParser while(i < args.length) { args(i) match { case DWC_CONF | SPRING_CONF => args(i + 1) match { case keyValueRegex(key, value) => optionParser.setConf(args(i), key, value) i += 1 case _ => throw new IllegalArgumentException("illegal commond line, format: --conf key=value.") } case _ => throw new IllegalArgumentException(s"illegal commond line, ${args(i)} cannot recognize.") } i += 1 } optionParser.validate() optionParser } def formatToArray(optionParser: DWCArgumentsParser): Array[String] = { val options = ArrayBuffer[String]() def write(confMap: Map[String, String], optionType: String): Unit = confMap.foreach { case (key, value) => if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) { options += optionType options += (key + "=" + value) } } write(optionParser.getDWCConfMap, DWC_CONF) write(optionParser.getSpringConfMap, SPRING_CONF) options.toArray } def formatToArray(springOptionMap: Map[String, String], dwcOptionMap: Map[String, String]): Array[String] = formatToArray(new DWCArgumentsParser().setSpringConf(springOptionMap).setDWCConf(dwcOptionMap)) def format(optionParser: DWCArgumentsParser): String = formatToArray(optionParser).mkString(" ") def format(springOptionMap: Map[String, String], dwcOptionMap: Map[String, String]): String = formatToArray(springOptionMap, dwcOptionMap).mkString(" ") def formatSpringOptions(springOptionMap: Map[String, String]): Array[String] = { val options = ArrayBuffer[String]() springOptionMap.foreach { case (key, value) => if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) { options += ("--" + key + "=" + value) } } options.toArray } } class DWCArgumentsParser { import DWCArgumentsParser._ private val dwcOptionMap = new mutable.HashMap[String, String]() private val springOptionMap = new mutable.HashMap[String, String]() def getSpringConfMap = springOptionMap.toMap def getSpringConfs = JavaConversions.mapAsJavaMap(springOptionMap) def getDWCConfMap = dwcOptionMap.toMap def setConf(optionType: String, key: String, value: String) = { optionType match { case DWC_CONF => dwcOptionMap += key -> value case SPRING_CONF => springOptionMap += key -> value } this } def setSpringConf(optionMap: Map[String, String]): DWCArgumentsParser = { if(optionMap != null) this.springOptionMap ++= optionMap this } def setDWCConf(optionMap: Map[String, String]): DWCArgumentsParser = { if(optionMap != null) this.dwcOptionMap ++= optionMap this } def validate() = {} }
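formatToArray above builds a command-line Array[String] by appending option-type/key=value pairs to an ArrayBuffer and converting it at the end. A minimal standalone version of that pattern (without the parser class or commons-lang StringUtils; the ArgsFormatSketch name is illustrative) might look like this:

import scala.collection.mutable.ArrayBuffer

object ArgsFormatSketch {
  // Turn a config map into "--spring-conf key=value" style arguments,
  // accumulating them in an ArrayBuffer before converting to an Array.
  def formatToArray(confMap: Map[String, String], optionType: String): Array[String] = {
    val options = ArrayBuffer[String]()
    confMap.foreach { case (key, value) =>
      if (key.nonEmpty && value.nonEmpty) {
        options += optionType
        options += s"$key=$value"
      }
    }
    options.toArray
  }

  def main(args: Array[String]): Unit =
    println(formatToArray(Map("server.port" -> "9001"), "--spring-conf").mkString(" "))
  // --spring-conf server.port=9001
}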
Example 96
Source File: _03_TraitsAsStackableModifications.scala From LearningScala with Apache License 2.0 | 5 votes |
package _033_traits import scala.collection.mutable.ArrayBuffer class MyQueue extends BasicIntQueue with Doubling def main(args: Array[String]): Unit = { val queue = new BasicIntQueue queue.put(-10) queue.put(20) println(s"queue.get(): ${queue.get()}") println(s"queue.get(): ${queue.get()}") println() val myQueue = new MyQueue myQueue.put(-10) myQueue.put(20) println(s"myQueue.get(): ${myQueue.get()}") println(s"myQueue.get(): ${myQueue.get()}") println() // You could supply "BasicIntQueue with Doubling" directly to new instead of defining a named class. val queueWithDoubling = new BasicIntQueue with Doubling queueWithDoubling.put(-10) queueWithDoubling.put(20) println(s"queueWithDoubling.get(): ${queueWithDoubling.get()}") println(s"queueWithDoubling.get(): ${queueWithDoubling.get()}") println() // ORDER MATTERS examples: // You can now pick and choose which traits you want for a particular queue. val q1 = new BasicIntQueue with Incrementing with Filtering q1.put(-1) q1.put(0) q1.put(1) println(s"q1.get(): ${q1.get()}") println(s"q1.get(): ${q1.get()}") // println(s"q1.get(): ${q1.get()}") // will give an error println() val q2 = new BasicIntQueue with Filtering with Incrementing q2.put(-1) q2.put(0) q2.put(1) println(s"q2.get(): ${q2.get()}") println(s"q2.get(): ${q2.get()}") println(s"q2.get(): ${q2.get()}") println() } }
Example 97
Source File: _10_MutableCollections.scala From LearningScala with Apache License 2.0 | 5 votes |
package _020_collections object _10_MutableCollections { def main(args: Array[String]): Unit = { println("===== List buffers =====") listBufferExample() println() println("===== Array buffers =====") println(arrayBufferExample()) println() println("===== Mutable Sets =====") mutableSetExample() println() println("===== Mutable Maps =====") mutableMapExample() } private def mutableMapExample(): Unit = { import scala.collection.mutable val map = mutable.Map.empty[String, Int] println(map) map("hello") = 1 map("there") = 2 println(map) println(map("hello")) println("======") val nums = mutable.Map("i" -> 1, "ii" -> 2) println(nums) nums += ("vi" -> 6) println(nums) nums -= "ii" println(nums) nums ++= List("iii" -> 3, "v" -> 5) println(nums) nums --= List("i", "ii") println(nums) println("=====") println(s"nums.size: ${nums.size}") print("nums.contains(\"ii\"): ") println(nums.contains("ii")) print("nums(\"iii\"): ") println(nums("iii")) println(s"nums.keys ==> ${nums.keys}") println(s"nums.keySet ==> ${nums.keySet}") println(s"nums.values ==> ${nums.values}") println(s"nums.isEmpty: ${nums.isEmpty}") } def arrayBufferExample(): List[Int] = { import scala.collection.mutable.ArrayBuffer val ab = ArrayBuffer[Int](10, 20) ab += 30 ab += 40 ab.prepend(5) ab.toList //return immutable } private def listBufferExample(): Unit = { import scala.collection.mutable.ListBuffer val listBuffer = new ListBuffer[Int] listBuffer += 1 listBuffer += 2 println(listBuffer) 3 +=: listBuffer println(listBuffer) val list = listBuffer.toList println(list) } private def mutableSetExample(): Unit = { import scala.collection.mutable val emptySet = mutable.Set.empty[Int] println(emptySet) val nums = mutable.Set(1, 2, 3) println(nums) nums += 5 println(nums) nums -= 3 println(nums) nums ++= List(5, 6) println(nums) nums --= List(1, 2) println(nums) println(nums & Set(1, 3, 5, 7)) // intersection of two sets nums.clear() println(nums) } }
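The arrayBufferExample method above touches only append and prepend. For quick reference, here is a compact sketch of the most common ArrayBuffer operations (expected contents are noted in the comments):

import scala.collection.mutable.ArrayBuffer

object ArrayBufferBasicsSketch {
  def main(args: Array[String]): Unit = {
    val ab = ArrayBuffer[Int](10, 20)
    ab += 30            // append a single element
    ab ++= Seq(40, 50)  // append a whole collection
    ab.prepend(5)       // add at the front
    ab.insert(2, 15)    // insert at index 2
    ab.remove(0)        // drop the first element
    println(ab)         // ArrayBuffer(10, 15, 20, 30, 40, 50)
    println(ab.toList)  // immutable snapshot of the buffer
  }
}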
Example 98
Source File: TestableQueueInputDStream.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.dstream.InputDStream import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag class TestableQueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
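The compute method above drains a mutable Queue into an ArrayBuffer under synchronization, either one RDD at a time or all at once. The Spark-free sketch below isolates that drain step; the drain name and Seq return type are illustrative only.

import scala.collection.mutable.{ArrayBuffer, Queue}

object QueueDrainSketch {
  // Either take one element or drain the whole queue into a buffer,
  // mirroring the branching in TestableQueueInputDStream.compute.
  def drain[A](queue: Queue[A], oneAtATime: Boolean): Seq[A] = {
    val buffer = new ArrayBuffer[A]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) buffer += queue.dequeue()
      else { buffer ++= queue; queue.clear() }
    }
    buffer.toSeq
  }

  def main(args: Array[String]): Unit = {
    val q = Queue(1, 2, 3)
    println(drain(q, oneAtATime = true))   // just the first element
    println(drain(q, oneAtATime = false))  // everything that is left: 2 and 3
  }
}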
Example 99
Source File: PruneWorker.scala From spatial with MIT License | 5 votes |
package spatial.dse import java.util.concurrent.LinkedBlockingQueue import argon.State import spatial.metadata.params._ import spatial.metadata.bounds._ import scala.collection.mutable.ArrayBuffer case class PruneWorker( start: Int, size: Int, prods: Seq[BigInt], dims: Seq[BigInt], indexedSpace: Seq[(Domain[_],Int)], restricts: Set[Restrict], queue: LinkedBlockingQueue[Seq[Int]] )(implicit state: State) extends Runnable { private def isLegalSpace(): Boolean = restricts.forall(_.evaluate()) def run(): Unit = { println(s"Searching from $start until ${start+size}") val pts = (start until (start+size)).filter{i => indexedSpace.foreach{case (domain,d) => domain.set( ((i / prods(d)) % dims(d)).toInt ) } isLegalSpace() } queue.put(pts) } }
Example 100
Source File: Flows.scala From spatial with MIT License | 5 votes |
package argon import scala.collection.mutable.{ArrayBuffer,HashSet} import utils.Instrument trait FlowRules { val IR: State } class Flows { private var rules = ArrayBuffer[(String,PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit])]() private[argon] var names = HashSet[String]() lazy val instrument = new Instrument("flows") def prepend(name: String, func: PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit]): Unit = { rules.prepend((name,func)) names += name } def add(name: String, func: PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit]): Unit = { rules += ((name,func)) names += name } def remove(name: String): Unit = { val idx = rules.indexWhere(_._1 == name) rules.remove(idx) names.remove(name) } def apply[A](lhs: Sym[A], rhs: Op[A])(implicit ctx: SrcCtx, state: State): Unit = { val tuple = (lhs,rhs,ctx,state) rules.foreach{case (name,rule) => if (rule.isDefinedAt(tuple)) { instrument(name){ rule.apply(tuple) } } } } def save(): Flows = { val flows = new Flows flows.rules ++= rules flows.names ++= names flows } def restore(flow: Flows): Unit = { rules = flow.rules names = flow.names } }
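Flows stores its rules as (name, function) pairs in an ArrayBuffer so they can be prepended, appended, removed by name and applied in order. A stripped-down registry with the same ArrayBuffer mechanics is sketched below, using a plain Int => Unit rule type instead of the compiler's PartialFunction signature; all names here are illustrative.

import scala.collection.mutable.{ArrayBuffer, HashSet}

object RuleRegistrySketch {
  // Rules are kept in registration order in an ArrayBuffer; a HashSet tracks names.
  private val rules = ArrayBuffer[(String, Int => Unit)]()
  private val names = HashSet[String]()

  def add(name: String, rule: Int => Unit): Unit = { rules += ((name, rule)); names += name }
  def prepend(name: String, rule: Int => Unit): Unit = { rules.prepend((name, rule)); names += name }
  def remove(name: String): Unit = {
    val idx = rules.indexWhere(_._1 == name)
    if (idx >= 0) { rules.remove(idx); names.remove(name) }
  }
  def run(x: Int): Unit = rules.foreach { case (_, rule) => rule(x) }

  def main(args: Array[String]): Unit = {
    add("print", x => println(s"saw $x"))
    prepend("double", x => println(s"doubled ${x * 2}"))
    run(21)           // "double" runs first, then "print"
    remove("double")
    run(21)           // only "print" remains
  }
}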
Example 101
Source File: Rewrites.scala From spatial with MIT License | 5 votes |
package argon import utils.implicits.collections._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer trait RewriteRules { val IR: State } class Rewrites { type RewriteRule = PartialFunction[(Op[_],SrcCtx,State),Option[Sym[_]]] private def keyOf[A<:Op[_]:Manifest] = manifest[A].runtimeClass.asInstanceOf[Class[A]] // Roughly O(G), where G is the total number of global rewrite rules // When possible, use rules instead of globals private var globals: ArrayBuffer[RewriteRule] = ArrayBuffer.empty // Roughly O(R), where R is the number of rules for a specific node class private val rules: mutable.HashMap[Class[_], ArrayBuffer[RewriteRule]] = mutable.HashMap.empty private[argon] val names: mutable.HashSet[String] = mutable.HashSet.empty def rule(op: Op[_]): Seq[RewriteRule] = rules.getOrElse(op.getClass, Nil) def addGlobal(name: String, rule: RewriteRule): Unit = if (!names.contains(name)) { names += name globals += rule } def add[O<:Op[_]:Manifest](name: String, rule: RewriteRule): Unit = if (!names.contains(name)) { names += name val key = keyOf[O] val pfs = rules.getOrElseAdd(key, () => ArrayBuffer.empty[RewriteRule]) pfs += rule } private def applyRule[A:Type](op: Op[A], ctx: SrcCtx, state: State, rule: RewriteRule): Option[A] = { rule.apply((op,ctx,state)) match { case Some(s) if s.tp <:< Type[A] => Some(s.asInstanceOf[A]) case Some(s) => None case _ => None } } def apply[A:Type](op: Op[A])(implicit ctx: SrcCtx, state: State): Option[A] = { Option(op.rewrite) .orElse{ rule(op).mapFind{rule => applyRule[A](op,ctx,state, rule) } } .orElse{ globals.mapFind{rule => applyRule[A](op,ctx,state, rule) } }.map { op2 => if (state.config.enLog) { dbgs(s"Rewrite $op => $op2") } op2 } } }
Example 102
Source File: BitTest.scala From spatial with MIT License | 5 votes |
package spatial.tests.compiler import spatial.dsl._ import scala.collection.mutable.ArrayBuffer @spatial class BitTest extends SpatialTest { override def backends = DISABLED // Returns a random number in [min,max) def rand(max: gen.Int, min: gen.Int): gen.Int = scala.util.Random.nextInt(max-min)+min def opp(x: Bit, y: Bit, op: gen.Int): Bit = op match { case 0 | 1 | 2 => x & y case 3 | 4 | 5 => x | y case 6 | 7 | 8 => x !== y case 9 | 10 | 11 => x === y case 12 => !x case 13 => !y } def main(args: Array[String]): Void = { Foreach(0 until 32){i => val bits: List[Bit] = List.fill(32){ random[Bit] } var layers: ArrayBuffer[List[Bit]] = ArrayBuffer(bits) (0 until 64).meta.foreach{i => val layer = List.fill(200){ val l1 = i //rand(layers.length,0) val l2 = i //rand(layers.length,0) val p1 = rand(layers(l1).length, 0) val p2 = rand(layers(l2).length, 0) val op = rand(14,0) val x = layers(l1).apply(p1) val y = layers(l2).apply(p2) opp(x,y,op) } layers += layer println(r"[$i] 1: ${layer(1)}, 3: ${layer(3)}, 5: ${layer(5)}") } } } }
Example 103
Source File: TemplateRunner.scala From spatial with MIT License | 5 votes |
package fringe.test import java.io.File import scala.collection.mutable.ArrayBuffer import scala.util.Properties.envOrElse object TemplateRunner { def deleteRecursively(file: File): Unit = { if (file.isDirectory) file.listFiles.foreach(deleteRecursively) if (file.exists && !file.delete) throw new Exception(s"Unable to delete ${file.getAbsolutePath}") } def apply(templateMap: Map[String, String => Boolean], args: Array[String]): Unit = { // Choose the default backend based on what is available. lazy val firrtlTerpBackendAvailable: Boolean = { try { val cls = Class.forName("chisel3.iotesters.FirrtlTerpBackend") cls != null } catch { case e: Throwable => false } } lazy val defaultBackend = if (firrtlTerpBackendAvailable) "firrtl" else "" val backendName = envOrElse("TESTER_BACKENDS", defaultBackend).split(" ").head val tempDir = s"""${envOrElse("NEW_TEMPLATES_HOME", "tmp")}/test_run_dir/""" val specificRegex = "(.*[0-9]+)".r val problemsToRun = if (args.isEmpty) { templateMap.keys.toSeq.sorted.toArray // Run all by default } else { args.map { arg => arg match { case "all" => templateMap.keys.toSeq.sorted // Run all case specificRegex(c) => List(c).toSeq // Run specific test case _ => // Figure out tests that match this template and run all val tempRegex = s"(${arg}[0-9]+)".r templateMap.keys.toSeq.sorted.filter(tempRegex.pattern.matcher(_).matches) }}.flatten.toArray } var successful = 0 var passedTests:List[String] = List() val errors = new ArrayBuffer[String] for(testName <- problemsToRun) { // Wipe tempdir for consecutive tests of same module deleteRecursively(new File(tempDir)) templateMap.get(testName) match { case Some(test) => println(s"Starting template $testName") try { if(test(backendName)) { successful += 1 passedTests = passedTests :+ s"$testName" } else { errors += s"Template $testName: test error occurred" } } catch { case exception: Exception => exception.printStackTrace() errors += s"Template $testName: exception ${exception.getMessage}" case t : Throwable => errors += s"Template $testName: throwable ${t.getMessage}" } case _ => errors += s"Bad template name: $testName" } } if(successful > 0) { println(s"""Templates passing: $successful (${passedTests.mkString(", ")})""") } if(errors.nonEmpty) { println("=" * 80) println(s"Errors: ${errors.length}: in the following templates") println(errors.mkString("\n")) println("=" * 80) System.exit(1) } } }
Example 104
Source File: AvroSchemaMerge.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import com.sksamuel.exts.StringOption import org.apache.avro.Schema import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer object AvroSchemaMerge { def apply(name: String, namespace: String, schemas: List[Schema]): Schema = { require(schemas.forall(_.getType == Schema.Type.RECORD), "Can only merge records") // documentations can just be a concat val doc = schemas.map(_.getDoc).filter(_ != null).mkString("; ") // simple impl to start: take all the fields from the first schema, and then add in the missing ones // from second 2 and so on val fields = new ArrayBuffer[Schema.Field]() schemas.foreach { schema => schema.getFields.asScala.filterNot { field => fields.exists(_.name() == field.name) }.foreach { field => // avro is funny about sharing fields, so need to copy it val copy = new Schema.Field(field.name(), field.schema(), StringOption(field.doc).orNull, field.defaultVal) fields.append(copy) } } val schema = Schema.createRecord(name, if (doc.isEmpty()) null else doc, namespace, false) schema.setFields(fields.result().asJava) schema } }
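AvroSchemaMerge appends a field to its ArrayBuffer only if no field with the same name has been collected yet. The sketch below shows that first-wins merge on a simplified Field case class instead of Avro's Schema.Field, so it runs without the Avro dependency; the names are illustrative.

import scala.collection.mutable.ArrayBuffer

object FieldMergeSketch {
  // Simplified stand-in for avro Schema.Field.
  final case class Field(name: String, doc: String)

  // Take all fields from the first schema, then add fields from later schemas
  // whose names have not been seen yet.
  def merge(schemas: List[List[Field]]): List[Field] = {
    val fields = new ArrayBuffer[Field]()
    schemas.foreach { schema =>
      schema.filterNot(f => fields.exists(_.name == f.name)).foreach(fields += _)
    }
    fields.toList
  }

  def main(args: Array[String]): Unit = {
    val a = List(Field("id", "first"), Field("name", "first"))
    val b = List(Field("name", "second"), Field("age", "second"))
    println(merge(List(a, b)).map(_.name))  // List(id, name, age)
  }
}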
Example 105
Source File: JdbcPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.jdbc import java.sql.{Connection, PreparedStatement} import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.io.Using import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.jdbc.dialect.JdbcDialect import io.eels.datastream.{Publisher, Subscriber, Subscription} import scala.collection.mutable.ArrayBuffer class JdbcPublisher(connFn: () => Connection, query: String, bindFn: (PreparedStatement) => Unit, fetchSize: Int, dialect: JdbcDialect ) extends Publisher[Seq[Row]] with Timed with JdbcPrimitives with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(connFn()) { conn => logger.debug(s"Preparing query $query") using(conn.prepareStatement(query)) { stmt => stmt.setFetchSize(fetchSize) bindFn(stmt) logger.debug(s"Executing query $query") using(stmt.executeQuery()) { rs => val schema = schemaFor(dialect, rs) val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) val buffer = new ArrayBuffer[Row](fetchSize) while (rs.next && running.get) { val values = schema.fieldNames().map { name => val raw = rs.getObject(name) dialect.sanitize(raw) } buffer append Row(schema, values) if (buffer.size == fetchSize) { subscriber.next(buffer.toVector) buffer.clear() } } if (buffer.nonEmpty) subscriber.next(buffer.toVector) subscriber.completed() } } } } catch { case t: Throwable => subscriber.error(t) } } }
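JdbcPublisher (and the HBase and Orc examples that follow) all use the same batching idiom: append elements to an ArrayBuffer, emit and clear it whenever it reaches the batch size, and emit the remainder at the end. A generic, dependency-free sketch of that idiom; the batched name and signature are assumptions for illustration.

import scala.collection.mutable.ArrayBuffer

object BatchingSketch {
  // Push elements downstream in fixed-size batches, flushing the remainder at the end.
  def batched[A](items: Iterator[A], batchSize: Int)(emit: Vector[A] => Unit): Unit = {
    val buffer = new ArrayBuffer[A](batchSize)
    items.foreach { item =>
      buffer += item
      if (buffer.size == batchSize) {
        emit(buffer.toVector)
        buffer.clear()
      }
    }
    if (buffer.nonEmpty) emit(buffer.toVector)
  }

  def main(args: Array[String]): Unit =
    batched((1 to 7).iterator, 3)(batch => println(batch))
  // Vector(1, 2, 3), Vector(4, 5, 6), Vector(7)
}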
Example 106
Source File: HbasePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hbase import java.util import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.io.Using import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.datastream.{Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client.{Connection, Result, Scan} import scala.collection.mutable.ArrayBuffer class HbasePublisher(connection: Connection, schema: StructType, namespace: String, tableName: String, bufferSize: Int, maxRows: Long, scanner: Scan, implicit val serializer: HbaseSerializer) extends Publisher[Seq[Row]] with Timed with Using { private val table = connection.getTable(TableName.valueOf(namespace, tableName)) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(new CloseableIterator) { rowIter => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) val buffer = new ArrayBuffer[Row](bufferSize) while (rowIter.hasNext && running.get()) { buffer append rowIter.next() if (buffer.size == bufferSize) { subscriber.next(buffer.toVector) buffer.clear() } } if (buffer.nonEmpty) subscriber.next(buffer.toVector) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } class CloseableIterator extends Iterator[Row] with AutoCloseable { private val resultScanner = table.getScanner(scanner) private val resultScannerIter = resultScanner.iterator() private var rowCount = 0 private var iter: Iterator[Row] = Iterator.empty override def hasNext: Boolean = rowCount < maxRows && iter.hasNext || { if (rowCount < maxRows && resultScannerIter.hasNext) { iter = HBaseResultsIterator(schema, resultScannerIter) iter.hasNext } else false } override def next(): Row = { rowCount += 1 iter.next() } override def close(): Unit = { resultScanner.close() } } case class HBaseResultsIterator(schema: StructType, resultIter: util.Iterator[Result])(implicit serializer: HbaseSerializer) extends Iterator[Row] { override def hasNext: Boolean = resultIter.hasNext override def next(): Row = { val resultRow = resultIter.next() val values = schema.fields.map { field => if (!field.key) { val value = resultRow.getValue(field.columnFamily.getOrElse(sys.error(s"No Column Family defined for field '${field.name}'")).getBytes, field.name.getBytes) if (value != null) serializer.fromBytes(value, field.name, field.dataType) else null } else serializer.fromBytes(resultRow.getRow, field.name, field.dataType) } Row(schema, values) } } }
Example 107
Source File: OrcWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicInteger import java.util.function.IntUnaryOperator import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.exec.vector.ColumnVector import org.apache.orc.{OrcConf, OrcFile, TypeDescription} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer // performs the actual write out of orc data, to be used by an orc sink class OrcWriter(path: Path, structType: StructType, options: OrcWriteOptions)(implicit conf: Configuration) extends Logging { private val schema: TypeDescription = OrcSchemaFns.toOrcSchema(structType) logger.trace(s"Creating orc writer for schema $schema") private val batchSize = { val size = ConfigFactory.load().getInt("eel.orc.sink.batchSize") Math.max(Math.min(1024, size), 1) } logger.debug(s"Orc writer will use batchsize=$batchSize") private val buffer = new ArrayBuffer[Row](batchSize) private val serializers = schema.getChildren.asScala.map(OrcSerializer.forType).toArray private val batch = schema.createRowBatch(batchSize) OrcConf.COMPRESSION_STRATEGY.setString(conf, options.compressionStrategy.name) OrcConf.COMPRESS.setString(conf, options.compressionKind.name) options.encodingStrategy.map(_.name).foreach(OrcConf.ENCODING_STRATEGY.setString(conf, _)) options.compressionBufferSize.foreach(OrcConf.BUFFER_SIZE.setLong(conf, _)) private val woptions = OrcFile.writerOptions(conf).setSchema(schema) options.rowIndexStride.foreach { size => woptions.rowIndexStride(size) logger.debug(s"Using stride size = $size") } if (options.bloomFilterColumns.nonEmpty) { woptions.bloomFilterColumns(options.bloomFilterColumns.mkString(",")) logger.debug(s"Using bloomFilterColumns = $options.bloomFilterColumns") } private lazy val writer = OrcFile.createWriter(path, woptions) private val counter = new AtomicInteger(0) def write(row: Row): Unit = { buffer.append(row) if (buffer.size == batchSize) flush() } def records: Int = counter.get() def flush(): Unit = { def writecol[T <: ColumnVector](rowIndex: Int, colIndex: Int, row: Row): Unit = { val value = row.values(colIndex) val vector = batch.cols(colIndex).asInstanceOf[T] val serializer = serializers(colIndex).asInstanceOf[OrcSerializer[T]] serializer.writeToVector(rowIndex, vector, value) } // don't use foreach here, using old school for loops for perf for (rowIndex <- buffer.indices) { val row = buffer(rowIndex) for (colIndex <- batch.cols.indices) { writecol(rowIndex, colIndex, row) } } batch.size = buffer.size writer.addRowBatch(batch) counter.updateAndGet(new IntUnaryOperator { override def applyAsInt(operand: Int): Int = operand + batch.size }) buffer.clear() batch.reset() } def close(): Long = { if (buffer.nonEmpty) flush() writer.close() val count = writer.getNumberOfRows logger.info(s"Orc writer wrote $count rows") count } }
Example 108
Source File: SKRSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import java.util.concurrent.atomic.AtomicInteger import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer} import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.scalatest.concurrent.Eventually import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import scala.collection.mutable.ArrayBuffer import scala.util.Random import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec case class Foo(a: Int, b: String) trait SKRSpec extends AnyWordSpec with Matchers with BeforeAndAfterEach with BeforeAndAfterAll with Eventually { val sparkConf = new SparkConf() .setMaster("local[1]") .setAppName(getClass.getSimpleName) var ktu: KafkaTestUtils = _ override def beforeAll(): Unit = { ktu = new KafkaTestUtils ktu.setup() } override def afterAll(): Unit = { SKRSpec.callbackTriggerCount.set(0) if (ktu != null) { ktu.tearDown() ktu = null } } var topic: String = _ var ssc: StreamingContext = _ var spark: SparkSession = _ override def afterEach(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (spark != null) { spark.stop() spark = null } } override def beforeEach(): Unit = { ssc = new StreamingContext(sparkConf, Seconds(1)) spark = SparkSession.builder .config(sparkConf) .getOrCreate() topic = s"topic-${Random.nextInt()}" ktu.createTopics(topic) } def collect(ssc: StreamingContext, topic: String): ArrayBuffer[String] = { val kafkaParams = Map( "bootstrap.servers" -> ktu.brokerAddress, "auto.offset.reset" -> "earliest", "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> "test-collect" ) val results = new ArrayBuffer[String] KafkaUtils.createDirectStream[String, String]( ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](Set(topic), kafkaParams) ).map(_.value()) .foreachRDD { rdd => results ++= rdd.collect() () } results } val producerConfig = Map( "bootstrap.servers" -> "127.0.0.1:9092", "key.serializer" -> classOf[StringSerializer].getName, "value.serializer" -> classOf[StringSerializer].getName ) } object SKRSpec { val callbackTriggerCount = new AtomicInteger() }
Example 109
Source File: SidechainBlockInfo.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.chain import com.horizen.block.SidechainBlock import com.horizen.utils.{WithdrawalEpochInfo, WithdrawalEpochInfoSerializer} import com.horizen.vrf.{VrfOutput, VrfOutputSerializer} import scorex.core.NodeViewModifier import scorex.core.block.Block.Timestamp import scorex.core.consensus.ModifierSemanticValidity import scorex.core.serialization.{BytesSerializable, ScorexSerializer} import scorex.util.serialization.{Reader, Writer} import scorex.util.{ModifierId, bytesToId, idToBytes} import scala.collection.mutable.ArrayBuffer case class SidechainBlockInfo(height: Int, score: Long, parentId: ModifierId, timestamp: Timestamp, semanticValidity: ModifierSemanticValidity, mainchainHeaderHashes: Seq[MainchainHeaderHash], mainchainReferenceDataHeaderHashes: Seq[MainchainHeaderHash], withdrawalEpochInfo: WithdrawalEpochInfo, vrfOutputOpt: Option[VrfOutput], lastBlockInPreviousConsensusEpoch: ModifierId) extends BytesSerializable with LinkedElement[ModifierId] { override def getParentId: ModifierId = parentId override type M = SidechainBlockInfo override lazy val serializer: ScorexSerializer[SidechainBlockInfo] = SidechainBlockInfoSerializer override def bytes: Array[Byte] = SidechainBlockInfoSerializer.toBytes(this) } object SidechainBlockInfo { def mainchainHeaderHashesFromBlock(sidechainBlock: SidechainBlock): Seq[MainchainHeaderHash] = { sidechainBlock.mainchainHeaders.map(header => byteArrayToMainchainHeaderHash(header.hash)) } def mainchainReferenceDataHeaderHashesFromBlock(sidechainBlock: SidechainBlock): Seq[MainchainHeaderHash] = { sidechainBlock.mainchainBlockReferencesData.map(data => byteArrayToMainchainHeaderHash(data.headerHash)) } } object SidechainBlockInfoSerializer extends ScorexSerializer[SidechainBlockInfo] { override def serialize(obj: SidechainBlockInfo, w: Writer): Unit = { w.putInt(obj.height) w.putLong(obj.score) w.putBytes(idToBytes(obj.parentId)) w.putLong(obj.timestamp) w.put(obj.semanticValidity.code) w.putInt(obj.mainchainHeaderHashes.size) obj.mainchainHeaderHashes.foreach(id => w.putBytes(id.data)) w.putInt(obj.mainchainReferenceDataHeaderHashes.size) obj.mainchainReferenceDataHeaderHashes.foreach(id => w.putBytes(id.data)) WithdrawalEpochInfoSerializer.serialize(obj.withdrawalEpochInfo, w) w.putOption(obj.vrfOutputOpt){case (writer: Writer, vrfOutput: VrfOutput) => VrfOutputSerializer.getSerializer.serialize(vrfOutput, writer) } w.putBytes(idToBytes(obj.lastBlockInPreviousConsensusEpoch)) } private def readMainchainHeadersHashes(r: Reader): Seq[MainchainHeaderHash] = { val references: ArrayBuffer[MainchainHeaderHash] = ArrayBuffer() val length = r.getInt() (0 until length).foreach(_ => { val bytes = r.getBytes(mainchainHeaderHashSize) references.append(byteArrayToMainchainHeaderHash(bytes)) }) references } override def parse(r: Reader): SidechainBlockInfo = { val height = r.getInt() val score = r.getLong() val parentId = bytesToId(r.getBytes(NodeViewModifier.ModifierIdSize)) val timestamp = r.getLong() val semanticValidityCode = r.getByte() val mainchainHeaderHashes = readMainchainHeadersHashes(r) val mainchainReferenceDataHeaderHashes = readMainchainHeadersHashes(r) val withdrawalEpochInfo = WithdrawalEpochInfoSerializer.parse(r) val vrfOutputOpt = r.getOption(VrfOutputSerializer.getSerializer.parse(r)) val lastBlockInPreviousConsensusEpoch = bytesToId(r.getBytes(NodeViewModifier.ModifierIdSize)) SidechainBlockInfo(height, score, parentId, timestamp, ModifierSemanticValidity.restoreFromCode(semanticValidityCode), mainchainHeaderHashes, 
mainchainReferenceDataHeaderHashes, withdrawalEpochInfo, vrfOutputOpt, lastBlockInPreviousConsensusEpoch) } }
Example 110
Source File: IODBStoreAdapter.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.storage import java.util.{ArrayList => JArrayList, List => JList} import java.util.Optional import com.horizen.utils.Pair import scala.collection.JavaConverters._ import io.iohk.iodb.Store import com.horizen.utils.ByteArrayWrapper import scala.collection.mutable.ArrayBuffer class IODBStoreAdapter (store : Store) extends Storage { override def get(key: ByteArrayWrapper): Optional[ByteArrayWrapper] = { val value = store.get(key) if (value.isEmpty) Optional.empty() else Optional.of(new ByteArrayWrapper(value.get)) } override def getOrElse(key: ByteArrayWrapper, defaultValue: ByteArrayWrapper): ByteArrayWrapper = { val value = store.get(key) if (value.isEmpty) defaultValue else new ByteArrayWrapper(value.get) } override def get(keys: JList[ByteArrayWrapper]): JList[Pair[ByteArrayWrapper, Optional[ByteArrayWrapper]]] = { val keysList = new ArrayBuffer[ByteArrayWrapper]() val valList = store.get(keys.asScala) val values = new JArrayList[Pair[ByteArrayWrapper,Optional[ByteArrayWrapper]]]() for (v <- valList) if (v._2.isDefined) values.add(new Pair[ByteArrayWrapper,Optional[ByteArrayWrapper]](new ByteArrayWrapper(v._1), Optional.of(new ByteArrayWrapper(v._2.get)))) else values.add(new Pair[ByteArrayWrapper,Optional[ByteArrayWrapper]](new ByteArrayWrapper(v._1), Optional.empty())) values } override def getAll: JList[Pair[ByteArrayWrapper, ByteArrayWrapper]] = { val values = new JArrayList[Pair[ByteArrayWrapper,ByteArrayWrapper]]() for ( i <- store.getAll()) values.add(new Pair[ByteArrayWrapper,ByteArrayWrapper](new ByteArrayWrapper(i._1), new ByteArrayWrapper(i._2))) values } override def lastVersionID(): Optional[ByteArrayWrapper] = { val value = store.lastVersionID if (value.isEmpty) Optional.empty() else Optional.of(new ByteArrayWrapper(value.get)) } override def update(version: ByteArrayWrapper, toUpdate: JList[Pair[ByteArrayWrapper, ByteArrayWrapper]], toRemove: JList[ByteArrayWrapper]): Unit = { val listToUpdate = new ArrayBuffer[Tuple2[ByteArrayWrapper,ByteArrayWrapper]]() for (r <- toUpdate.asScala) { listToUpdate.append(new Tuple2[ByteArrayWrapper, ByteArrayWrapper](r.getKey, r.getValue)) } store.update(version, toRemove.asScala, listToUpdate) } override def rollback(version : ByteArrayWrapper): Unit = { store.rollback(version) } override def rollbackVersions(): JList[ByteArrayWrapper] = { val versions = store.rollbackVersions() val value = new JArrayList[ByteArrayWrapper]() for (v <- versions) value.add(new ByteArrayWrapper(v)) value } override def isEmpty(): Boolean = !lastVersionID().isPresent override def close(): Unit = { store.close() } }
Example 111
Source File: StoreOpsTest.scala From fs2-blobstore with Apache License 2.0 | 5 votes |
package blobstore import java.nio.charset.Charset import java.nio.file.Files import java.util.concurrent.Executors import cats.effect.{Blocker, IO} import cats.effect.laws.util.TestInstances import cats.implicits._ import fs2.Pipe import org.scalatest.Assertion import org.scalatest.flatspec.AnyFlatSpec import implicits._ import org.scalatest.matchers.must.Matchers import scala.collection.mutable.ArrayBuffer import scala.concurrent.ExecutionContext class StoreOpsTest extends AnyFlatSpec with Matchers with TestInstances { implicit val cs = IO.contextShift(ExecutionContext.global) val blocker = Blocker.liftExecutionContext(ExecutionContext.fromExecutor(Executors.newCachedThreadPool)) behavior of "PutOps" it should "buffer contents and compute size before calling Store.put" in { val bytes: Array[Byte] = "AAAAAAAAAA".getBytes(Charset.forName("utf-8")) val store = DummyStore(_.size must be(Some(bytes.length))) fs2.Stream.emits(bytes).covary[IO].through(store.bufferedPut(Path("path/to/file.txt"), blocker)).compile.drain.unsafeRunSync() store.buf.toArray must be(bytes) } it should "upload a file from a nio Path" in { val bytes = "hello".getBytes(Charset.forName("utf-8")) val store = DummyStore(_.size must be(Some(bytes.length))) fs2.Stream.bracket(IO(Files.createTempFile("test-file", ".bin"))) { p => IO(p.toFile.delete).void }.flatMap { p => fs2.Stream.emits(bytes).covary[IO].through(fs2.io.file.writeAll(p, blocker)).drain ++ fs2.Stream.eval(store.put(p, Path("path/to/file.txt"), blocker)) }.compile.drain.unsafeRunSync() store.buf.toArray must be(bytes) } } final case class DummyStore(check: Path => Assertion) extends Store[IO] { val buf = new ArrayBuffer[Byte]() override def put(path: Path): Pipe[IO, Byte, Unit] = { check(path) in => { buf.appendAll(in.compile.toVector.unsafeRunSync()) fs2.Stream.emit(()) } } override def list(path: Path): fs2.Stream[IO, Path] = ??? override def get(path: Path, chunkSize: Int): fs2.Stream[IO, Byte] = ??? override def move(src: Path, dst: Path): IO[Unit] = ??? override def copy(src: Path, dst: Path): IO[Unit] = ??? override def remove(path: Path): IO[Unit] = ??? }
Example 112
Source File: MetadataTransformUtils.scala From automl with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.operator import org.apache.spark.sql.types.{MetadataBuilder, StructField} import scala.collection.mutable.ArrayBuffer def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = { if (fields.length < 2) { throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2") } var res = Array[String]() if (fields.head.metadata.contains(DERIVATION)) { res = fields.head.metadata.getStringArray(DERIVATION) } else { res = createDerivation(numFeatures) } for (i <- 1 until fields.length) { if (fields(i).metadata.contains(DERIVATION)) { res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION)) } else { res = cartesianWithArray(res, createDerivation(numFeatures)) } } val metadata = fields.last.metadata new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res) } }
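vectorCartesianTransform relies on cartesianWithArray and createDerivation helpers that are not included in this excerpt. Purely as an illustration of how such a helper might accumulate derivation names with an ArrayBuffer — the combination format and all names below are assumptions, not the project's actual output:

import scala.collection.mutable.ArrayBuffer

object CartesianDerivationSketch {
  // Hypothetical cartesianWithArray: record every pairing of left and right names.
  def cartesianWithArray(left: Array[String], right: Array[String]): Array[String] = {
    val res = ArrayBuffer[String]()
    for (l <- left; r <- right) res += s"($l x $r)"
    res.toArray
  }

  def main(args: Array[String]): Unit =
    println(cartesianWithArray(Array("f0", "f1"), Array("g0")).mkString(", "))
  // (f0 x g0), (f1 x g0)
}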
Example 113
Source File: Message.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.nio import java.net.InetSocketAddress import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import com.google.common.base.Charsets.UTF_8 import org.apache.spark.util.Utils private[nio] abstract class Message(val typ: Long, val id: Int) { var senderAddress: InetSocketAddress = null var started = false var startTime = -1L var finishTime = -1L var isSecurityNeg = false var hasError = false def size: Int def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] def timeTaken(): String = (finishTime - startTime).toString + " ms" override def toString: String = { this.getClass.getSimpleName + "(id = " + id + ", size = " + size + ")" } } private[nio] object Message { val BUFFER_MESSAGE = 1111111111L var lastId = 1 def getNewId(): Int = synchronized { lastId += 1 if (lastId == 0) { lastId += 1 } lastId } def createBufferMessage(dataBuffers: Seq[ByteBuffer], ackId: Int): BufferMessage = { if (dataBuffers == null) { return new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer], ackId) } if (dataBuffers.exists(_ == null)) { throw new Exception("Attempting to create buffer message with null buffer") } new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer] ++= dataBuffers, ackId) } def createBufferMessage(dataBuffers: Seq[ByteBuffer]): BufferMessage = createBufferMessage(dataBuffers, 0) def createBufferMessage(dataBuffer: ByteBuffer, ackId: Int): BufferMessage = { if (dataBuffer == null) { //ByteBuffer.allocate在能够读和写之前,必须有一个缓冲区,用静态方法 allocate() 来分配缓冲区 createBufferMessage(Array(ByteBuffer.allocate(0)), ackId) } else { createBufferMessage(Array(dataBuffer), ackId) } } def createBufferMessage(dataBuffer: ByteBuffer): BufferMessage = createBufferMessage(dataBuffer, 0) def createBufferMessage(ackId: Int): BufferMessage = { createBufferMessage(new Array[ByteBuffer](0), ackId) } def createErrorMessage(exception: Exception, ackId: Int): BufferMessage = { val exceptionString = Utils.exceptionString(exception) val serializedExceptionString = ByteBuffer.wrap(exceptionString.getBytes(UTF_8)) val errorMessage = createBufferMessage(serializedExceptionString, ackId) errorMessage.hasError = true errorMessage } def create(header: MessageChunkHeader): Message = { val newMessage: Message = header.typ match { case BUFFER_MESSAGE => new BufferMessage(header.id, //ByteBuffer.allocate在能够读和写之前,必须有一个缓冲区,用静态方法 allocate() 来分配缓冲区 ArrayBuffer(ByteBuffer.allocate(header.totalSize)), header.other) } newMessage.hasError = header.hasError newMessage.senderAddress = header.address newMessage } }
Example 114
Source File: ApplicationInfo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.util.Date import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.deploy.ApplicationDescription import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils private[spark] class ApplicationInfo( val startTime: Long, val id: String, val desc: ApplicationDescription, val submitDate: Date, val driver: RpcEndpointRef, defaultCores: Int) extends Serializable { //枚举类型赋值 @transient var state: ApplicationState.Value = _ @transient var executors: mutable.HashMap[Int, ExecutorDesc] = _ @transient var removedExecutors: ArrayBuffer[ExecutorDesc] = _ @transient var coresGranted: Int = _ @transient var endTime: Long = _ @transient var appSource: ApplicationSource = _ // A cap on the number of executors this application can have at any given time. //执行者的数量这个应用程序可以在任何给定的时间 // By default, this is infinite. Only after the first allocation request is issued by the // application will this be set to a finite value. This is used for dynamic allocation. //默认情况下,这是无限的,只有在应用程序发出第一个分配请求之后,这将被设置为有限的值,这用于动态分配 @transient private[master] var executorLimit: Int = _ @transient private var nextExecutorId: Int = _ init() //初始化方法 private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() init() } private[deploy] def getExecutorLimit: Int = executorLimit def duration: Long = { if (endTime != -1) { endTime - startTime } else { System.currentTimeMillis() - startTime } } }
Example 115
Source File: Schedulable.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.mutable.ArrayBuffer import org.apache.spark.scheduler.SchedulingMode.SchedulingMode private[spark] trait Schedulable { var parent: Pool // child queues def schedulableQueue: ConcurrentLinkedQueue[Schedulable] def schedulingMode: SchedulingMode def weight: Int def minShare: Int def runningTasks: Int def priority: Int def stageId: Int def name: String def addSchedulable(schedulable: Schedulable): Unit def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable def executorLost(executorId: String, host: String): Unit def checkSpeculatableTasks(): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] }
Example 116
Source File: ByteArrayChunkOutputStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.util.io import java.io.OutputStream import scala.collection.mutable.ArrayBuffer private var position = chunkSize override def write(b: Int): Unit = { allocateNewChunkIfNeeded() //注意前套数组取值方式 chunks(lastChunkIndex)(position) = b.toByte position += 1 } override def write(bytes: Array[Byte], off: Int, len: Int): Unit = { var written = 0 while (written < len) { allocateNewChunkIfNeeded() val thisBatch = math.min(chunkSize - position, len - written) System.arraycopy(bytes, written + off, chunks(lastChunkIndex), position, thisBatch) written += thisBatch position += thisBatch } } @inline private def allocateNewChunkIfNeeded(): Unit = { if (position == chunkSize) { chunks += new Array[Byte](chunkSize) lastChunkIndex += 1 position = 0 } } def toArrays: Array[Array[Byte]] = { if (lastChunkIndex == -1) { new Array[Array[Byte]](0) } else { // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk. // An alternative would have been returning an array of ByteBuffers, with the last buffer // bounded to only the last chunk's position. However, given our use case in Spark (to put // the chunks in block manager), only limiting the view bound of the buffer would still // require the block manager to store the whole chunk. //将第一个n-1块复制到输出,然后创建一个适合最后一个块的数组。一个替代方法是返回一个ByteBuffers数组,最后一个缓冲区 //仅限于最后一个块的位置。 但是,考虑到我们在Spark中的用例(put块块中的块管理器),只会限制缓冲区的视图边界 //要求块管理器存储整个块。 val ret = new Array[Array[Byte]](chunks.size) for (i <- 0 until chunks.size - 1) { ret(i) = chunks(i) } if (position == chunkSize) { ret(lastChunkIndex) = chunks(lastChunkIndex) } else { ret(lastChunkIndex) = new Array[Byte](position) System.arraycopy(chunks(lastChunkIndex), 0, ret(lastChunkIndex), 0, position) } ret } } }
Example 117
Source File: MapPartitionsWithPreparationRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Partition, Partitioner, TaskContext} override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val prepared = if (preparedArguments.isEmpty) { preparePartition() } else { preparedArguments.remove(0) } val parentIterator = firstParent[T].iterator(partition, context) executePartition(context, partition.index, prepared, parentIterator) } }
Example 118
Source File: UnionRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient rdd: RDD[T], val parentRddIndex: Int, @transient parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization //在任务序列化时更新对父拆分的引用 parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 119
Source File: hbaseCommands.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hbase._ import org.apache.spark.sql.hbase.util.DataTypeUtils import org.apache.spark.sql.types._ import scala.collection.mutable.ArrayBuffer @DeveloperApi case class AlterDropColCommand(namespace: String, tableName: String, columnName: String) extends RunnableCommand { def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog] .alterTableDropNonKey(namespace, tableName, columnName) sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog].stopAdmin() Seq.empty[Row] } } @DeveloperApi case class AlterAddColCommand(namespace: String, tableName: String, colName: String, colType: String, colFamily: String, colQualifier: String) extends RunnableCommand { def run(sparkSession: SparkSession): Seq[Row] = { val hbaseCatalog = sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog] hbaseCatalog.alterTableAddNonKey(namespace, tableName, NonKeyColumn(colName, DataTypeUtils.getDataType(colType), colFamily, colQualifier)) hbaseCatalog.stopAdmin() Seq.empty[Row] } } @DeveloperApi case class InsertValueIntoTableCommand(tid: TableIdentifier, valueSeq: Seq[String]) extends RunnableCommand { override def run(sparkSession: SparkSession) = { val relation: HBaseRelation = sparkSession.sessionState.catalog.externalCatalog .asInstanceOf[HBaseCatalog] .getHBaseRelation(tid.database.getOrElse(null), tid.table).getOrElse(null) val bytes = valueSeq.zipWithIndex.map(v => DataTypeUtils.string2TypeData(v._1, relation.schema(v._2).dataType)) val rows = sparkSession.sparkContext.makeRDD(Seq(Row.fromSeq(bytes))) val inputValuesDF = sparkSession.createDataFrame(rows, relation.schema) relation.insert(inputValuesDF, overwrite = false) Seq.empty[Row] } override def output: Seq[Attribute] = Seq.empty }
Example 120
Source File: MeetupReceiver.scala From meetup-stream with Apache License 2.0 | 5 votes |
package receiver import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.storage.StorageLevel import org.apache.spark.Logging import com.ning.http.client.AsyncHttpClientConfig import com.ning.http.client._ import scala.collection.mutable.ArrayBuffer import java.io.OutputStream import java.io.ByteArrayInputStream import java.io.InputStreamReader import java.io.BufferedReader import java.io.InputStream import java.io.PipedInputStream import java.io.PipedOutputStream class MeetupReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging { @transient var client: AsyncHttpClient = _ @transient var inputPipe: PipedInputStream = _ @transient var outputPipe: PipedOutputStream = _ def onStart() { val cf = new AsyncHttpClientConfig.Builder() cf.setRequestTimeout(Integer.MAX_VALUE) cf.setReadTimeout(Integer.MAX_VALUE) cf.setPooledConnectionIdleTimeout(Integer.MAX_VALUE) client= new AsyncHttpClient(cf.build()) inputPipe = new PipedInputStream(1024 * 1024) outputPipe = new PipedOutputStream(inputPipe) val producerThread = new Thread(new DataConsumer(inputPipe)) producerThread.start() client.prepareGet(url).execute(new AsyncHandler[Unit]{ def onBodyPartReceived(bodyPart: HttpResponseBodyPart) = { bodyPart.writeTo(outputPipe) AsyncHandler.STATE.CONTINUE } def onStatusReceived(status: HttpResponseStatus) = { AsyncHandler.STATE.CONTINUE } def onHeadersReceived(headers: HttpResponseHeaders) = { AsyncHandler.STATE.CONTINUE } def onCompleted = { println("completed") } def onThrowable(t: Throwable)={ t.printStackTrace() } }) } def onStop() { if (Option(client).isDefined) client.close() if (Option(outputPipe).isDefined) { outputPipe.flush() outputPipe.close() } if (Option(inputPipe).isDefined) { inputPipe.close() } } class DataConsumer(inputStream: InputStream) extends Runnable { override def run() { val bufferedReader = new BufferedReader( new InputStreamReader( inputStream )) var input=bufferedReader.readLine() while(input!=null){ store(input) input=bufferedReader.readLine() } } } }
Example 121
Source File: HashBasedDeduplicator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import odkl.analysis.spark.util.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.linalg.Vectors.norm import org.apache.spark.ml.linalg.{BLAS, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuffer def setSimilarityTreshold(value: Double): this.type = set(similarityThreshold, value) setDefault(new ParamPair[String](inputColHash,"hash"), new ParamPair[Double](similarityThreshold,0.9)) def this() = this(Identifiable.randomUID("hashBasedDeduplication")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sqlContext.createDataFrame( dataset.toDF .repartition(dataset.col($(inputColHash))) .sortWithinPartitions($(inputColHash)) .rdd .mapPartitions((f: Iterator[Row]) => { if (f.hasNext) { var curHash: Long = -1L val vectorsBuffer = new ArrayBuffer[Vector](0) // unique vectors buffer for this bucket for (it <- f) yield { val newHash = it.getAs[Long]($(inputColHash)) if (newHash == curHash) { val currentVector = it.getAs[Vector]($(inputColVector)) val isUnique = vectorsBuffer.forall(storedVector => { // is this vector "different" enough from the others already in the buffer? (BLAS.dot(storedVector, currentVector) / (norm(storedVector, 2) * norm(currentVector, 2))) < $(similarityThreshold) // below the threshold means dissimilar }) if (isUnique) { vectorsBuffer.append(currentVector) it } else { Row.empty // dummy Row marking a duplicate } } else { vectorsBuffer.clear() vectorsBuffer.append(it.getAs[Vector]($(inputColVector))) curHash = newHash it } } } else { new Array[Row](0).toIterator // empty partition } }).filter(!_.equals(Row.empty)), // drop the dummy rows transformSchema(dataset.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
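The ArrayBuffer idea in this transformer is keeping a small buffer of representative vectors per hash bucket and dropping rows whose cosine similarity to any buffered vector reaches the threshold. Below is a standalone, Spark-free sketch of just that buffering step; the names cosine and dedupBucket and the use of plain Array[Double] are illustrative choices, not the project's API.

import scala.collection.mutable.ArrayBuffer

object DedupSketch {
  // Cosine similarity of two dense vectors (assumes equal length and non-zero norms).
  private def cosine(a: Array[Double], b: Array[Double]): Double = {
    val dot = a.indices.map(i => a(i) * b(i)).sum
    val na = math.sqrt(a.map(x => x * x).sum)
    val nb = math.sqrt(b.map(x => x * x).sum)
    dot / (na * nb)
  }

  // Keeps only vectors that are "different enough" from every vector kept so far,
  // mirroring the per-bucket ArrayBuffer used in the transformer above.
  def dedupBucket(vectors: Seq[Array[Double]], threshold: Double): Seq[Array[Double]] = {
    val kept = new ArrayBuffer[Array[Double]]()
    vectors.foreach { v =>
      val isUnique = kept.forall(stored => cosine(stored, v) < threshold)
      if (isUnique) kept += v
    }
    kept.toSeq
  }
}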
Example 122
Source File: NonSampleCompactor.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.util.Random val output = (offset until len by 2).map(sortedBuffer(_)).toArray val tail = findOdd(items) items = items % 2 var newBuffer = ArrayBuffer[T]() if (tail.isDefined) { newBuffer = newBuffer :+ tail.get } buffer = newBuffer numOfCompress = numOfCompress + 1 output } }
Example 123
Source File: ScaleAndConvert.scala From SparkNet with MIT License | 5 votes |
package preprocessing import java.awt.image.DataBufferByte import java.io.ByteArrayInputStream import javax.imageio.ImageIO import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConversions._ import net.coobird.thumbnailator._ import org.apache.spark.rdd.RDD import libs._ object ScaleAndConvert { def BufferedImageToByteArray(image: java.awt.image.BufferedImage) : Array[Byte] = { val height = image.getHeight() val width = image.getWidth() val pixels = image.getRGB(0, 0, width, height, null, 0, width) val result = new Array[Byte](3 * height * width) var row = 0 while (row < height) { var col = 0 while (col < width) { val rgb = pixels(row * width + col) result(0 * height * width + row * width + col) = ((rgb >> 16) & 0xFF).toByte result(1 * height * width + row * width + col) = ((rgb >> 8) & 0xFF).toByte result(2 * height * width + row * width + col) = (rgb & 0xFF).toByte col += 1 } row += 1 } result } def decompressImageAndResize(compressedImage: Array[Byte], height: Int, width: Int) : Option[Array[Byte]] = { // this method takes a JPEG, decompresses it, and resizes it try { val im = ImageIO.read(new ByteArrayInputStream(compressedImage)) val resizedImage = Thumbnails.of(im).forceSize(width, height).asBufferedImage() Some(BufferedImageToByteArray(resizedImage)) } catch { // If images can't be processed properly, just ignore them case e: java.lang.IllegalArgumentException => None case e: javax.imageio.IIOException => None case e: java.lang.NullPointerException => None } } }
Example 124
Source File: ClassRDDPartitioner.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector.rdd.partitioner import scala.collection.JavaConversions.iterableAsScalaIterable import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.Partition import com.metreta.spark.orientdb.connector.api.OrientDBConnector import com.orientechnologies.orient.core.metadata.schema.OClass import com.orientechnologies.orient.core.metadata.schema.OSchema import com.orientechnologies.orient.core.storage.OStorage import com.metreta.spark.orientdb.connector.SystemTables def getPartitions(): Array[Partition] = { val db = connector.databaseDocumentTx() var partitions = new ArrayBuffer[OrientPartition] val schema: OSchema = connector.getSchema(db) var klass: OClass = schema.getClass(mClass) val storage: OStorage = connector.getStorage(db) klass.getClusterIds.zipWithIndex foreach { case (clusterId, index) => partitions = partitions.+=(OrientPartition( index, null, // <- Host Address ????? PartitionName(klass.getName, storage.getClusterById(clusterId).getName))) } partitions.toArray } }
Example 125
Source File: SparkContextFunctionsSpec.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector import scala.collection.mutable.ArrayBuffer import org.scalatest.BeforeAndAfterAll import com.orientechnologies.orient.core.id.ORID import com.metreta.spark.orientdb.connector.utils.BaseOrientDbFlatSpec class SparkContextFunctionsSpec extends BaseOrientDbFlatSpec { var oridList: ArrayBuffer[String] = new ArrayBuffer var MaxCluster = 1000 var MaxRecord = 1000 override def beforeAll(): Unit = { initSparkConf(defaultSparkConf) createOridList() } override def afterAll(): Unit = { sparkContext.stop() } "A VertexId created from RID" should "be unique" in { val vertexIdList = oridList map { rid => sparkContext.getVertexIdFromString(rid) } val duplicatedValues = vertexIdList.groupBy(identity).collect { case (x, ys) if ys.lengthCompare(1) > 0 => x } duplicatedValues shouldBe empty } it should "be a positive number" in { val negativeValues = oridList filter { rid => sparkContext.getVertexIdFromString(rid) < 0 } negativeValues shouldBe empty } def createOridList() { for (clusterId <- 0 to MaxCluster) { for (recordId <- 0 to MaxRecord) { val rid = new StringBuilder rid.append(ORID.PREFIX); rid.append(clusterId); rid.append(ORID.SEPARATOR); rid.append(recordId); oridList += rid.toString } } } }
Example 126
Source File: SpearmanCorrelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 val cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
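The ranking trick above (cache row ids in an ArrayBuffer and flush them with an average rank whenever the value changes) can be exercised without Spark. The following is only an illustrative sketch of that pattern over an already-sorted local sequence; AverageRankSketch and its signature are made up for this page.

import scala.collection.mutable.ArrayBuffer

object AverageRankSketch {
  // Assigns 0-based average ranks to ids, where ties (equal values) share the
  // average of their positions, mirroring the flush-on-new-value pattern above.
  // The input must already be sorted by value.
  def averageRanks(sortedByValue: Seq[(Double, Long)]): Seq[(Long, Double)] = {
    val out = new ArrayBuffer[(Long, Double)]()
    val cachedIds = new ArrayBuffer[Long]()
    var preVal = Double.NaN
    var startRank = -1.0

    def flush(): Unit = {
      val averageRank = startRank + (cachedIds.size - 1) / 2.0
      cachedIds.foreach(id => out += ((id, averageRank)))
      cachedIds.clear()
    }

    sortedByValue.zipWithIndex.foreach { case ((v, id), rank) =>
      if (cachedIds.nonEmpty && v != preVal) flush()
      if (cachedIds.isEmpty) startRank = rank
      preVal = v
      cachedIds += id
    }
    flush()
    out.toSeq
  }
}

For Seq((1.0, 10L), (1.0, 11L), (2.0, 12L)) this yields (10, 0.5), (11, 0.5), (12, 2.0): the two tied values share the average of ranks 0 and 1.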
Example 127
Source File: ApplicationMasterArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.mutable.ArrayBuffer class ApplicationMasterArguments(val args: Array[String]) { var userJar: String = null var userClass: String = null var primaryPyFile: String = null var primaryRFile: String = null var userArgs: Seq[String] = Nil var propertiesFile: String = null parseArgs(args.toList) private def parseArgs(inputArgs: List[String]): Unit = { val userArgsBuffer = new ArrayBuffer[String]() var args = inputArgs while (!args.isEmpty) { // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0, // the properties with executor in their names are preferred. args match { case ("--jar") :: value :: tail => userJar = value args = tail case ("--class") :: value :: tail => userClass = value args = tail case ("--primary-py-file") :: value :: tail => primaryPyFile = value args = tail case ("--primary-r-file") :: value :: tail => primaryRFile = value args = tail case ("--arg") :: value :: tail => userArgsBuffer += value args = tail case ("--properties-file") :: value :: tail => propertiesFile = value args = tail case _ => printUsageAndExit(1, args) } } if (primaryPyFile != null && primaryRFile != null) { // scalastyle:off println System.err.println("Cannot have primary-py-file and primary-r-file at the same time") // scalastyle:on println System.exit(-1) } userArgs = userArgsBuffer.toList } def printUsageAndExit(exitCode: Int, unknownParam: Any = null) { // scalastyle:off println if (unknownParam != null) { System.err.println("Unknown/unsupported param " + unknownParam) } System.err.println(""" |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options] |Options: | --jar JAR_PATH Path to your application's JAR file | --class CLASS_NAME Name of your application's main class | --primary-py-file A main Python file | --primary-r-file A main R file | --arg ARG Argument to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. | --properties-file FILE Path to a custom Spark properties file. """.stripMargin) // scalastyle:on println System.exit(exitCode) } } object ApplicationMasterArguments { val DEFAULT_NUMBER_EXECUTORS = 2 }
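The parser above accumulates repeatable --arg values into an ArrayBuffer while pattern matching on the remaining argument list. Here is a minimal standalone sketch of the same loop, with only two made-up options (--jar and --arg) kept for illustration.

import scala.collection.mutable.ArrayBuffer

object ArgParseSketch {
  // Walks the argument list with list patterns and collects repeatable
  // "--arg VALUE" occurrences into an ArrayBuffer, like the parser above.
  def parse(inputArgs: List[String]): (Option[String], Seq[String]) = {
    var jar: Option[String] = None
    val userArgs = new ArrayBuffer[String]()
    var args = inputArgs
    while (args.nonEmpty) {
      args match {
        case "--jar" :: value :: tail =>
          jar = Some(value)
          args = tail
        case "--arg" :: value :: tail =>
          userArgs += value
          args = tail
        case unknown :: _ =>
          sys.error(s"Unknown/unsupported param $unknown")
        case Nil => // unreachable: the loop condition guarantees a non-empty list
      }
    }
    (jar, userArgs.toSeq)
  }
}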
Example 128
Source File: ClientArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.mutable.ArrayBuffer // TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware ! private[spark] class ClientArguments(args: Array[String]) { var userJar: String = null var userClass: String = null var primaryPyFile: String = null var primaryRFile: String = null var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]() parseArgs(args.toList) private def parseArgs(inputArgs: List[String]): Unit = { var args = inputArgs while (!args.isEmpty) { args match { case ("--jar") :: value :: tail => userJar = value args = tail case ("--class") :: value :: tail => userClass = value args = tail case ("--primary-py-file") :: value :: tail => primaryPyFile = value args = tail case ("--primary-r-file") :: value :: tail => primaryRFile = value args = tail case ("--arg") :: value :: tail => userArgs += value args = tail case Nil => case _ => throw new IllegalArgumentException(getUsageMessage(args)) } } if (primaryPyFile != null && primaryRFile != null) { throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" + " at the same time") } } private def getUsageMessage(unknownParam: List[String] = null): String = { val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else "" message + s""" |Usage: org.apache.spark.deploy.yarn.Client [options] |Options: | --jar JAR_PATH Path to your application's JAR file (required in yarn-cluster | mode) | --class CLASS_NAME Name of your application's main class (required) | --primary-py-file A main Python file | --primary-r-file A main R file | --arg ARG Argument to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. """.stripMargin } }
Example 129
Source File: KPLBasedKinesisTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import java.nio.ByteBuffer import java.nio.charset.StandardCharsets import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} private[kinesis] class KPLBasedKinesisTestUtils(streamShardCount: Int = 2) extends KinesisTestUtils(streamShardCount) { override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { if (!aggregate) { new SimpleDataGenerator(kinesisClient) } else { new KPLDataGenerator(regionName) } } } private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator { private lazy val producer: KPLProducer = { val conf = new KinesisProducerConfiguration() .setRecordMaxBufferedTime(1000) .setMaxConnections(1) .setRegion(regionName) .setMetricsLevel("none") new KPLProducer(conf) } override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = { val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]() data.foreach { num => val str = num.toString val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8)) val future = producer.addUserRecord(streamName, str, data) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = {} // do nothing override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId val seqNumber = result.getSequenceNumber() val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, new ArrayBuffer[(Int, String)]()) sentSeqNumbers += ((num, seqNumber)) } } Futures.addCallback(future, kinesisCallBack) } producer.flushSync() shardIdToSeqNumbers.toMap } }
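The callback above groups (value, sequence number) pairs per shard by fetching or creating the shard's ArrayBuffer with getOrElseUpdate. A small synchronous sketch of that grouping pattern follows; the names are illustrative and no Kinesis client is involved.

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object GroupByShardSketch {
  // Groups (shardId, record) results as they arrive, appending each record to
  // the ArrayBuffer for its shard, like the callback in sendData above.
  def group(results: Seq[(String, (Int, String))]): Map[String, Seq[(Int, String)]] = {
    val byShard = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()
    results.foreach { case (shardId, record) =>
      byShard.getOrElseUpdate(shardId, new ArrayBuffer[(Int, String)]()) += record
    }
    byShard.mapValues(_.toSeq).toMap
  }
}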
Example 130
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
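ReuseExchange buckets plans by schema in a HashMap[StructType, ArrayBuffer[Exchange]] and reuses the first equivalent plan found in a bucket. The generic sketch below shows only that reuse-by-bucket pattern; reuse and key are invented names, and plain equality stands in for sameResult.

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object ReuseSketch {
  // Replaces duplicate items with a reference to the first equivalent item seen,
  // bucketing candidates by a cheap key first (as ReuseExchange buckets by schema).
  def reuse[A, K](items: Seq[A])(key: A => K): Seq[A] = {
    val buckets = mutable.HashMap[K, ArrayBuffer[A]]()
    items.map { item =>
      val candidates = buckets.getOrElseUpdate(key(item), ArrayBuffer[A]())
      candidates.find(_ == item) match {
        case Some(existing) => existing // reuse the previously seen value
        case None =>
          candidates += item            // first occurrence: remember it
          item
      }
    }
  }
}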
Example 131
Source File: SQLAppStatusStore.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.ui import java.lang.{Long => JLong} import java.util.Date import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import com.fasterxml.jackson.annotation.JsonIgnore import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.JobExecutionStatus import org.apache.spark.status.KVUtils.KVIndexParam import org.apache.spark.util.kvstore.{KVIndex, KVStore} class SparkPlanGraphNodeWrapper( val node: SparkPlanGraphNode, val cluster: SparkPlanGraphClusterWrapper) { def toSparkPlanGraphNode(): SparkPlanGraphNode = { assert(node == null ^ cluster == null, "One and only one of node or cluster must be set.") if (node != null) node else cluster.toSparkPlanGraphCluster() } } case class SQLPlanMetric( name: String, accumulatorId: Long, metricType: String)
Example 132
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Example 133
Source File: BatchEvalPythonExecSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.api.python.{PythonEvalType, PythonFunction} import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, In} import org.apache.spark.sql.execution.{FilterExec, InputAdapter, SparkPlanTest, WholeStageCodegenExec} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.BooleanType class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext { import testImplicits.newProductEncoder import testImplicits.localSeqToDatasetHolder override def beforeAll(): Unit = { super.beforeAll() spark.udf.registerPython("dummyPythonUDF", new MyDummyPythonUDF) } override def afterAll(): Unit = { spark.sessionState.functionRegistry.dropFunction(FunctionIdentifier("dummyPythonUDF")) super.afterAll() } test("Python UDF: push down deterministic FilterExec predicates") { val df = Seq(("Hello", 4)).toDF("a", "b") .where("dummyPythonUDF(b) and dummyPythonUDF(a) and a in (3, 4)") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec( And(_: AttributeReference, _: AttributeReference), InputAdapter(_: BatchEvalPythonExec)) => f case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(FilterExec(_: In, _))) => b } assert(qualifiedPlanNodes.size == 2) } test("Nested Python UDF: push down deterministic FilterExec predicates") { val df = Seq(("Hello", 4)).toDF("a", "b") .where("dummyPythonUDF(a, dummyPythonUDF(a, b)) and a in (3, 4)") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec(_: AttributeReference, InputAdapter(_: BatchEvalPythonExec)) => f case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(FilterExec(_: In, _))) => b } assert(qualifiedPlanNodes.size == 2) } test("Python UDF: no push down on non-deterministic") { val df = Seq(("Hello", 4)).toDF("a", "b") .where("b > 4 and dummyPythonUDF(a) and rand() > 0.3") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec( And(_: AttributeReference, _: GreaterThan), InputAdapter(_: BatchEvalPythonExec)) => f case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(_: FilterExec)) => b } assert(qualifiedPlanNodes.size == 2) } test("Python UDF: push down on deterministic predicates after the first non-deterministic") { val df = Seq(("Hello", 4)).toDF("a", "b") .where("dummyPythonUDF(a) and rand() > 0.3 and b > 4") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec( And(_: AttributeReference, _: GreaterThan), InputAdapter(_: BatchEvalPythonExec)) => f case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(_: FilterExec)) => b } assert(qualifiedPlanNodes.size == 2) } test("Python UDF refers to the attributes from more than one child") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = Seq(("Hello", 4)).toDF("c", "d") val joinDF = df.crossJoin(df2).where("dummyPythonUDF(a, c) == dummyPythonUDF(d, c)") val qualifiedPlanNodes = joinDF.queryExecution.executedPlan.collect { case b: BatchEvalPythonExec => b } assert(qualifiedPlanNodes.size == 1) } } // This Python UDF is dummy and just for testing. Unable to execute. 
class DummyUDF extends PythonFunction( command = Array[Byte](), envVars = Map("" -> "").asJava, pythonIncludes = ArrayBuffer("").asJava, pythonExec = "", pythonVer = "", broadcastVars = null, accumulator = null) class MyDummyPythonUDF extends UserDefinedPythonFunction( name = "dummyUDF", func = new DummyUDF, dataType = BooleanType, pythonEvalType = PythonEvalType.SQL_BATCHED_UDF, udfDeterministic = true)
Example 134
Source File: UnionDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 135
Source File: QueueInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{StreamingContext, Time} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
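compute() above drains either one queued RDD or the whole queue into an ArrayBuffer while holding the queue's monitor. The same draining logic written against plain mutable collections (drain is an invented helper name):

import scala.collection.mutable.{ArrayBuffer, Queue}

object QueueDrainSketch {
  // Collects either the next queued batch or everything currently queued,
  // mirroring the oneAtATime branch in compute() above.
  def drain[T](queue: Queue[T], oneAtATime: Boolean): Seq[T] = {
    val buffer = new ArrayBuffer[T]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    buffer.toSeq
  }
}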
Example 136
Source File: LocalSparkCluster.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 137
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 138
Source File: Schedulable.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.mutable.ArrayBuffer import org.apache.spark.scheduler.SchedulingMode.SchedulingMode private[spark] trait Schedulable { var parent: Pool // child queues def schedulableQueue: ConcurrentLinkedQueue[Schedulable] def schedulingMode: SchedulingMode def weight: Int def minShare: Int def runningTasks: Int def priority: Int def stageId: Int def name: String def addSchedulable(schedulable: Schedulable): Unit def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] }
Example 139
Source File: ChunkedByteBufferOutputStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.util.io import java.io.OutputStream import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.storage.StorageUtils private[this] var position = chunkSize private[this] var _size = 0 private[this] var closed: Boolean = false def size: Long = _size override def close(): Unit = { if (!closed) { super.close() closed = true } } override def write(b: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") allocateNewChunkIfNeeded() chunks(lastChunkIndex).put(b.toByte) position += 1 _size += 1 } override def write(bytes: Array[Byte], off: Int, len: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") var written = 0 while (written < len) { allocateNewChunkIfNeeded() val thisBatch = math.min(chunkSize - position, len - written) chunks(lastChunkIndex).put(bytes, written + off, thisBatch) written += thisBatch position += thisBatch } _size += len } @inline private def allocateNewChunkIfNeeded(): Unit = { if (position == chunkSize) { chunks += allocator(chunkSize) lastChunkIndex += 1 position = 0 } } def toChunkedByteBuffer: ChunkedByteBuffer = { require(closed, "cannot call toChunkedByteBuffer() unless close() has been called") require(!toChunkedByteBufferWasCalled, "toChunkedByteBuffer() can only be called once") toChunkedByteBufferWasCalled = true if (lastChunkIndex == -1) { new ChunkedByteBuffer(Array.empty[ByteBuffer]) } else { // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk. // An alternative would have been returning an array of ByteBuffers, with the last buffer // bounded to only the last chunk's position. However, given our use case in Spark (to put // the chunks in block manager), only limiting the view bound of the buffer would still // require the block manager to store the whole chunk. val ret = new Array[ByteBuffer](chunks.size) for (i <- 0 until chunks.size - 1) { ret(i) = chunks(i) ret(i).flip() } if (position == chunkSize) { ret(lastChunkIndex) = chunks(lastChunkIndex) ret(lastChunkIndex).flip() } else { ret(lastChunkIndex) = allocator(position) chunks(lastChunkIndex).flip() ret(lastChunkIndex).put(chunks(lastChunkIndex)) ret(lastChunkIndex).flip() StorageUtils.dispose(chunks(lastChunkIndex)) } new ChunkedByteBuffer(ret) } } }
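The stream above grows by fixed-size chunks kept in an ArrayBuffer instead of resizing one large array. Below is a simplified sketch of that idea using plain Array[Byte] chunks, with no ByteBuffer allocator or dispose handling; ChunkedBytesSketch is an illustrative name, not the Spark class.

import java.io.OutputStream
import scala.collection.mutable.ArrayBuffer

// Bytes are appended into equally sized Array[Byte] chunks held in an ArrayBuffer,
// so no large contiguous array is ever reallocated while writing.
class ChunkedBytesSketch(chunkSize: Int) extends OutputStream {
  private val chunks = new ArrayBuffer[Array[Byte]]()
  private var position = chunkSize // force allocation of the first chunk on first write

  override def write(b: Int): Unit = {
    if (position == chunkSize) { chunks += new Array[Byte](chunkSize); position = 0 }
    chunks.last(position) = b.toByte
    position += 1
  }

  // Copies everything written so far into one array (the last chunk may be partial).
  def toArray: Array[Byte] = {
    if (chunks.isEmpty) return Array.emptyByteArray
    val total = (chunks.size - 1) * chunkSize + position
    val out = new Array[Byte](total)
    var offset = 0
    chunks.init.foreach { c => System.arraycopy(c, 0, out, offset, chunkSize); offset += chunkSize }
    System.arraycopy(chunks.last, 0, out, offset, position)
    out
  }
}

The design point is the same as in the Spark class: appending a new chunk to the ArrayBuffer is cheap, and only the final copy-out has to know the total size.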
Example 140
Source File: UnionRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ForkJoinTaskSupport import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
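getDependencies walks the parents once, appending one RangeDependency per parent while carrying a running output offset in pos. The offset bookkeeping in isolation, as a sketch (startOffsets is an invented helper):

import scala.collection.mutable.ArrayBuffer

object RangeOffsetsSketch {
  // Computes the starting output position and length of each parent, the same
  // running "pos" accumulation getDependencies uses to build RangeDependency objects.
  def startOffsets(partitionCounts: Seq[Int]): Seq[(Int, Int)] = {
    val offsets = new ArrayBuffer[(Int, Int)]() // (parent start, parent length)
    var pos = 0
    partitionCounts.foreach { n =>
      offsets += ((pos, n))
      pos += n
    }
    offsets.toSeq
  }
}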
Example 141
Source File: ValueJsonConversionTest.scala From ingraph with Eclipse Public License 1.0 | 5 votes |
package ingraph.compiler.sql.driver import ingraph.compiler.sql.driver.ValueJsonConversion._ import ingraph.compiler.sql.driver.ValueJsonConversionTest._ import org.neo4j.driver.internal.value._ import org.neo4j.driver.internal.{InternalNode, InternalPath, InternalRelationship} import org.neo4j.driver.v1.Value import org.scalactic.source import org.scalactic.source.Position import org.scalatest.FunSuite import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer class ValueJsonConversionTest extends FunSuite { testParameters.foreach { case (value, testName, pos) => test(testName) { println(value) val jsonString = gson.toJson(value, classOf[Value]) println(jsonString) val deserialized = gson.fromJson(jsonString, classOf[Value]) assert(value == deserialized) }(pos) } } object ValueJsonConversionTest { val testValues: ArrayBuffer[Value] = ArrayBuffer.empty val testParameters: ArrayBuffer[(Value, String, Position)] = ArrayBuffer.empty def addTest(value: Value, testName: String = null)(implicit pos: source.Position): Unit = { testValues += value testParameters += ((value, Option(testName).getOrElse(value.getClass.getSimpleName), pos)) } private val stringValue = new StringValue("John") private val integerValue = new IntegerValue(101) private val propertiesMap = Map[String, Value]("name" -> stringValue).asJava addTest(new MapValue(propertiesMap)) addTest(new BytesValue(Array[Byte](0, 42, 127, -128))) addTest(new ListValue(stringValue, integerValue)) addTest(new NodeValue(new InternalNode(5, List("Label1", "Label2").asJavaCollection, propertiesMap))) addTest(new RelationshipValue(new InternalRelationship(42, 10, 20, "Edge_Type_1", propertiesMap))) addTest(new PathValue(new InternalPath( new InternalNode(0), new InternalRelationship(101, 0, 1, "TYPE_A"), new InternalNode(1) ))) addTest(BooleanValue.FALSE) addTest(BooleanValue.TRUE) addTest(NullValue.NULL) addTest(stringValue) addTest(integerValue) addTest(new FloatValue(3.14)) }
Example 142
Source File: TokenStreamUtils.scala From odinson with Apache License 2.0 | 5 votes |
package ai.lum.odinson.lucene.analysis import scala.collection.mutable.ArrayBuffer import org.apache.lucene.analysis.Analyzer import org.apache.lucene.analysis.TokenStream import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.search.IndexSearcher import org.apache.lucene.search.highlight.TokenSources object TokenStreamUtils { def getTokens( docID: Int, fieldName: String, indexSearcher: IndexSearcher, analyzer: Analyzer ): Array[String] = { val doc = indexSearcher.doc(docID) val tvs = indexSearcher.getIndexReader().getTermVectors(docID) val text = doc.getField(fieldName).stringValue val ts = TokenSources.getTokenStream(fieldName, tvs, text, analyzer, -1) val tokens = getTokens(ts) tokens } def getTokens(ts: TokenStream): Array[String] = { ts.reset() val terms = new ArrayBuffer[String] while (ts.incrementToken()) { val charTermAttribute = ts.addAttribute(classOf[CharTermAttribute]) val term = charTermAttribute.toString terms += term } ts.end() ts.close() terms.toArray } }
Example 143
Source File: Driver.scala From OnlineLDA_Spark with Apache License 2.0 | 5 votes |
package com.github.yuhao.yang import java.util.Calendar import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} import scala.collection.mutable.ArrayBuffer object Driver extends Serializable{ def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) val inputDir = args(0) val filePaths = extractPaths(inputDir + "texts", true) val stopWordsPath = inputDir + "stop.txt" val vocabPath = inputDir + "wordsEn.txt" println("begin: " + Calendar.getInstance().getTime) println("path size: " + filePaths.size) assert(filePaths.size > 0) val conf = new SparkConf().setAppName("online LDA Spark") val sc = new SparkContext(conf) val vocab = Docs2Vec.extractVocab(sc, Seq(vocabPath), stopWordsPath) val vocabArray = vocab.map(_.swap) val K = args(1).toInt // val lda = OnlineLDA_Spark.runBatchMode(sc, filePaths, vocab, K, 50) val lda = OnlineLDA_Spark.runOnlineMode(sc, filePaths, vocab, K, args(2).toInt) println("_lambda:") for(row <- 0 until lda._lambda.rows){ val v = lda._lambda(row, ::).t val topk = lda._lambda(row, ::).t.argtopk(10) val pairs = topk.map(k => (vocabArray(k), v(k))) val sorted = pairs.sortBy(_._2).reverse println(sorted.map(x => (x._1)).mkString(","), sorted.map(x => ("%2.2f".format(x._2))).mkString(",")) } println("end: " + Calendar.getInstance().getTime()) } def extractPaths(path: String, recursive: Boolean = true): Array[String] ={ val docsets = ArrayBuffer[String]() val fileList = new java.io.File(path).listFiles() if(fileList == null) return docsets.toArray for(f <- fileList){ if(f.isDirectory){ if(recursive) docsets ++= extractPaths(f.getAbsolutePath, true) } else{ docsets += f.getAbsolutePath } } docsets.toArray } }
Example 144
Source File: QuerySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import org.scalatest.FunSuite import scala.collection.mutable.ArrayBuffer abstract class QuerySuite extends FunSuite with Logging { case class TestCase(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String], answersSize: Int) { def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) = this(program, query, data, null, answersSize) def this(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String]) = this(program, query, data, answers, answers.size) } def runTest(testCase: TestCase): Unit = runTests(Seq(testCase)) def runTests(testCases: Seq[TestCase]): Unit = { val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf() .set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "../logs") .set("spark.ui.enabled", "false") .set("spark.sql.shuffle.partitions", "5") .setAll(Map.empty[String, String]) ) val bigDatalogCtx = new BigDatalogContext(sparkCtx) var count: Int = 1 for (testCase <- testCases) { bigDatalogCtx.loadProgram(testCase.program) for ((relationName, data) <- testCase.data) { val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName) if (relationInfo == null) throw new SparkException("You are attempting to load an unknown relation.") bigDatalogCtx.registerAndLoadTable(relationName, data, bigDatalogCtx.conf.numShufflePartitions) } val query = testCase.query val answers = testCase.answers logInfo("========== START BigDatalog Query " + count + " START ==========") val program = bigDatalogCtx.query(query) val results = program.execute().collect() // for some test cases we will only know the size of the answer set, not the actual answers if (answers == null) { assert(results.size == testCase.answersSize) } else { if (results.size != answers.size) { displayDifferences(results.map(_.toString), answers) // yes this will fail assert(results.size == answers.size) } else { for (result <- results) assert(answers.contains(result.toString())) } val resultStrings = results.map(_.toString).toSet for (answer <- answers) assert(resultStrings.contains(answer.toString())) } logInfo("========== END BigDatalog Query " + count + " END ==========\n") count += 1 bigDatalogCtx.reset() } sparkCtx.stop() } private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = { val missingAnswers = new ArrayBuffer[String] val missingResults = new ArrayBuffer[String] for (result <- results) if (!answers.contains(result)) missingAnswers += result for (answer <- answers) if (!results.contains(answer)) missingResults += answer if (missingAnswers.nonEmpty) logInfo("Results not in Answers: " + missingAnswers.mkString(", ")) if (missingResults.nonEmpty) logInfo("Answers not in Results: " + missingResults.mkString(", ")) } }
Example 145
Source File: SpearmanCorrelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 146
Source File: TestOutputStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{IOException, ObjectInputStream} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.{DStream, ForEachDStream} import org.apache.spark.util.Utils import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class TestOutputStream[T: ClassTag](parent: DStream[T], val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]()) extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { val collected = rdd.collect() output += collected }, false) { // This clears the output buffer every time it is read from a checkpoint @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { ois.defaultReadObject() output.clear() } }
Example 147
Source File: FlumeStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.concurrent.duration._ import scala.language.postfixOps import com.google.common.base.Charsets import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{Logging, SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 148
Source File: JDBCRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = { if (partitioning == null) return Array[Partition](JDBCPartition(null, 0)) val numPartitions = partitioning.numPartitions val column = partitioning.column if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0)) // Overflow and silliness can happen if you subtract then divide. // Here we get a little roundoff, but that's (hopefully) OK. val stride: Long = (partitioning.upperBound / numPartitions - partitioning.lowerBound / numPartitions) var i: Int = 0 var currentValue: Long = partitioning.lowerBound var ans = new ArrayBuffer[Partition]() while (i < numPartitions) { val lowerBound = if (i != 0) s"$column >= $currentValue" else null currentValue += stride val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null val whereClause = if (upperBound == null) { lowerBound } else if (lowerBound == null) { upperBound } else { s"$lowerBound AND $upperBound" } ans += JDBCPartition(whereClause, i) i = i + 1 } ans.toArray } } private[sql] case class JDBCRelation( url: String, table: String, parts: Array[Partition], properties: Properties = new Properties())(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedFilteredScan with InsertableRelation { override val needConversion: Boolean = false override val schema: StructType = JDBCRDD.resolveTable(url, table, properties) override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] JDBCRDD.scanTable( sqlContext.sparkContext, schema, url, properties, table, requiredColumns, filters, parts).asInstanceOf[RDD[Row]] } override def insert(data: DataFrame, overwrite: Boolean): Unit = { data.write .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) .jdbc(url, table, properties) } }
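columnPartition above builds one JDBCPartition per stride of the partition column, appending the generated WHERE clause to an ArrayBuffer. A self-contained sketch of just the clause generation follows; whereClauses and the column name used in the usage note are illustrative, not part of the Spark API.

import scala.collection.mutable.ArrayBuffer

object RangePartitionSketch {
  // Builds one WHERE clause per partition over the numeric range
  // [lowerBound, upperBound), leaving the first lower bound and the last
  // upper bound open, as columnPartition does.
  def whereClauses(column: String, lowerBound: Long, upperBound: Long, numPartitions: Int): Seq[String] = {
    val stride = upperBound / numPartitions - lowerBound / numPartitions
    val clauses = new ArrayBuffer[String]()
    var currentValue = lowerBound
    var i = 0
    while (i < numPartitions) {
      val lower = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upper = if (i != numPartitions - 1) s"$column < $currentValue" else null
      clauses += Seq(Option(lower), Option(upper)).flatten.mkString(" AND ")
      i += 1
    }
    clauses.toSeq
  }
}

For example, whereClauses("id", 0, 100, 4) produces "id < 25", "id >= 25 AND id < 50", "id >= 50 AND id < 75", and "id >= 75".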
Example 149
Source File: KPLBasedKinesisTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import java.nio.ByteBuffer import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} private[kinesis] class KPLBasedKinesisTestUtils extends KinesisTestUtils { override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { if (!aggregate) { new SimpleDataGenerator(kinesisClient) } else { new KPLDataGenerator(regionName) } } } private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator { private lazy val producer: KPLProducer = { val conf = new KinesisProducerConfiguration() .setRecordMaxBufferedTime(1000) .setMaxConnections(1) .setRegion(regionName) .setMetricsLevel("none") new KPLProducer(conf) } override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = { val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]() data.foreach { num => val str = num.toString val data = ByteBuffer.wrap(str.getBytes()) val future = producer.addUserRecord(streamName, str, data) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = {} // do nothing override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId val seqNumber = result.getSequenceNumber() val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, new ArrayBuffer[(Int, String)]()) sentSeqNumbers += ((num, seqNumber)) } } Futures.addCallback(future, kinesisCallBack) } producer.flushSync() shardIdToSeqNumbers.toMap } }
Example 150
Source File: UnionDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) } else { None } } }
Example 151
Source File: QueueInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() if (oneAtATime && queue.size > 0) { buffer += queue.dequeue() } else { buffer ++= queue.dequeueAll(_ => true) } if (buffer.size > 0) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
Example 152
Source File: LocalSparkCluster.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.rpc.RpcEnv import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.worker.Worker import org.apache.spark.deploy.master.Master import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 153
Source File: Schedulable.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.mutable.ArrayBuffer import org.apache.spark.scheduler.SchedulingMode.SchedulingMode private[spark] trait Schedulable { var parent: Pool // child queues def schedulableQueue: ConcurrentLinkedQueue[Schedulable] def schedulingMode: SchedulingMode def weight: Int def minShare: Int def runningTasks: Int def priority: Int def stageId: Int def name: String def addSchedulable(schedulable: Schedulable): Unit def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit def checkSpeculatableTasks(): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] }
Example 154
Source File: ByteArrayChunkOutputStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.util.io import java.io.OutputStream import scala.collection.mutable.ArrayBuffer private var position = chunkSize override def write(b: Int): Unit = { allocateNewChunkIfNeeded() chunks(lastChunkIndex)(position) = b.toByte position += 1 } override def write(bytes: Array[Byte], off: Int, len: Int): Unit = { var written = 0 while (written < len) { allocateNewChunkIfNeeded() val thisBatch = math.min(chunkSize - position, len - written) System.arraycopy(bytes, written + off, chunks(lastChunkIndex), position, thisBatch) written += thisBatch position += thisBatch } } @inline private def allocateNewChunkIfNeeded(): Unit = { if (position == chunkSize) { chunks += new Array[Byte](chunkSize) lastChunkIndex += 1 position = 0 } } def toArrays: Array[Array[Byte]] = { if (lastChunkIndex == -1) { new Array[Array[Byte]](0) } else { // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk. // An alternative would have been returning an array of ByteBuffers, with the last buffer // bounded to only the last chunk's position. However, given our use case in Spark (to put // the chunks in block manager), only limiting the view bound of the buffer would still // require the block manager to store the whole chunk. val ret = new Array[Array[Byte]](chunks.size) for (i <- 0 until chunks.size - 1) { ret(i) = chunks(i) } if (position == chunkSize) { ret(lastChunkIndex) = chunks(lastChunkIndex) } else { ret(lastChunkIndex) = new Array[Byte](position) System.arraycopy(chunks(lastChunkIndex), 0, ret(lastChunkIndex), 0, position) } ret } } }
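The snippet above (shown without its class header on the source page) grows by appending fixed-size chunks to an ArrayBuffer[Array[Byte]] instead of resizing a single large array. A compact, self-contained sketch of the same idea, simplified and not the Spark class itself:

import java.io.OutputStream
import scala.collection.mutable.ArrayBuffer

// Simplified chunked output stream: bytes land in fixed-size chunks held in an ArrayBuffer.
class ChunkedOutputStream(chunkSize: Int) extends OutputStream {
  private val chunks = ArrayBuffer[Array[Byte]]()
  private var position = chunkSize // forces allocation of the first chunk on the first write

  override def write(b: Int): Unit = {
    if (position == chunkSize) { chunks += new Array[Byte](chunkSize); position = 0 }
    chunks.last(position) = b.toByte
    position += 1
  }

  // Total number of bytes written so far.
  def size: Int = if (chunks.isEmpty) 0 else (chunks.length - 1) * chunkSize + position
}

object ChunkedOutputStreamDemo extends App {
  val out = new ChunkedOutputStream(4)
  "hello".getBytes("UTF-8").foreach(b => out.write(b))
  println(out.size) // 5: two chunks of four bytes, the second only partially filled
}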
Example 155
Source File: UnionRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
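In getDependencies, each parent contributes a RangeDependency whose offset is the running partition count accumulated in pos. The offset bookkeeping on its own, in plain Scala (illustrative, no Spark types):

import scala.collection.mutable.ArrayBuffer

object PartitionOffsets {
  // For parents with the given partition counts, compute each parent's starting offset
  // in the union, as UnionRDD.getDependencies does when building RangeDependency values.
  def offsets(partitionCounts: Seq[Int]): Seq[(Int, Int)] = {
    val deps = new ArrayBuffer[(Int, Int)]() // (startOffsetInUnion, numPartitions)
    var pos = 0
    for (count <- partitionCounts) {
      deps += ((pos, count))
      pos += count
    }
    deps.toSeq
  }

  def main(args: Array[String]): Unit =
    println(offsets(Seq(3, 2, 4))) // (0,3), (3,2), (5,4)
}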
Example 156
Source File: TaskContextImpl.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark import scala.collection.mutable.{ArrayBuffer, HashMap} import org.apache.spark.executor.TaskMetrics import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException} private[spark] class TaskContextImpl( val stageId: Int, val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, @transient private val metricsSystem: MetricsSystem, internalAccumulators: Seq[Accumulator[Long]], val runningLocally: Boolean = false, val taskMetrics: TaskMetrics = TaskMetrics.empty) extends TaskContext with Logging { // For backwards-compatibility; this method is now deprecated as of 1.3.0. override def attemptId(): Long = taskAttemptId // List of callback functions to execute when the task completes. @transient private val onCompleteCallbacks = new ArrayBuffer[TaskCompletionListener] // Whether the corresponding task has been killed. @volatile private var interrupted: Boolean = false // Whether the task has completed. @volatile private var completed: Boolean = false override def addTaskCompletionListener(listener: TaskCompletionListener): this.type = { onCompleteCallbacks += listener this } override def addTaskCompletionListener(f: TaskContext => Unit): this.type = { onCompleteCallbacks += new TaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = f(context) } this } @deprecated("use addTaskCompletionListener", "1.1.0") override def addOnCompleteCallback(f: () => Unit) { onCompleteCallbacks += new TaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = f() } } private[spark] def markInterrupted(): Unit = { interrupted = true } override def isCompleted(): Boolean = completed override def isRunningLocally(): Boolean = runningLocally override def isInterrupted(): Boolean = interrupted override def getMetricsSources(sourceName: String): Seq[Source] = metricsSystem.getSourcesByName(sourceName) @transient private val accumulators = new HashMap[Long, Accumulable[_, _]] private[spark] override def registerAccumulator(a: Accumulable[_, _]): Unit = synchronized { accumulators(a.id) = a } private[spark] override def collectInternalAccumulators(): Map[Long, Any] = synchronized { accumulators.filter(_._2.isInternal).mapValues(_.localValue).toMap } private[spark] override def collectAccumulators(): Map[Long, Any] = synchronized { accumulators.mapValues(_.localValue).toMap } //private[spark] override val internalMetricsToAccumulators: Map[String, Accumulator[Long]] = { // Explicitly register internal accumulators here because these are // not captured in the task closure and are already deserialized internalAccumulators.foreach(registerAccumulator) internalAccumulators.map { a => (a.name.get, a) }.toMap } }
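TaskContextImpl stores task-completion listeners in an ArrayBuffer and wraps plain functions into TaskCompletionListener objects before appending them. A stripped-down registry with the same shape (hypothetical names, no Spark types; Spark runs completion callbacks in reverse order of registration, which the sketch imitates):

import scala.collection.mutable.ArrayBuffer

// Minimal completion-listener registry in the style of TaskContextImpl.
class CompletionListeners {
  private val callbacks = new ArrayBuffer[() => Unit]()

  def addListener(f: () => Unit): this.type = { callbacks += f; this }

  // Run listeners in reverse registration order when the task finishes.
  def markCompleted(): Unit = callbacks.reverse.foreach(cb => cb())
}

object CompletionListenersDemo extends App {
  val ctx = new CompletionListeners
  ctx.addListener(() => println("close metrics")).addListener(() => println("release memory"))
  ctx.markCompleted() // prints "release memory" then "close metrics"
}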
Example 157
Source File: LinkerdApi.scala From asura with MIT License | 5 votes |
package asura.app.api import asura.app.api.model.Dtabs import asura.app.api.model.Dtabs.DtabItem import asura.common.exceptions.ErrorMessages.ErrorMessage import asura.common.model.{ApiRes, ApiResError} import asura.core.http.HttpEngine import asura.core.{CoreConfig, ErrorMessages} import asura.namerd.DtabEntry import asura.namerd.api.v1.NamerdV1Api import asura.play.api.BaseApi.OkApiRes import javax.inject.{Inject, Singleton} import org.pac4j.play.scala.SecurityComponents import play.api.Configuration import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} @Singleton class LinkerdApi @Inject()( implicit val exec: ExecutionContext, val controllerComponents: SecurityComponents, config: Configuration, ) extends BaseApi { val srcPrefix = "/svc/" val dstPrefix = "/$/inet/" def getProxyServers() = Action { implicit req => if (CoreConfig.linkerdConfig.enabled) { OkApiRes(ApiRes(data = CoreConfig.linkerdConfig.servers)) } else { OkApiRes(ApiResError(getI18nMessage(ErrorMessages.error_ProxyDisabled.name))) } } def getHttp(group: String, project: String, server: String) = Action.async { implicit req => if (CoreConfig.linkerdConfig.enabled) { val proxyServer = CoreConfig.linkerdConfig.servers.find(_.tag.equals(server)).get NamerdV1Api.getNamespaceDtabs(proxyServer.namerd, proxyServer.httpNs)(HttpEngine.http).map(dtabs => { val items = ArrayBuffer[DtabItem]() dtabs.foreach(entry => { val pStrs = entry.prefix.split("/") val dStrs = entry.dst.split("/") if (pStrs.length == 5 && dStrs.length == 5) { items += DtabItem( group = pStrs(2), project = pStrs(3), namespace = pStrs(4), host = dStrs(3), port = dStrs(4), owned = group == pStrs(2) && project == pStrs(3) ) } }) toActionResultFromAny(items) }) } else { Future.successful(OkApiRes(ApiResError(getI18nMessage(ErrorMessages.error_ProxyDisabled.name)))) } } def putHttp(group: String, project: String, server: String) = Action(parse.byteString).async { implicit req => if (CoreConfig.linkerdConfig.enabled) { val proxyServer = CoreConfig.linkerdConfig.servers.find(_.tag.equals(server)).get val dtabs = req.bodyAs(classOf[Dtabs]) if (null != dtabs && null != dtabs.dtabs && dtabs.dtabs.nonEmpty) { var error: ErrorMessage = null val entries = ArrayBuffer[DtabEntry]() for (i <- 0 until dtabs.dtabs.length if null == error) { val item = dtabs.dtabs(i) error = item.isValid() entries += DtabEntry( s"${srcPrefix}${item.group}/${item.project}/${item.namespace}", s"${dstPrefix}${item.host}/${item.port}" ) } if (null == error) { NamerdV1Api.updateNamespaceDtabs(proxyServer.namerd, proxyServer.httpNs, entries)(HttpEngine.http).toOkResult } else { error.toFutureFail } } else { Future.successful(OkApiRes(ApiRes())) } } else { Future.successful(OkApiRes(ApiResError(getI18nMessage(ErrorMessages.error_ProxyDisabled.name)))) } } }
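getHttp splits each dtab prefix and dst on "/" and keeps only well-formed five-segment entries, accumulating DtabItems in an ArrayBuffer. The parsing step in isolation, with a simplified item type standing in for the asura model (illustrative):

import scala.collection.mutable.ArrayBuffer

object DtabParsing {
  // Simplified stand-in for asura's DtabItem.
  case class Item(group: String, project: String, namespace: String, host: String, port: String)

  // Keep only entries whose prefix and dst split into the expected 5 segments.
  def parse(entries: Seq[(String, String)]): Seq[Item] = {
    val items = ArrayBuffer[Item]()
    entries.foreach { case (prefix, dst) =>
      val p = prefix.split("/")
      val d = dst.split("/")
      if (p.length == 5 && d.length == 5) items += Item(p(2), p(3), p(4), d(3), d(4))
    }
    items.toSeq
  }

  def main(args: Array[String]): Unit =
    println(parse(Seq(("/svc/g1/p1/ns1", "/$/inet/10.0.0.1/8080")))) // List(Item(g1,p1,ns1,10.0.0.1,8080))
}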
Example 158
Source File: InterfaceMethodParamsActor.scala From asura with MIT License | 5 votes |
package asura.dubbo.actor import akka.actor.{ActorRef, Props, Status} import akka.pattern.pipe import akka.util.ByteString import asura.common.actor.BaseActor import asura.common.util.LogUtils import asura.dubbo.DubboConfig import asura.dubbo.actor.GenericServiceInvokerActor.GetInterfaceMethodParams import asura.dubbo.model.InterfaceMethodParams import asura.dubbo.model.InterfaceMethodParams.MethodSignature import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} class InterfaceMethodParamsActor(invoker: ActorRef, msg: GetInterfaceMethodParams) extends BaseActor { implicit val ec: ExecutionContext = context.dispatcher private val telnet: ActorRef = context.actorOf(TelnetClientActor.props(msg.address, if (msg.port > 0) msg.port else DubboConfig.DEFAULT_PORT, self)) override def receive: Receive = { case telnetData: ByteString => val utf8String = telnetData.utf8String if (utf8String.contains(TelnetClientActor.MSG_CONNECT_TO)) { log.debug(utf8String) if (utf8String.contains(TelnetClientActor.MSG_SUCCESS)) { telnet ! ByteString(s"ls -l ${msg.ref}\r\n") } else if (utf8String.contains(TelnetClientActor.MSG_FAIL)) { Future.failed(new RuntimeException(s"Remote connection to ${msg.address}:${msg.port} failed")) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE context stop self } else { Future.failed(new RuntimeException(s"Unknown response ${utf8String}")) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE context stop self } } else if (utf8String.contains("(") && utf8String.contains(")")) { getInterfaceMethodParams(msg.ref, utf8String) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE } else { Future.failed(new RuntimeException(s"Unknown response: ${utf8String}")) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE context stop self } case Status.Failure(t) => val stackTrace = LogUtils.stackTraceToString(t) log.warning(stackTrace) context stop self } def getInterfaceMethodParams(ref: String, content: String): Future[InterfaceMethodParams] = { Future.successful { val methods = ArrayBuffer[MethodSignature]() content.split("\r\n") .filter(!_.startsWith(DubboConfig.DEFAULT_PROMPT)) .map(signature => { val splits = signature.split(" ") if (splits.length == 2) { val ret = splits(0) val secondPart = splits(1) val idx = secondPart.indexOf("(") val method = secondPart.substring(0, idx) val params = secondPart.substring(idx + 1, secondPart.length - 1).split(",") methods += (MethodSignature(ret, method, params)) } }) InterfaceMethodParams(ref, methods) } } override def postStop(): Unit = log.debug(s"${self.path} stopped") } object InterfaceMethodParamsActor { def props(invoker: ActorRef, msg: GetInterfaceMethodParams) = { Props(new InterfaceMethodParamsActor(invoker, msg)) } }
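getInterfaceMethodParams parses telnet "ls -l" output lines of the form "returnType methodName(paramA,paramB)" into MethodSignature values collected in an ArrayBuffer. The same string handling in isolation (simplified signature type and sample input, both illustrative):

import scala.collection.mutable.ArrayBuffer

object SignatureParsing {
  case class MethodSig(ret: String, method: String, params: Seq[String])

  // Parse "returnType name(p1,p2)" lines, skipping anything that does not match the shape.
  def parse(lines: Seq[String]): Seq[MethodSig] = {
    val methods = ArrayBuffer[MethodSig]()
    lines.foreach { line =>
      val splits = line.split(" ")
      if (splits.length == 2 && splits(1).contains("(")) {
        val ret = splits(0)
        val rest = splits(1)
        val idx = rest.indexOf("(")
        val name = rest.substring(0, idx)
        val params = rest.substring(idx + 1, rest.length - 1).split(",")
        methods += MethodSig(ret, name, params.toSeq)
      }
    }
    methods.toSeq
  }

  def main(args: Array[String]): Unit =
    println(parse(Seq("java.lang.String sayHello(java.lang.String,int)")))
}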
Example 159
Source File: JobReportDataItemSaveActor.scala From asura with MIT License | 5 votes |
package asura.core.job.actor import akka.actor.{Props, Status} import asura.common.actor.BaseActor import asura.common.util.LogUtils import asura.core.actor.messages.Flush import asura.core.es.model.JobReportDataItem import asura.core.es.service.JobReportDataItemService import asura.core.job.actor.JobReportDataItemSaveActor.SaveReportDataHttpItemMessage import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ class JobReportDataItemSaveActor(dayIndexSuffix: String) extends BaseActor { val messages = ArrayBuffer[SaveReportDataHttpItemMessage]() override def receive: Receive = { case m: SaveReportDataHttpItemMessage => messages += m if (messages.length >= 10) { insert() } context.system.scheduler.scheduleOnce(2 seconds) { self ! Flush }(context.system.dispatcher) case Flush => insert() case Status.Failure(t) => log.warning(LogUtils.stackTraceToString(t)) } override def preStart(): Unit = { } override def postStop(): Unit = { insert() log.debug(s"${self.path} is stopped") } private def insert(): Unit = { if (messages.length > 0) { log.debug(s"${messages.length} items is saving...") JobReportDataItemService.index(messages, dayIndexSuffix) messages.clear() } } } object JobReportDataItemSaveActor { def props(dayIndexSuffix: String) = Props(new JobReportDataItemSaveActor(dayIndexSuffix)) case class SaveReportDataHttpItemMessage(id: String, dataItem: JobReportDataItem) }
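The actor buffers incoming messages in an ArrayBuffer and flushes either when the buffer reaches a size threshold or on a scheduled Flush tick; the same shape recurs in Examples 164 and 165 below. The batch-and-flush pattern without Akka or Elasticsearch (hypothetical sink and threshold, for illustration only):

import scala.collection.mutable.ArrayBuffer

// Accumulate items and hand them to `sink` in batches, as the save actors above do.
class BatchBuffer[T](threshold: Int)(sink: Seq[T] => Unit) {
  private val items = ArrayBuffer[T]()

  def add(item: T): Unit = {
    items += item
    if (items.length >= threshold) flush()
  }

  // Called on a timer tick or on shutdown in the actor version.
  def flush(): Unit = {
    if (items.nonEmpty) {
      sink(items.toList) // copy out before clearing the buffer
      items.clear()
    }
  }
}

object BatchBufferDemo extends App {
  val buf = new BatchBuffer[String](threshold = 3)(batch => println(s"saving ${batch.size} items"))
  Seq("a", "b", "c", "d").foreach(buf.add)
  buf.flush() // flush the one remaining item
}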
Example 160
Source File: JobStatusActor.scala From asura with MIT License | 5 votes |
package asura.core.job.actor import akka.actor.Status.Failure import akka.actor.{ActorRef, Props} import asura.common.actor._ import asura.common.model.Pagination import asura.core.model.QueryJob import asura.core.es.service.JobService import asura.core.job.actor.JobStatusMonitorActor.JobStatusOperationMessage import asura.core.job.eventbus.JobStatusBus.JobStatusNotificationMessage import asura.core.job.{JobListItem, JobStates} import asura.core.redis.RedisJobState import asura.core.util.JacksonSupport import com.typesafe.scalalogging.Logger import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class JobStatusActor() extends BaseActor { var query: QueryJob = null val watchIds = mutable.HashSet[String]() override def receive: Receive = { case SenderMessage(sender) => context.become(query(sender)) } def query(outSender: ActorRef): Receive = { case query: QueryJob => this.query = query JobService.queryJob(query).map(esResponse => if (esResponse.isSuccess) { val items = ArrayBuffer[JobListItem]() val jobsTable = mutable.HashMap[String, JobListItem]() val hits = esResponse.result.hits watchIds.clear() hits.hits.foreach(hit => { val jobId = hit.id watchIds.add(jobId) jobsTable += (jobId -> { val item = JacksonSupport.parse(hit.sourceAsString, classOf[JobListItem]) item.state = JobStates.UNKNOWN items += item item._id = jobId item }) }) if (watchIds.nonEmpty) { RedisJobState.getJobState(watchIds.toSet).onComplete { case util.Success(statesMap) => statesMap.forEach((jobKey, state) => jobsTable(jobKey).state = state) outSender ! ListActorEvent(Map("total" -> hits.total, "list" -> items)) case util.Failure(_) => outSender ! ListActorEvent(Map("total" -> hits.total, "list" -> items)) }(context.system.dispatcher) } else { outSender ! ListActorEvent(Map("total" -> 0, "list" -> Nil)) } } else { outSender ! ErrorActorEvent(esResponse.error.reason) })(context.system.dispatcher) case JobStatusNotificationMessage(_, operator, scheduler, group, name, data) => if (watchIds.contains(name)) { outSender ! ItemActorEvent(JobStatusOperationMessage(operator, scheduler, group, name, data)) } case eventMessage: ActorEvent => outSender ! eventMessage case Failure(t) => outSender ! ErrorActorEvent(t.getMessage) } override def postStop(): Unit = { import JobStatusActor.logger logger.debug(s"JobStatus for ${query} stopped") } } object JobStatusActor { val logger = Logger(classOf[JobStatusActor]) def props() = Props(new JobStatusActor()) case class JobQueryMessage(scheduler: String = null, group: String = null, text: String = null) extends Pagination }
Example 161
Source File: HeaderUtils.scala From asura with MIT License | 5 votes |
package asura.core.http import akka.http.scaladsl.model.HttpHeader.ParsingResult.{Error, Ok} import akka.http.scaladsl.model.headers.{Cookie, RawHeader} import akka.http.scaladsl.model.{ErrorInfo, HttpHeader} import asura.common.util.StringUtils import asura.core.es.model.{Environment, HttpCaseRequest} import asura.core.runtime.RuntimeContext import asura.core.{CoreConfig, ErrorMessages} import com.typesafe.scalalogging.Logger import scala.collection.immutable import scala.collection.mutable.ArrayBuffer object HeaderUtils { val logger = Logger("HeaderUtils") def toHeaders(cs: HttpCaseRequest, context: RuntimeContext): immutable.Seq[HttpHeader] = { val headers = ArrayBuffer[HttpHeader]() val request = cs.request val env = if (null != context.options) context.options.getUsedEnv() else null if (null != request) { val headerSeq = request.header if (null != headerSeq) { for (h <- headerSeq if (h.enabled && StringUtils.isNotEmpty(h.key))) { HttpHeader.parse(h.key, context.renderSingleMacroAsString(h.value)) match { case Ok(header: HttpHeader, errors: List[ErrorInfo]) => if (errors.nonEmpty) logger.warn(errors.mkString(",")) headers += header case Error(error: ErrorInfo) => logger.warn(error.detail) } } } val cookieSeq = request.cookie if (null != cookieSeq) { val cookies = ArrayBuffer[(String, String)]() for (c <- cookieSeq if (c.enabled && StringUtils.isNotEmpty(c.key))) { cookies += ((c.key, context.renderSingleMacroAsString(c.value))) } if (cookies.nonEmpty) headers += Cookie(cookies: _*) } } if (null != env && null != env.headers && env.headers.nonEmpty) { for (h <- env.headers if (h.enabled && StringUtils.isNotEmpty(h.key))) { HttpHeader.parse(h.key, context.renderSingleMacroAsString(h.value)) match { case Ok(header: HttpHeader, errors: List[ErrorInfo]) => if (errors.nonEmpty) logger.warn(errors.mkString(",")) headers += header case Error(error: ErrorInfo) => logger.warn(error.detail) } } } if (null != env && env.enableProxy) { val headerIdentifier = validateProxyVariables(env) val dst = StringBuilder.newBuilder dst.append("/").append(cs.group).append("/").append(cs.project).append("/").append(env.namespace) headers += RawHeader(headerIdentifier, dst.toString) } headers.toList } def validateProxyVariables(env: Environment): String = { if (!CoreConfig.linkerdConfig.enabled) { throw ErrorMessages.error_ProxyDisabled.toException } if (StringUtils.isEmpty(env.namespace)) { throw ErrorMessages.error_EmptyNamespace.toException } if (StringUtils.isEmpty(env.server)) { throw ErrorMessages.error_EmptyProxyServer.toException } val proxyServerOpt = CoreConfig.linkerdConfig.servers.find(_.tag.equals(env.server)) if (proxyServerOpt.isEmpty && StringUtils.isEmpty(proxyServerOpt.get.headerIdentifier)) { throw ErrorMessages.error_InvalidProxyConfig.toException } else { proxyServerOpt.get.headerIdentifier } } def isApplicationJson(header: HttpHeader): Boolean = { if (header.lowercaseName().equals("content-type")) { header.value().contains(HttpContentTypes.JSON) } else { false } } }
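toHeaders walks the request and environment header definitions, keeps only entries that are enabled and have a non-empty key, and appends each successfully parsed header (plus a combined Cookie header) to an ArrayBuffer. One detail worth noting: in validateProxyVariables the guard proxyServerOpt.isEmpty && StringUtils.isEmpty(proxyServerOpt.get.headerIdentifier) calls .get on an empty Option whenever no matching server exists, so the intended operator is presumably ||. Below is a dependency-free sketch of the filter-render-accumulate loop itself; KeyValue and render are illustrative stand-ins for the project's header model and macro rendering, not asura APIs.

import scala.collection.mutable.ArrayBuffer

object HeaderBuilding {
  case class KeyValue(key: String, value: String, enabled: Boolean)

  // Render enabled, non-empty entries through `render` (a stand-in for macro expansion).
  def toHeaders(defs: Seq[KeyValue])(render: String => String): List[(String, String)] = {
    val headers = ArrayBuffer[(String, String)]()
    for (h <- defs if h.enabled && h.key.nonEmpty) {
      headers += ((h.key, render(h.value)))
    }
    headers.toList
  }

  def main(args: Array[String]): Unit = {
    val defs = Seq(KeyValue("X-Trace", "{{traceId}}", enabled = true), KeyValue("", "skipped", enabled = true))
    println(toHeaders(defs)(v => v.replace("{{traceId}}", "abc-123"))) // List((X-Trace,abc-123))
  }
}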
Example 162
package asura.core.assertion import asura.core.concurrent.ExecutionContextManager.cachedExecutor import asura.core.assertion.engine.{AssertResult, AssertionContext, FailAssertResult, Statistic} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future object Or extends Assertion { override val name: String = Assertions.OR override def assert(actual: Any, expect: Any): Future[AssertResult] = { apply(actual, expect) } def apply(actual: Any, except: Any): Future[AssertResult] = { val result = AssertResult( isSuccessful = false, msg = AssertResult.MSG_FAILED ) val subResults = ArrayBuffer[mutable.Map[String, Any]]() result.subResult = subResults except match { case assertions: Seq[_] => if (assertions.nonEmpty) { val assertionResults = assertions.map(assertion => { val subStatis = Statistic() val assertionMap = assertion.asInstanceOf[Map[String, Any]] val contextMap = actual.asInstanceOf[Object] AssertionContext.eval(assertionMap, contextMap, subStatis).map((subStatis, _)) }) Future.sequence(assertionResults).map(subStatisResults => { val subResults = ArrayBuffer[java.util.Map[String, Any]]() result.subResult = subResults subStatisResults.foreach(subStatisResult => { val (subStatis, subResult) = subStatisResult subResults += subResult result.pass(subStatis.passed) result.fail(subStatis.failed) if (subStatis.isSuccessful) { result.isSuccessful = true result.msg = AssertResult.MSG_PASSED } }) result }) } else { Future.successful(null) } case _ => Future.successful(FailAssertResult(1, AssertResult.msgIncomparableTargetType(except))) } } }
Example 163
package asura.core.assertion import asura.core.concurrent.ExecutionContextManager.cachedExecutor import asura.core.assertion.engine.{AssertResult, AssertionContext, FailAssertResult, Statistic} import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future object And extends Assertion { override val name: String = Assertions.AND override def assert(actual: Any, expect: Any): Future[AssertResult] = { apply(actual, expect) } def apply(actual: Any, expect: Any): Future[AssertResult] = { val result = AssertResult( isSuccessful = true, msg = AssertResult.MSG_PASSED ) expect match { case assertions: Seq[_] => if (assertions.nonEmpty) { val assertionResults = assertions.map(assertion => { val subStatis = Statistic() val assertionMap = assertion.asInstanceOf[Map[String, Any]] val contextMap = actual.asInstanceOf[Object] AssertionContext.eval(assertionMap, contextMap, subStatis).map((subStatis, _)) }) Future.sequence(assertionResults).map(subStatisResults => { val subResults = ArrayBuffer[java.util.Map[String, Any]]() result.subResult = subResults subStatisResults.foreach(subStatisResult => { val (subStatis, subResult) = subStatisResult subResults += subResult result.pass(subStatis.passed) result.fail(subStatis.failed) if (!subStatis.isSuccessful) { result.isSuccessful = false result.msg = AssertResult.MSG_FAILED } }) result }) } else { Future.successful(null) } case _ => Future.successful(FailAssertResult(1, AssertResult.msgIncomparableTargetType(expect))) } } }
Example 164
Source File: TriggerEventsSaveActor.scala From asura with MIT License | 5 votes |
package asura.core.es.actor import akka.actor.{Props, Status} import asura.common.actor.BaseActor import asura.common.util.LogUtils import asura.core.actor.messages.Flush import asura.core.es.model.TriggerEventLog import asura.core.es.service.TriggerEventLogService import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ class TriggerEventsSaveActor extends BaseActor { val logs = ArrayBuffer[TriggerEventLog]() override def receive: Receive = { case m: TriggerEventLog => logs += m if (logs.length >= 20) { insert() } context.system.scheduler.scheduleOnce(2 seconds) { self ! Flush }(context.system.dispatcher) case Flush => insert() case Status.Failure(t) => log.warning(LogUtils.stackTraceToString(t)) } override def preStart(): Unit = { } override def postStop(): Unit = { insert() log.debug(s"${self.path} is stopped") } private def insert(): Unit = { if (logs.length > 0) { log.debug(s"${logs.length} trigger events is saving...") TriggerEventLogService.index(logs) logs.clear() } } } object TriggerEventsSaveActor { def props() = Props(new TriggerEventsSaveActor()) }
Example 165
Source File: ActivitySaveActor.scala From asura with MIT License | 5 votes |
package asura.core.es.actor import akka.actor.{Props, Status} import asura.common.actor.BaseActor import asura.common.util.LogUtils import asura.core.actor.messages.Flush import asura.core.es.model.Activity import asura.core.es.service.ActivityService import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ class ActivitySaveActor extends BaseActor { val activities = ArrayBuffer[Activity]() override def receive: Receive = { case m: Activity => activities += m if (activities.length >= 20) { insert() } context.system.scheduler.scheduleOnce(2 seconds) { self ! Flush }(context.system.dispatcher) case Flush => insert() case Status.Failure(t) => log.warning(LogUtils.stackTraceToString(t)) } override def preStart(): Unit = { } override def postStop(): Unit = { insert() log.debug(s"${self.path} is stopped") } private def insert(): Unit = { if (activities.length > 0) { log.debug(s"${activities.length} activities is saving...") ActivityService.index(activities) activities.clear() } } } object ActivitySaveActor { def props() = Props(new ActivitySaveActor()) }
Example 166
Source File: HttpResponse.scala From asura with MIT License | 5 votes |
package asura.core.es.model import asura.core.http.HttpContentTypes import io.swagger.models.properties.RefProperty import io.swagger.models.{ModelImpl, Response, Swagger} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer case class HttpResponse( description: String, headers: Seq[ParameterSchema], contentType: String, schema: JsonSchema ) { } object HttpResponse { def toResponses(openApi: Swagger, responses: mutable.Map[String, Response]): Map[String, HttpResponse] = { val definitions = openApi.getDefinitions val responseMap = mutable.Map[String, HttpResponse]() for ((code, res) <- responses) { val schema: JsonSchema = res.getSchema match { case p: RefProperty => definitions.get(p.getSimpleRef) match { case model: ModelImpl => JsonSchema.toJsonSchema(model) case _ => null } case _ => null } val headers = ArrayBuffer[ParameterSchema]() if (null != res.getHeaders) { res.getHeaders.forEach((name, property) => { headers += (ParameterSchema( name = name, description = property.getDescription, `type` = SchemaObject.translateOpenApiType(property.getType, property.getFormat) )) }) } responseMap += (code -> HttpResponse( description = res.getDescription, headers = headers.toList, contentType = HttpContentTypes.JSON, schema = schema )) } responseMap.toMap } }
Example 167
Source File: RecommendService.scala From asura with MIT License | 5 votes |
package asura.core.es.service import asura.common.util.StringUtils import asura.core.concurrent.ExecutionContextManager.sysGlobal import asura.core.es.model.{FieldKeys, Project} import asura.core.model.RecommendProject import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future object RecommendService { def getRecommendProjects(user: String, wd: String, discover: Boolean): Future[RecommendProjects] = { val futureTuple = for { my <- getRecommendProject(user, true, wd, 20, Nil) other <- if (discover) { getRecommendProject(user, false, null, 5, my.map(p => ((p.group, p.project)))) } else { Future.successful(Nil) } } yield (my, other) futureTuple.map(tuple => RecommendProjects(tuple._1, tuple._2)) } def getRecommendProject(user: String, me: Boolean, wd: String, size: Int, excludeGPs: Seq[(String, String)]): Future[Seq[RecommendProject]] = { val items = ArrayBuffer[RecommendProject]() ActivityService.recentProjects(user, me, wd, size).flatMap(aggItems => { if (aggItems.nonEmpty) { val map = mutable.Map[String, RecommendProject]() aggItems.foreach(item => { if (StringUtils.isNotEmpty(item.id)) { val gp = item.id.split("/") if (gp.length == 2) { val project = RecommendProject(gp(0), gp(1), item.count) items += project map += (Project.generateDocId(gp(0), gp(1)) -> project) } } }) ProjectService.getByIds(map.keys.toSeq, Seq(FieldKeys.FIELD_SUMMARY)).map(idMap => { idMap.foreach(tuple => { val id = tuple._1 val project = tuple._2 map(id).summary = project.summary }) items }) } else { Future.successful(items) } }) } case class RecommendProjects( my: Seq[RecommendProject], others: Seq[RecommendProject] ) }
Example 168
Source File: HomeService.scala From asura with MIT License | 5 votes |
package asura.core.es.service import asura.common.util.StringUtils import asura.core.concurrent.ExecutionContextManager.sysGlobal import asura.core.es.EsClient import asura.core.es.model._ import asura.core.model.QueryHome import com.sksamuel.elastic4s.ElasticDsl._ import com.sksamuel.elastic4s.requests.searches.queries.Query import scala.collection.mutable.ArrayBuffer object HomeService extends CommonService { val includeFields = Seq( FieldKeys.FIELD_GROUP, FieldKeys.FIELD_PROJECT, FieldKeys.FIELD_ID, FieldKeys.FIELD_AVATAR, FieldKeys.FIELD_SUMMARY, FieldKeys.FIELD_DESCRIPTION, FieldKeys.FIELD_OBJECT_REQUEST_URLPATH ) def queryDoc(query: QueryHome) = { EsClient.esClient.execute { val esQueries = ArrayBuffer[Query]() if (StringUtils.isNotEmpty(query.text)) esQueries += matchQuery(FieldKeys.FIELD__TEXT, query.text) search(Group.Index, Project.Index, HttpCaseRequest.Index, DubboRequest.Index, SqlRequest.Index, Environment.Index, Scenario.Index, Job.Index) .query(boolQuery().must(esQueries)) .sourceInclude(includeFields) .size(3) } } }
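A pattern that repeats across Examples 168 to 170: build up Elasticsearch query clauses by conditionally appending to an ArrayBuffer, then combine them with a bool/must query. The conditional-accumulation step with plain strings instead of elastic4s queries (purely illustrative):

import scala.collection.mutable.ArrayBuffer

object ConditionalClauses {
  // Append a clause only when its value is present, then AND everything together.
  def buildFilter(group: Option[String], project: Option[String], text: Option[String]): String = {
    val clauses = ArrayBuffer[String]()
    group.foreach(g => clauses += s"group = '$g'")
    project.foreach(p => clauses += s"project = '$p'")
    text.foreach(t => clauses += s"_text MATCHES '$t'")
    if (clauses.isEmpty) "true" else clauses.mkString(" AND ")
  }

  def main(args: Array[String]): Unit =
    println(buildFilter(Some("g1"), None, Some("pileup"))) // group = 'g1' AND _text MATCHES 'pileup'
}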
Example 169
Source File: TriggerEventLogService.scala From asura with MIT License | 5 votes |
package asura.core.es.service import asura.common.model.ApiMsg import asura.common.util.{FutureUtils, StringUtils} import asura.core.concurrent.ExecutionContextManager.sysGlobal import asura.core.es.EsClient import asura.core.es.model._ import asura.core.model.QueryCiEvents import asura.core.util.JacksonSupport.jacksonJsonIndexable import com.sksamuel.elastic4s.ElasticDsl._ import com.sksamuel.elastic4s.requests.searches.queries.Query import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future object TriggerEventLogService extends CommonService with BaseAggregationService { def index(items: Seq[TriggerEventLog]): Future[BulkDocResponse] = { if (null == items && items.isEmpty) { FutureUtils.illegalArgs(ApiMsg.INVALID_REQUEST_BODY) } else { EsClient.esClient.execute { bulk( items.map(item => indexInto(TriggerEventLog.Index).doc(item)) ) }.map(toBulkDocResponse(_)) } } def queryEvents(query: QueryCiEvents) = { val esQueries = ArrayBuffer[Query]() if (StringUtils.isNotEmpty(query.group)) esQueries += termQuery(FieldKeys.FIELD_GROUP, query.group) if (StringUtils.isNotEmpty(query.project)) esQueries += termQuery(FieldKeys.FIELD_PROJECT, query.project) if (StringUtils.isNotEmpty(query.env)) esQueries += termQuery(FieldKeys.FIELD_ENV, query.env) if (StringUtils.isNotEmpty(query.`type`)) esQueries += termQuery(FieldKeys.FIELD_TYPE, query.`type`) if (StringUtils.isNotEmpty(query.service)) esQueries += termQuery(FieldKeys.FIELD_SERVICE, query.service) EsClient.esClient.execute { search(TriggerEventLog.Index).query(boolQuery().must(esQueries)) .from(query.pageFrom) .size(query.pageSize) .sortByFieldDesc(FieldKeys.FIELD_CREATED_AT) } } }
Example 170
Source File: IndexService.scala From asura with MIT License | 5 votes |
package asura.core.es.service import asura.common.util.StringUtils import asura.core.concurrent.ExecutionContextManager.sysGlobal import asura.core.es.EsClient import asura.core.es.model.{FieldKeys, IndexSetting, JobReportDataItem, RestApiOnlineLog} import com.sksamuel.elastic4s.ElasticDsl._ import com.sksamuel.elastic4s.Indexes import com.sksamuel.elastic4s.requests.delete.DeleteByQueryRequest import com.sksamuel.elastic4s.requests.searches.queries.Query import com.typesafe.scalalogging.Logger import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future object IndexService extends CommonService { val logger = Logger("IndexService") def initCheck(idx: IndexSetting): Boolean = { val cli = EsClient.esClient val res = cli.execute(indexExists(idx.Index)).await if (res.isSuccess) { if (res.result.exists) { true } else { val res2 = cli.execute { createIndex(idx.Index) .shards(idx.shards) .replicas(idx.replicas) .mapping(idx.mappings) }.await if (res2.isSuccess) { true } else { logger.error(res2.error.reason) false } } } else { logger.error(res.error.reason) false } } def checkTemplate(): Boolean = { checkIndexTemplate(JobReportDataItem).await && checkIndexTemplate(RestApiOnlineLog).await } def checkIndexTemplate(idxSetting: IndexSetting): Future[Boolean] = { logger.info(s"check es template ${idxSetting.Index}") val cli = EsClient.esClient cli.execute { getIndexTemplate(idxSetting.Index) }.map { res => if (res.status != 404) true else false }.recover { case _ => false }.flatMap(hasTpl => { if (!hasTpl) { cli.execute { createIndexTemplate(idxSetting.Index, s"${idxSetting.Index}-*") .settings(Map( "number_of_replicas" -> idxSetting.replicas, "number_of_shards" -> idxSetting.shards )) .mappings(idxSetting.mappings) }.map(tplIndex => { if (tplIndex.result.acknowledged) true else false }) } else { Future.successful(true) } }) } def delIndex(indices: Seq[String]) = { EsClient.esClient.execute { deleteIndex(indices) }.map(toDeleteIndexResponse(_)) } def deleteByGroupOrProject(indices: Seq[String], group: String, project: String) = { val esQueries = ArrayBuffer[Query]() if (StringUtils.isNotEmpty(group)) esQueries += termQuery(FieldKeys.FIELD_GROUP, group) if (StringUtils.isNotEmpty(project)) esQueries += termQuery(FieldKeys.FIELD_PROJECT, project) EsClient.esClient.execute { DeleteByQueryRequest( Indexes(indices), boolQuery().must(esQueries) ).refreshImmediately }.map(toDeleteByQueryResponse(_)) } }
Example 171
Source File: ScalapropsRunner.scala From scalaprops with MIT License | 5 votes |
package scalaprops import sbt.testing._ import scala.collection.mutable.ArrayBuffer object ScalapropsRunner { def testFieldNames(clazz: Class[_]): Array[String] = Scalaprops.testFieldNames(clazz) private[scalaprops] def getTestObject( fingerprint: Fingerprint, testClassName: String, testClassLoader: ClassLoader ): Scalaprops = { ??? } private[scalaprops] def findTests( fingerprint: Fingerprint, testClassName: String, testClassLoader: ClassLoader, only: List[String], logger: Logger ): Properties[_] = { ??? } } final class ScalapropsRunner( override val args: Array[String], override val remoteArgs: Array[String], testClassLoader: ClassLoader ) extends Runner { private[this] val results = ArrayBuffer.empty[TestResult] private[this] val arguments = Arguments.parse(args.toList) private[this] val taskdef2task: TaskDef => sbt.testing.Task = { taskdef => new ScalapropsTaskImpl(taskdef, testClassLoader, args, arguments, results, TestStatus()) } override def tasks(taskDefs: Array[TaskDef]) = taskDefs.map(taskdef2task) override def done() = { val result = TestResult.formatResults(results, arguments.showDuration) println(result) result } }
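The runner collects TestResult values in ArrayBuffer.empty[TestResult], shared by the tasks it hands to sbt, and summarizes them in done(). A minimal result collector in the same spirit (the result type here is hypothetical). Note that ArrayBuffer itself is not thread-safe, so if tasks record results concurrently the shared buffer needs external synchronization.

import scala.collection.mutable.ArrayBuffer

object ResultCollection {
  case class TestResult(name: String, passed: Boolean)

  class Collector {
    private val results = ArrayBuffer.empty[TestResult]
    def record(r: TestResult): Unit = results += r
    // Equivalent of Runner.done(): summarize everything that was recorded.
    def done(): String = {
      val failed = results.count(!_.passed)
      s"${results.size} tests run, $failed failed"
    }
  }

  def main(args: Array[String]): Unit = {
    val c = new Collector
    c.record(TestResult("reverse", passed = true))
    c.record(TestResult("sort", passed = false))
    println(c.done()) // 2 tests run, 1 failed
  }
}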
Example 172
Source File: SolrTableFactory.scala From solr-sql with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.calcite.adapter.solr import scala.annotation.migration import scala.collection.JavaConversions import scala.collection.mutable.ArrayBuffer import org.apache.calcite.rel.`type`.RelDataType import org.apache.calcite.schema.SchemaPlus import org.apache.calcite.schema.TableFactory import org.apache.calcite.sql.`type`.SqlTypeName import org.apache.log4j.Logger import org.apache.solr.client.solrj.SolrClient import org.apache.solr.client.solrj.impl.CloudSolrClient import org.apache.solr.client.solrj.impl.HttpSolrClient trait SolrClientFactory { def getClient(): SolrClient; } class SolrTableFactory extends TableFactory[SolrTable] { val logger = Logger.getLogger(this.getClass); override def create(parentSchema: SchemaPlus, name: String, operands: java.util.Map[String, Object], rowTypw: RelDataType): SolrTable = { val args = JavaConversions.mapAsScalaMap(operands).toMap.map(x ⇒ (x._1, x._2.toString())); //columns="title string, url string, content_length int" SolrTableConf.argumentsRequired(args, SolrTableConf.COULMNS); val columns: Map[String, SqlTypeName] = SolrTableConf.parseColumns(args, SolrTableConf.COULMNS); logger.debug(s"defined columns: $columns"); //columnMapping="title->solr_field_title, url->solr_field_url" val definedColumnMapping = SolrTableConf.parseMap(args, SolrTableConf.COLUMN_MAPPING); logger.debug(s"defined column mapping: $definedColumnMapping"); val filledColumnMapping = columns.map(x ⇒ (x._1, definedColumnMapping.getOrElse(x._1, x._1))); //options="pageSize:20,solrZkHosts=10.0.71.14:2181,10.0.71.17:2181,10.0.71.38:2181" val options = args; //a singleton of solr client val solrClientFactory = new SolrClientFactory { val clients = ArrayBuffer[SolrClient](); override def getClient = { if (clients.isEmpty) { if (options.keySet.contains(SolrTableConf.SOLR_ZK_HOSTS)) { val solrZkHosts = options(SolrTableConf.SOLR_ZK_HOSTS); logger.debug(s"connecting to solr cloud via zookeeper servers: $solrZkHosts"); val csc = new CloudSolrClient(solrZkHosts); csc.setDefaultCollection(options("solrCollection")); clients += csc; } else { SolrTableConf.argumentsRequired(args, SolrTableConf.SOLR_ZK_HOSTS, SolrTableConf.SOLR_SERVER_URL); val solrServerURL = options(SolrTableConf.SOLR_SERVER_URL); logger.debug(s"connecting to solr server: $solrServerURL"); clients += new HttpSolrClient(solrServerURL); } } clients(0); } } new SolrTable(solrClientFactory, columns, filledColumnMapping, options); } }
Example 173
Source File: TSQR.scala From SparkAndMPIFactorizations with MIT License | 5 votes |
package edu.berkeley.cs.amplab.mlmatrix import java.util.concurrent.ThreadLocalRandom import scala.collection.mutable.ArrayBuffer import breeze.linalg._ import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils import edu.berkeley.cs.amplab.mlmatrix.util.Utils import org.apache.spark.rdd.RDD import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.Accumulator import org.apache.spark.SparkContext._ import java.util.Calendar import java.text.SimpleDateFormat class modifiedTSQR extends Serializable { def report(message: String, verbose: Boolean = true) = { val now = Calendar.getInstance().getTime() val formatter = new SimpleDateFormat("H:m:s") if (verbose) { println("STATUS REPORT (" + formatter.format(now) + "): " + message) } } private def reduceQR( acc: Accumulator[Double], a: Tuple2[DenseVector[Double], DenseMatrix[Double]], b: Tuple2[DenseVector[Double], DenseMatrix[Double]]): Tuple2[DenseVector[Double], DenseMatrix[Double]] = { val begin = System.nanoTime val outmat = QRUtils.qrR(DenseMatrix.vertcat(a._2, b._2), false) val outcolnorms = a._1 + b._1 acc += ((System.nanoTime - begin) / 1e6) (outcolnorms, outmat) } }
Example 174
Source File: ParallelizedWithLocalityRDD.scala From cloud-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.cloudera import scala.collection.immutable.NumericRange import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.rdd.{ParallelCollectionPartition, RDD} def slice[T: ClassTag](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = { if (numSlices < 1) { throw new IllegalArgumentException( "Positive number of partitions required") } // Sequences need to be sliced at the same set of index positions for operations // like RDD.zip() to behave as expected def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = { (0 until numSlices).iterator.map { i => val start = ((i * length) / numSlices).toInt val end = (((i + 1) * length) / numSlices).toInt (start, end) } } seq match { case r: Range => positions(r.length, numSlices).zipWithIndex .map { case ((start, end), index) => // If the range is inclusive, use inclusive range for the last slice if (r.isInclusive && index == numSlices - 1) { new Range.Inclusive(r.start + start * r.step, r.end, r.step) } else { new Range(r.start + start * r.step, r.start + end * r.step, r.step) } }.toSeq.asInstanceOf[Seq[Seq[T]]] case nr: NumericRange[T] => // For ranges of Long, Double, BigInteger, etc val slices = new ArrayBuffer[Seq[T]](numSlices) var r = nr for ((start, end) <- positions(nr.length, numSlices)) { val sliceSize = end - start slices += r.take(sliceSize).asInstanceOf[Seq[T]] r = r.drop(sliceSize) } slices case _ => val array = seq.toArray // To prevent O(n^2) operations for List etc positions(array.length, numSlices).map { case (start, end) => array.slice(start, end).toSeq }.toSeq } } }
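The slice helper cuts a sequence into numSlices pieces at the same integer index positions every time, so that operations like RDD.zip() see matching boundaries; the ArrayBuffer branch handles NumericRange inputs. The boundary arithmetic extracted into a runnable snippet (generic case only; the Range special cases are omitted):

object Slicing {
  // Same boundary arithmetic as ParallelizedWithLocalityRDD.slice.
  def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] =
    (0 until numSlices).iterator.map { i =>
      val start = ((i * length) / numSlices).toInt
      val end = (((i + 1) * length) / numSlices).toInt
      (start, end)
    }

  def slice[T](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = {
    require(numSlices >= 1, "Positive number of partitions required")
    val v = seq.toVector
    positions(v.length, numSlices).map { case (start, end) => v.slice(start, end) }.toVector
  }

  def main(args: Array[String]): Unit =
    println(slice(1 to 10, 3)) // three slices: (1,2,3), (4,5,6), (7,8,9,10)
}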
Example 175
Source File: DelTransfer.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup.converters

import scala.collection.mutable.ArrayBuffer

case class DelTransfer(contig: String, start: Int, len: Int) {
  val endDel: Int = start + len

  def isOverlappingLocus(queryContig: String, queryStart: Int): Boolean = {
    if (queryContig != contig || queryStart <= start) return false
    if (queryStart <= endDel) return true
    false
  }
}

class DelContext extends Serializable {
  private val minDelLen: Int = 0
  val dels: ArrayBuffer[DelTransfer] = new ArrayBuffer[DelTransfer]()

  def add(delTransfer: DelTransfer): Unit = {
    if (delTransfer.len <= minDelLen) return
    dels.append(delTransfer)
  }

  def getDelTransferForLocus(contig: String, position: Int): Int = {
    var counter = 0
    for (del <- dels) {
      if (del.isOverlappingLocus(contig, position)) counter += 1
    }
    counter
  }
}
Example 176
Source File: MDTagParser.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup import java.io.File import htsjdk.samtools.reference.IndexedFastaSequenceFile import htsjdk.samtools.{Cigar, CigarOperator, SAMRecord} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter import org.seqdoop.hadoop_bam.BAMBDGInputFormat import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer case class MDOperator(length: Int, base: Char) { //S means to skip n positions, not fix needed def isDeletion:Boolean = base.isLower def isNonDeletion:Boolean = base.isUpper } object MDTagParser{ val logger: Logger = Logger.getLogger(this.getClass.getCanonicalName) val pattern = "([0-9]+)\\^?([A-Za-z]+)?".r def parseMDTag(t : String) = { if (isAllDigits(t)) { Array[MDOperator](MDOperator(t.toInt, 'S')) } else { val ab = new ArrayBuffer[MDOperator]() val matches = pattern .findAllIn(t) while (matches.hasNext) { val m = matches.next() if(m.last.isLetter && !m.contains('^') ){ val skipPos = m.dropRight(1).toInt ab.append(MDOperator(skipPos, 'S') ) ab.append(MDOperator(0, m.last.toUpper)) } else if (m.last.isLetter && m.contains('^') ){ //encoding deletions as lowercase val arr = m.split('^') val skipPos = arr.head.toInt ab.append(MDOperator(skipPos, 'S') ) arr(1).foreach { b => ab.append(MDOperator(0, b.toLower)) } } else ab.append(MDOperator(m.toInt, 'S') ) } ab.toArray } } private def isAllDigits(s: String) : Boolean = { val len = s.length var i = 0 while(i < len){ if(! s(i).isDigit ) return false i += 1 } true } }
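parseMDTag turns an MD tag such as 10A5^AC6 into MDOperator tokens: runs of matching bases become skips ('S'), a mismatched reference base stays uppercase, and deleted bases (after '^') are lowercased. A quick standalone look at what the regex itself produces on a sample tag:

object MDTagRegexDemo {
  // Same token pattern as MDTagParser: a match count, optionally followed by '^' and bases.
  val pattern = "([0-9]+)\\^?([A-Za-z]+)?".r

  def main(args: Array[String]): Unit = {
    // 10 matches, mismatch A, 5 matches, deletion of AC, 6 matches.
    pattern.findAllIn("10A5^AC6").foreach(println) // prints: 10A, 5^AC, 6
  }
}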
Example 177
Source File: NCListBuilder.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList import scala.collection.mutable import scala.collection.mutable.ArrayBuffer object NCListBuilder { def build[T](array: Array[(Interval[Int], T)]): NCList = { val topNCList = NCList(ArrayBuffer.empty[NCList], 0, ArrayBuffer.empty[Int]) var landingNCList = NCList(ArrayBuffer.empty[NCList], 0, ArrayBuffer.empty[Int]) val arrayWithIndices = array.zipWithIndex.map{case (k,v) => (v,k)} val sortedIndices = arrayWithIndices.sortWith((x, y) => x._2._1.end > y._2._1.end) .sortWith((x, y) => x._2._1.start < y._2._1.start) .map(x => x._1) val stack = mutable.ArrayStack[NCListBuildingStack]() sortedIndices.foreach ( rgid => { val currentEnd = arrayWithIndices(rgid)._2._1.end while(!stack.isEmpty && arrayWithIndices(stack.top.rgid)._2._1.end < currentEnd) stack.pop landingNCList = if (stack.isEmpty) topNCList else stack.top.ncList val stackElt = appendNCListElt(landingNCList, rgid) stack.push(stackElt) }) topNCList } def appendNCListElt(landingNCList: NCList, rgid: Int): NCListBuildingStack = { landingNCList.childrenBuf.append(NCList(ArrayBuffer.empty[NCList], 0, ArrayBuffer.empty[Int])) val childrenNCList = landingNCList.childrenBuf.last val stackElt = NCListBuildingStack(childrenNCList,rgid) landingNCList.rgidBuf.append(rgid) landingNCList.nChildren = landingNCList.nChildren+1 stackElt } }
Example 178
Source File: NCListTree.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList import scala.collection.mutable.{ArrayBuffer, ArrayStack} import scala.util.control.Breaks._ class NCListTree[T](allRegions: Array[(Interval[Int], T)]) extends Serializable { val ncList = NCListBuilder.build(allRegions) def getAllOverlappings(processedInterval: Interval[Int]) = allOverlappingRegions(processedInterval, ncList, allRegions) private def allOverlappingRegions(processedInterval: Interval[Int], topNcList: NCList, intervalArray: Array[(Interval[Int],T)]): List[(Interval[Int], T)] = { val backpack = Backpack(intervalArray,processedInterval) var resultList = List[(Interval[Int], T)]() val walkingStack = ArrayStack[NCListWalkingStack]() var n = findLandingChild(topNcList, backpack) if (n < 0) return Nil var ncList = moveToChild(topNcList, n, walkingStack) while (ncList != null) { val stackElt = peekNCListWalkingStackElt(walkingStack) val rgid = stackElt.parentNcList.rgidBuf(stackElt.n) breakable { val candidateInterval = intervalArray(rgid) if (candidateInterval._1.start > backpack.processedInterval.end) { var n = (n1 + n2) / 2 while (n != n1) { b = base(subset(n))._1.end if (b == min) return n if (b < min) n1 = n else n2 = n n = (n1 + n2) / 2 } return n2 } private def moveToChild(parentNcList: NCList, n: Int, walkingStack: ArrayStack[NCListWalkingStack]): NCList = { walkingStack.push(NCListWalkingStack(parentNcList, n)) parentNcList.childrenBuf(n) } private def peekNCListWalkingStackElt(walkingStack: ArrayStack[NCListWalkingStack]): NCListWalkingStack = { walkingStack.top } private def moveToRightUncle(walkingStack: ArrayStack[NCListWalkingStack]): NCList = { val parentNcList = walkingStack.pop().parentNcList if (walkingStack.isEmpty) return null moveToRightSiblingOrUncle(parentNcList, walkingStack) } private def moveToRightSiblingOrUncle(ncList: NCList, walkingStack: ArrayStack[NCListWalkingStack]): NCList = { var ncListLocal = ncList do { val stackElt = walkingStack.pop() if ((stackElt.n+1) < stackElt.parentNcList.nChildren) { walkingStack.push(NCListWalkingStack(stackElt.parentNcList,stackElt.n+1)) ncListLocal = stackElt.parentNcList.childrenBuf(stackElt.n+1) return ncListLocal } else { walkingStack.push(NCListWalkingStack(stackElt.parentNcList,stackElt.n+1)) ncListLocal = stackElt.parentNcList walkingStack.pop() } } while (walkingStack.nonEmpty) null } }
Example 179
Source File: CoverageUpdate.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.coverage import org.apache.spark.util.AccumulatorV2 import scala.collection.mutable.ArrayBuffer case class RightCovEdge(contig: String, minPos: Int, startPoint: Int, cov: Array[Short], cumSum: Short) case class ContigRange(contig: String, minPos: Int, maxPos: Int) class CovUpdate(var right: ArrayBuffer[RightCovEdge], var left: ArrayBuffer[ContigRange]) extends Serializable { def reset(): Unit = { right = new ArrayBuffer[RightCovEdge]() left = new ArrayBuffer[ContigRange]() } def add(p: CovUpdate): CovUpdate = { right = right ++ p.right left = left ++ p.left this } } class CoverageAccumulatorV2(var covAcc: CovUpdate) extends AccumulatorV2[CovUpdate, CovUpdate] { def reset(): Unit = { covAcc = new CovUpdate(new ArrayBuffer[RightCovEdge](), new ArrayBuffer[ContigRange]()) } def add(v: CovUpdate): Unit = { covAcc.add(v) } def value(): CovUpdate = { covAcc } def isZero(): Boolean = { covAcc.right.isEmpty && covAcc.left.isEmpty } def copy(): CoverageAccumulatorV2 = { new CoverageAccumulatorV2(covAcc) } def merge(other: AccumulatorV2[CovUpdate, CovUpdate]): Unit = { covAcc.add(other.value) } }
Example 180
Source File: BufferBenchmark.scala From sigmastate-interpreter with MIT License | 5 votes |
package special.collections import debox.Buffer import spire.syntax.all.cfor import org.scalameter.api.Bench import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} trait BufferBenchmarkCases extends BenchmarkGens { suite: Bench[Double] => val obj = new Object() performance of "append[Int]" in { measure method "of debox.Buffer" in { using(arrays) in { case (arr, n) => val buf = Buffer.ofSize[Int](16) val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(arr(i)) } val res = buf.toArray() } } measure method "of ArrayBuilder" in { using(arrays) in { case (arr, n) => val buf = mutable.ArrayBuilder.make[Int]() val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf += (arr(i)) } val res = buf.result() } } measure method "of ArrayBuffer" in { using(arrays) in { case (arr, n) => val buf = ArrayBuffer.empty[Int] val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(arr(i)) } val res = buf.toArray } } measure method "of ListBuffer" in { using(arrays) in { case (arr, n) => val buf = ListBuffer.empty[Int] val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(arr(i)) } val res = buf.toList } } } performance of "append[Object]" in { measure method "of debox.Buffer" in { using(arrays) in { case (arr, n) => val buf = Buffer.ofSize[Object](100) val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(obj) } } } measure method "of ArrayBuilder" in { using(arrays) in { case (arr, n) => val buf = mutable.ArrayBuilder.make[Object]() val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf += (obj) } val res = buf.result() } } measure method "of ArrayBuffer" in { using(arrays) in { case (arr, n) => val buf = ArrayBuffer.empty[Object] val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(obj) } } } measure method "of ListBuffer" in { using(arrays) in { case (arr, n) => val buf = ListBuffer.empty[Object] val limit = arr.length cfor(0)(_ < limit, _ + 1) { i => buf.append(obj) } val res = buf.toList } } } } object FastBufferBenchmark extends Bench.LocalTime with BufferBenchmarkCases { }
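The scalameter suite compares debox.Buffer, ArrayBuilder, ArrayBuffer and ListBuffer when appending primitives and objects. A rough, unscientific way to try the ArrayBuffer versus ArrayBuilder comparison without scalameter (no warm-up or statistical treatment, so the numbers are only indicative):

import scala.collection.mutable

object QuickAppendTiming {
  def time[A](label: String)(body: => A): A = {
    val start = System.nanoTime()
    val result = body
    println(f"$label%-13s ${(System.nanoTime() - start) / 1e6}%.1f ms")
    result
  }

  def main(args: Array[String]): Unit = {
    val n = 5000000
    time("ArrayBuffer") {
      val buf = mutable.ArrayBuffer.empty[Int]
      var i = 0; while (i < n) { buf += i; i += 1 }
      buf.toArray
    }
    time("ArrayBuilder") {
      val b = mutable.ArrayBuilder.make[Int]()
      var i = 0; while (i < n) { b += i; i += 1 }
      b.result()
    }
  }
}

Scalameter exists precisely because such ad hoc timings are noisy; treat this only as a way to see the two APIs side by side.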
Example 181
Source File: HBaseLocalClient.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.hbase.utilities import java.io.File import scala.collection.mutable.ArrayBuffer import com.google.common.io.Files import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.util._ import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} import com.paypal.gimel.common.catalog.Field import com.paypal.gimel.hbase.DataSet class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll { var sparkSession : SparkSession = _ var dataSet: DataSet = _ val hbaseTestingUtility = new HBaseTestingUtility() val tableName = "test_table" val cfs = Array("personal", "professional") val columns = Array("id", "name", "age", "address", "company", "designation", "salary") val fields = columns.map(col => new Field(col)) val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] protected override def beforeAll(): Unit = { val tempDir: File = Files.createTempDir tempDir.deleteOnExit hbaseTestingUtility.startMiniCluster() SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration createTable(tableName, cfs) val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") sparkSession = SparkSession.builder() .master("local") .appName("HBase Test") .config(conf) .getOrCreate() val listener = new QueryExecutionListener { // Only test successful case here, so no need to implement `onFailure` override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { metrics += ((funcName, qe, duration)) } } sparkSession.listenerManager.register(listener) sparkSession.sparkContext.setLogLevel("ERROR") dataSet = new DataSet(sparkSession) } protected override def afterAll(): Unit = { hbaseTestingUtility.shutdownMiniCluster() sparkSession.close() } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { hbaseTestingUtility.deleteTable(TableName.valueOf(tName)) } catch { case _ : Throwable => println("No table = " + name + " found") } hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs) } // Mocks data for testing def mockDataInDataFrame(numberOfRows: Int): DataFrame = { def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }""" val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) } val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts) val dataFrame: DataFrame = sparkSession.read.json(rdd) dataFrame } }
Example 182
Source File: FriendEntity.scala From lagom-scala-chirper with Apache License 2.0 | 5 votes |
package sample.chirper.friend.impl import akka.Done import com.lightbend.lagom.scaladsl.persistence.PersistentEntity import com.lightbend.lagom.scaladsl.playjson.{JsonSerializer, JsonSerializerRegistry} import sample.chirper.friend.api.User import scala.collection.mutable.ArrayBuffer class FriendEntity extends PersistentEntity { override type Command = FriendCommand[_] override type Event = FriendEvent override type State = FriendState override def initialState = FriendState(None) override def behavior = { case FriendState(None) => notInitialized case FriendState(Some(user)) => initialized } val onGetUser = Actions().onReadOnlyCommand[GetUser, GetUserReply] { case (GetUser(), ctx, state) => ctx.reply(GetUserReply(state.user)) } val onFriendAdded = Actions().onEvent { case (FriendAdded(userId, friendId, timestamp), state) => state.addFriend(friendId) } val notInitialized = { Actions(). onCommand[CreateUser, Done] { case (CreateUser(user), ctx, state) => val events = ArrayBuffer.empty[FriendEvent] events += UserCreated(user.userId, user.name) events ++= user.friends.map(friendId => FriendAdded(user.userId, friendId)) ctx.thenPersistAll(events: _*) { () => ctx.reply(Done) } }. onCommand[AddFriend, Done] { case (AddFriend(friendUserId), ctx, state) => ctx.invalidCommand(s"User $entityId is not created") ctx.done }. onEvent { case (UserCreated(userId, name, timestamp), state) => FriendState(User(userId, name)) } }.orElse(onGetUser).orElse(onFriendAdded) val initialized = { Actions(). onCommand[CreateUser, Done] { case (CreateUser(user), ctx, state) => ctx.invalidCommand(s"User ${user.name} is already created") ctx.done }. onCommand[AddFriend, Done] { case (AddFriend(friendUserId), ctx, state) if state.user.get.friends.contains(friendUserId) => ctx.reply(Done) ctx.done case (AddFriend(friendUserId), ctx, state) => val event = FriendAdded(state.user.get.userId, friendUserId) ctx.thenPersist(event) { _ => ctx.reply(Done) } } }.orElse(onGetUser).orElse(onFriendAdded) } object FriendSerializerRegistry extends JsonSerializerRegistry { override def serializers = List( JsonSerializer[GetUser], JsonSerializer[GetUserReply], JsonSerializer[FriendState], JsonSerializer[CreateUser], JsonSerializer[UserCreated], JsonSerializer[AddFriend], JsonSerializer[FriendAdded] ) }
Example 183
Source File: TestContext.scala From swave with Mozilla Public License 2.0 | 5 votes |
package swave.core.internal.testkit import scala.annotation.tailrec import scala.collection.mutable.ArrayBuffer import org.scalacheck.rng.Seed import swave.core.macros._ import swave.core.impl.util.ResizableRingBuffer import swave.core.util._ private[testkit] final class TestContext(val runNr: Int, val asyncRate: Double, val asyncScheduling: TestGeneration.AsyncScheduling, val genSeed: Seed, tracing: Boolean) { import TestContext._ private[this] val schedulings = ArrayBuffer.empty[ResizableRingBuffer[Task]] val random = XorShiftRandom(genSeed.long._1) def lastId = schedulings.size - 1 def nextId(): Int = { schedulings += new ResizableRingBuffer[Task](16, 4096) schedulings.size - 1 } def trace(msg: ⇒ String)(implicit stage: TestStage): Unit = if (tracing) println(stage.toString + ": " + msg) def run(msg: ⇒ String)(block: ⇒ Unit)(implicit stage: TestStage): Unit = { val scheduled = schedulings(stage.id) if (scheduled.nonEmpty || random.decide(asyncRate)) { trace("(scheduling) " + msg) requireState(scheduled.write(new Task(stage, msg _, block _))) } else { trace("(sync) " + msg) block } } def hasSchedulings: Boolean = schedulings.exists(_.nonEmpty) @tailrec def processSchedulings(): Unit = if (hasSchedulings) { val snapshot: Array[ResizableRingBuffer[Task]] = schedulings.toArray def runSnapshots() = snapshot foreach { buf ⇒ runTasks(buf, buf.count) } @tailrec def runTasks(buf: ResizableRingBuffer[Task], count: Int): Unit = if (count > 0) { val task = buf.read() trace("(running) " + task.msg())(task.stage) task.block() runTasks(buf, count - 1) } asyncScheduling match { case TestGeneration.AsyncScheduling.InOrder ⇒ runSnapshots() case TestGeneration.AsyncScheduling.RandomOrder ⇒ random.shuffle_!(snapshot) runSnapshots() case TestGeneration.AsyncScheduling.ReversedOrder ⇒ snapshot.reverse_!() runSnapshots() case TestGeneration.AsyncScheduling.Mixed ⇒ @tailrec def rec(remaining: Array[ResizableRingBuffer[Task]]): Unit = if (remaining.nonEmpty) { random.shuffle_!(remaining) rec(remaining flatMap { buf ⇒ val jobsSize = buf.count runTasks(buf, random.nextInt(jobsSize + 1)) // at least one, at most all if (buf.nonEmpty) buf :: Nil else Nil }) } rec(snapshot) } processSchedulings() } } private[testkit] object TestContext { private class Task(val stage: TestStage, val msg: () ⇒ String, val block: () ⇒ Unit) }
Example 184
Source File: ercesiMIPSRunner.scala From ercesiMIPS with GNU General Public License v3.0 | 5 votes |
// See LICENSE.txt for license details. package utils import scala.collection.mutable.ArrayBuffer import scala.util.Properties.envOrElse object ercesiMIPSRunner { def apply(ercesiMIPSMap: Map[String, String => Boolean], args: Array[String]): Unit = { // Choose the default backend based on what is available. lazy val firrtlTerpBackendAvailable: Boolean = { try { val cls = Class.forName("chisel3.iotesters.FirrtlTerpBackend") cls != null } catch { case e: Throwable => false } } lazy val defaultBackend = if (firrtlTerpBackendAvailable) { "firrtl" } else { "" } val backendName = envOrElse("TESTER_BACKENDS", defaultBackend).split(" ").head val problemsToRun = if(args.isEmpty || args.head == "all" ) { ercesiMIPSMap.keys.toSeq.sorted.toArray } else { args } var successful = 0 val errors = new ArrayBuffer[String] for(testName <- problemsToRun) { ercesiMIPSMap.get(testName) match { case Some(test) => println(s"Starting ercesiMIPS $testName") try { if(test(backendName)) { successful += 1 } else { errors += s"ercesiMIPS $testName: test error occurred" } } catch { case exception: Exception => exception.printStackTrace() errors += s"ercesiMIPS $testName: exception ${exception.getMessage}" case t : Throwable => errors += s"ercesiMIPS $testName: throwable ${t.getMessage}" } case _ => errors += s"Bad ercesiMIPS name: $testName" } } if(successful > 0) { println(s"ercesiMIPSs passing: $successful") } if(errors.nonEmpty) { println("=" * 80) println(s"Errors: ${errors.length}: in the following commands") println(errors.mkString("\n")) println("=" * 80) System.exit(1) } } }
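For context, a hypothetical launcher that drives the runner above; the test names and bodies below are made up. Each map value is a String => Boolean that receives the chosen backend name and returns true on success; the runner prints a summary and exits with status 1 if any test fails.

// Hypothetical launcher for ercesiMIPSRunner; test names and bodies are illustrative.
object Launcher {
  // Each entry maps a test name to a function that receives the backend name.
  val tests: Map[String, String => Boolean] = Map(
    "alwaysPasses" -> { backend => println(s"running on backend '$backend'"); true },
    "alwaysFails"  -> { _ => false }
  )

  def main(args: Array[String]): Unit =
    // pass "all" (or no argument) to run everything, or individual test names
    utils.ercesiMIPSRunner(tests, args)
}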
Example 185
Source File: GzetPersons.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community.util import scala.collection.mutable.ArrayBuffer object GzetPersons { def buildTuples(array: Array[String]): Array[(String, String)] = { val holdingArray = ArrayBuffer[String]() val n = array.length val r = 2 val data = new Array[String](r) combinations(array, holdingArray, data, 0, n - 1, 0, r) val result = ArrayBuffer[(String, String)]() for (s: String <- holdingArray.toArray) { val split: Array[String] = s.split(",") result += ((split(0), split(1))) } result.toArray } def combinations(input: Array[String], result: ArrayBuffer[String], data: Array[String], start: Int, end: Int, index: Int, r: Int): Unit ={ if(index == r) { var s:String = "" for (i <- 0 until r) { if (i != 0) { s += "," } s += data(i) } result += s return } var j = start while(j <= end && (end - j + 1) >= (r - index)){ data(index) = input(j) combinations(input, result, data, j + 1, end, index + 1, r) j += 1 } } }
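A small, assumed driver for the helper above: buildTuples emits every unordered pair (2-combination) of the input names, so three inputs yield three pairs. Note that the intermediate comma-joined encoding means input strings containing commas would be split incorrectly.

// Illustrative only: exercising buildTuples on three names.
object GzetPersonsDemo extends App {
  import io.gzet.community.util.GzetPersons

  val pairs = GzetPersons.buildTuples(Array("alice", "bob", "carol"))
  pairs.foreach(println)
  // expected:
  // (alice,bob)
  // (alice,carol)
  // (bob,carol)
}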
Example 186
Source File: IncrementalSeq.scala From inox with Apache License 2.0 | 5 votes |
package inox.utils import scala.collection.mutable.Builder import scala.collection.mutable.ArrayBuffer import scala.collection.{Iterable, IterableLike} class IncrementalSeq[A] extends IncrementalState with Iterable[A] with IterableLike[A, Seq[A]] with Builder[A, IncrementalSeq[A]] { private[this] var stack: List[ArrayBuffer[A]] = List(new ArrayBuffer()) def clear() : Unit = { stack = List(new ArrayBuffer()) } def reset(): Unit = { clear() } def push(): Unit = { stack ::= stack.head.clone } def pop(): Unit = { stack = stack.tail } def iterator = stack.head.toList.iterator def +=(e: A) = { stack.head += e; this } def -=(e: A) = { stack.head -= e; this } override def newBuilder = new scala.collection.mutable.ArrayBuffer() def result = this }
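A short sketch (assumed, not from the inox sources) of the snapshot semantics above: push clones the current buffer onto the stack, so everything added afterwards is discarded again by pop.

// Illustrative push/pop behaviour of IncrementalSeq.
object IncrementalSeqDemo extends App {
  import inox.utils.IncrementalSeq

  val xs = new IncrementalSeq[Int]
  xs += 1
  xs.push()            // snapshot: [1]
  xs += 2
  println(xs.toList)   // List(1, 2)
  xs.pop()             // discard everything added since push()
  println(xs.toList)   // List(1)
}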
Example 187
Source File: MatchCollector.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.cep.ops

import dbis.piglet.cep.nfa.NFAStructure
import scala.reflect.ClassTag
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.ArrayBuffer
import dbis.piglet.backends.{SchemaClass => Event}

// Collects completed NFA match sequences and offers two views on them:
// the flattened events of all matches, or a single boolean flag.
class MatchCollector[T <: Event: ClassTag] extends Serializable {
  var matchSequences: ListBuffer[NFAStructure[T]] = new ListBuffer()

  def +(that: NFAStructure[T]): Unit = matchSequences += that

  def size: Int = matchSequences.size

  def convertEventsToArray(): ArrayBuffer[T] = {
    val events: ArrayBuffer[T] = new ArrayBuffer()
    matchSequences.foreach(seq => events ++= seq.events)
    events
  }

  def convertEventsToBoolean(): ArrayBuffer[Boolean] =
    ArrayBuffer(matchSequences.nonEmpty)
}
Example 188
Source File: NFAStructure.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.cep.nfa import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.collection.mutable.HashMap import dbis.piglet.backends.{SchemaClass => Event} import scala.collection.mutable.ListBuffer def addEvent(event: T, currentEdge: ForwardEdge[T]): Unit = { events += event //if (relatedValue != null) { // relatedValue.get(currentEdge.name.get) match { // case Some(x) => x.foreach (r => r.updateValue(event)) //case None => Nil //} //} currenState = currentEdge.destState if (currenState.isInstanceOf[FinalState[T]]) complete = true } override def clone(): NFAStructure[T] = { val copyStr = new NFAStructure[T](this.nfaController) copyStr.complete = this.complete copyStr.currenState = this.currenState copyStr.events = this.events.clone() //copyStr.events = this.events copyStr } }
Example 189
Source File: FlinkStreamingCEPTest.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.cep.test.flink import java.io.File import dbis.piglet.backends.{ Record, SchemaClass } import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import org.scalatest._ import org.apache.commons.io.FileUtils import org.apache.flink.api.scala._ import dbis.piglet.cep.nfa._ import dbis.piglet.cep.ops.SelectionStrategy._ import dbis.piglet.cep.ops.OutputStrategy._ import dbis.piglet.cep.flink.CustomDataStreamMatcher._ import scala.collection.mutable.ArrayBuffer import org.apache.flink.streaming.api.windowing.windows.GlobalWindow import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows case class StreamingDoubleRecord(col1: Int, col2: Int) extends java.io.Serializable with SchemaClass { override def mkString(delim: String) = s"$col1$delim$col2" } object OurStreamingNFA { def filter1(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 1 def filter2(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 2 def filter3(record: StreamingDoubleRecord, rvalues: NFAStructure[StreamingDoubleRecord]): Boolean = record.col1 == 3 def createNFA = { val testNFA: NFAController[StreamingDoubleRecord] = new NFAController() val firstState = testNFA.createAndGetStartState("First") val secondState = testNFA.createAndGetNormalState("Second") val thirdState = testNFA.createAndGetNormalState("Third") val finalState = testNFA.createAndGetFinalState("Final") val firstEdge = testNFA.createAndGetForwardEdge(filter1) val secondEdge = testNFA.createAndGetForwardEdge(filter2) val thirdEdge = testNFA.createAndGetForwardEdge(filter3) testNFA.createForwardTransition(firstState, firstEdge, secondState) testNFA.createForwardTransition(secondState, secondEdge, thirdState) testNFA.createForwardTransition(thirdState, thirdEdge, finalState) testNFA } } class FlinkStreamingCEPTest extends FlatSpec with Matchers with BeforeAndAfterEach { var resultArray = new ArrayBuffer[StreamingDoubleRecord] override def beforeEach() { resultArray.clear() } val sample = Seq( StreamingDoubleRecord(1,1), StreamingDoubleRecord(2,2), StreamingDoubleRecord(1,3), StreamingDoubleRecord(2,4), StreamingDoubleRecord(3,5), StreamingDoubleRecord(1,6), StreamingDoubleRecord(2,7), StreamingDoubleRecord(3,8)) "Flink Streaming CEP" should "detect the pattern SEQ(A, B, C) with first match" in { val env = StreamExecutionEnvironment.getExecutionEnvironment env.getConfig.disableSysoutLogging() val data = env.fromCollection(sample) val res = data.matchNFA(OurStreamingNFA.createNFA, env, FirstMatch) } it should "detect the pattern SEQ(A, B, C) with any match" in { val env = StreamExecutionEnvironment.getExecutionEnvironment env.getConfig.disableSysoutLogging() val data = env.fromCollection(sample) val res = data.matchNFA(OurStreamingNFA.createNFA, env, AllMatches) } it should "detect the pattern SEQ(A, B, C) with next match" in { val env = StreamExecutionEnvironment.getExecutionEnvironment env.getConfig.disableSysoutLogging() val data = env.fromCollection(sample) val res = data.matchNFA(OurStreamingNFA.createNFA, env, NextMatches) } it should "detect the pattern SEQ(A, B, C) with contiguity match" in { val env = StreamExecutionEnvironment.getExecutionEnvironment env.getConfig.disableSysoutLogging() val data = env.fromCollection(sample) val res = data.matchNFA(OurStreamingNFA.createNFA, env, ContiguityMatches) } }
Example 190
Source File: Cross.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.op import dbis.piglet.schema._ import scala.collection.mutable.ArrayBuffer case class Cross( private val out: Pipe, private val in: List[Pipe], timeWindow: (Int, String)= null.asInstanceOf[(Int, String)] ) extends PigOperator(List(out), in) { // require(in.size == 2, "Only two inputs allowed for CROSS, currently!") override def lineageString: String = { s"""CROSS%""" + super.lineageString } override def constructSchema: Option[Schema] = { val newFields = ArrayBuffer[Field]() inputs.foreach(p => p.producer.schema match { case Some(s) => newFields ++= s.fields map { f => Field(f.name, f.fType, p.name :: f.lineage) } case None => ??? }) schema = Some(Schema(BagType(TupleType(newFields.toArray)))) schema } override def toString = s"""CROSS | out = ${outPipeNames.mkString(",")} | in = ${inPipeNames.mkString(",")}""".stripMargin }
Example 191
package dbis.piglet.op import dbis.piglet.schema._ import scala.collection.mutable.ArrayBuffer case class Zip( private val out: Pipe, private val in: List[Pipe], withIndex: Boolean ) extends PigOperator(List(out), in) { require((in.size > 1 && !withIndex) || (in.size == 1 && withIndex), "zip with index works only with one input. Otherwise we must have at least two inputs") override def lineageString: String = { s"""ZIP%$withIndex""" + super.lineageString } override def constructSchema: Option[Schema] = { val newFields = inputs.flatMap(p => p.producer.schema match { case Some(s) => s.fields.map { f => Field(f.name, f.fType, p.name :: f.lineage) } case None => throw new UnsupportedOperationException(s"Cannot zip with unknown Schema! (input pipe $p)") }) schema = Some(Schema( BagType( TupleType( (if(withIndex) newFields :+ Field("index", Types.LongType) else newFields).toArray ) ) )) schema } override def toString = s"""ZIP | out = ${outPipeNames.mkString(",")} | in = ${inPipeNames.mkString(",")} | withIndex = $withIndex""".stripMargin }
Example 192
Source File: SpatialJoin.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.op import dbis.piglet.expr.SpatialJoinPredicate import dbis.piglet.op.IndexMethod.IndexMethod import dbis.piglet.op.PartitionMethod.PartitionMethod import dbis.piglet.schema._ import scala.collection.mutable.ArrayBuffer case class SpatialJoin( private val out: Pipe, private val in: List[Pipe], predicate: SpatialJoinPredicate, index: Option[(IndexMethod, List[String])], leftParti: Option[(PartitionMethod, List[String])], rightParti: Option[(PartitionMethod, List[String])] ) extends PigOperator(List(out), in) { override def lineageString: String = { s"""SPATIALJOIN%${predicate.toString()}%$index%""" + super.lineageString } override def constructSchema: Option[Schema] = { val newFields = ArrayBuffer[Field]() inputs.foreach(p => p.producer.schema match { case Some(s) => if(s.isIndexed) { newFields ++= s.element.valueType.asInstanceOf[IndexType] // a bag of Indexes .valueType.fields // An Index contains tuples with two fields: indexed column and payload .last.fType.asInstanceOf[TupleType] // payload is again a tuple .fields // fields in each tuple .map { f => Field(f.name, f.fType, p.name :: f.lineage) } } else { newFields ++= s.fields map { f => Field(f.name, f.fType, p.name :: f.lineage) } } case None => newFields += Field("", Types.ByteArrayType) }) schema = Some(Schema(BagType(TupleType(newFields.toArray)))) schema } override def toString = s"""SPATIALJOIN | out = $outPipeName | in = ${inPipeNames.mkString(",")} | inSchema = {${inputs.map(_.producer.schema).mkString(",")}} | outSchema = $schema | predicate = $predicate | index = $index""".stripMargin // }
Example 193
Source File: Union.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.op import dbis.piglet.schema._ import scala.collection.mutable.ArrayBuffer case class Union(private val out: Pipe, private val in: List[Pipe]) extends PigOperator(List(out), in) { override def lineageString: String = { s"""UNION%""" + super.lineageString } override def constructSchema: Option[Schema] = { val bagType = (p: Pipe) => p.producer.schema.get.element val generalizedBagType = (b1: BagType, b2: BagType) => { require(b1.valueType.fields.length == b2.valueType.fields.length) val newFields = ArrayBuffer[Field]() val fieldPairs = b1.valueType.fields.zip(b2.valueType.fields) for ((f1, f2) <- fieldPairs) { newFields += Field(f1.name, Types.escalateTypes(f1.fType, f2.fType)) } BagType(TupleType(newFields.toArray)) } // case 1: one of the input schema isn't known -> output schema = None if (inputs.exists(p => p.producer.schema.isEmpty)) { schema = None } else { // case 2: all input schemas have the same number of fields val s1 = inputs.head.producer.schema.get if (! inputs.tail.exists(p => s1.fields.length != p.producer.schema.get.fields.length)) { val typeList = inputs.map(p => bagType(p)) val resultType = typeList.reduceLeft(generalizedBagType) schema = Some(Schema(resultType)) } else { // case 3: the number of fields differ schema = None } } schema } override def toString = s"""UNION | out = $outPipeName | in = { ${inPipeNames.mkString(",")} } | inSchema = $inputSchema | outSchema = $schema""".stripMargin }
Example 194
Source File: JoinEmitter.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.codegen.flink.emitter import dbis.piglet.codegen.{ CodeEmitter, CodeGenContext, CodeGenException } import dbis.piglet.expr.Ref import dbis.piglet.op.Join import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Set import dbis.piglet.codegen.scala_lang.ScalaEmitter import scala.collection.mutable.ListBuffer import dbis.piglet.codegen.flink.FlinkHelper class JoinEmitter extends dbis.piglet.codegen.scala_lang.JoinEmitter { override def template: String = """ val <out> = <rel1><rels, rel1_keys, rel2_keys:{ r,k1, k2 | .join(<r>).where(<k1>).equalTo(<k2>)}>.map{ | t => | val <pairs> = t | <class>(<fields>) | }""".stripMargin override def code(ctx: CodeGenContext, op: Join): String = { if (!op.schema.isDefined) throw CodeGenException("schema required in JOIN") val res = op.inputs.zip(op.fieldExprs) val keys = res.map { case (i, k) => k.map { x => s"_${FlinkHelper.getOrderIndex(i.producer.schema, x)}" } } var keysGroup: ListBuffer[(List[String], List[String])] = new ListBuffer for (i <- 0 until keys.length - 1) { val v = (keys(i), keys(i + 1)) keysGroup += v } val keysGroup1 = keysGroup.zipWithIndex.map { case (i, k) => if (k > 0) (FlinkHelper.printQuote(i._1.map { x => s"_$k.$x" }), FlinkHelper.printQuote(i._2)) else (FlinkHelper.printQuote(i._1), FlinkHelper.printQuote(i._2)) } val keys1 = keysGroup1.map(x => x._1) val keys2 = keysGroup1.map(x => x._2) val className = op.schema match { case Some(s) => ScalaEmitter.schemaClassName(s.className) case None => ScalaEmitter.schemaClassName(op.outPipeName) } var pairs = "(v1,v2)" for (i <- 3 to op.inputs.length) { pairs = s"($pairs,v$i)" } val fieldList = ArrayBuffer[String]() for (i <- 1 to op.inputs.length) { op.inputs(i - 1).producer.schema match { case Some(s) => fieldList ++= s.fields.zipWithIndex.map { case (f, k) => s"v$i._$k" } case None => fieldList += s"v$i._0" } } render( Map("out" -> op.outPipeName, "rel1" -> op.inputs.head.name, "class" -> className, "rels" -> op.inputs.tail.map(_.name), "pairs" -> pairs, "rel1_keys" -> keys1, "rel2_keys" -> keys2, "fields" -> fieldList.mkString(", "))) } } object JoinEmitter { lazy val instance = new JoinEmitter }
Example 195
Source File: FlinkHelper.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.codegen.flink import dbis.piglet.codegen.CodeGenException import dbis.piglet.expr.NamedField import dbis.piglet.expr.PositionalField import dbis.piglet.schema.Schema import dbis.piglet.expr.Ref import dbis.piglet.op.PigOperator import scala.collection.mutable.ArrayBuffer object FlinkHelper { def getOrderIndex(schema: Option[Schema], ref: Ref): Int = schema match { case Some(s) => ref match { case nf @ NamedField(f, _) => s.indexOfField(nf) case PositionalField(pos) => pos case _ => 0 } case None => throw new CodeGenException(s"the Flink OrderBy/Join operator needs a schema, thus, invalid field ") } def emitJoinFieldList(node: PigOperator): (String, String) = { val rels = node.inputs var fields = "" var pairs = "(v,w)" if (rels.length == 2) { val vsize = rels.head.inputSchema.get.fields.length fields = node.schema.get.fields.zipWithIndex .map { case (f, i) => if (i < vsize) s"v._$i" else s"w._${i - vsize}" }.mkString(", ") } else { pairs = "(v1,v2)" for (i <- 3 to rels.length) { pairs = s"($pairs,v$i)" } val fieldList = ArrayBuffer[String]() for (i <- 1 to node.inputs.length) { node.inputs(i - 1).producer.schema match { case Some(s) => fieldList ++= s.fields.zipWithIndex.map { case (f, k) => s"v$i._$k" } case None => fieldList += s"v$i._0" } } fields = fieldList.mkString(", ") } (pairs, fields) } def printQuote(values: List[String]) = """"""" + values.mkString("""","""") + """"""" }
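As a quick illustration of the helpers above (the schema and values are assumed for the example): printQuote wraps each key expression in double quotes and joins them with commas, which is the form the generated .where(...)/.equalTo(...) calls expect, while getOrderIndex resolves a positional field reference to its index.

// Illustrative only; the schema is built the same way the operators above do it.
object FlinkHelperDemo extends App {
  import dbis.piglet.codegen.flink.FlinkHelper
  import dbis.piglet.expr.PositionalField
  import dbis.piglet.schema._

  val schema = Schema(BagType(TupleType(Array(
    Field("a", Types.ByteArrayType),
    Field("b", Types.ByteArrayType),
    Field("c", Types.ByteArrayType)))))

  println(FlinkHelper.getOrderIndex(Some(schema), PositionalField(2))) // 2
  println(FlinkHelper.printQuote(List("_0", "_1")))                    // "_0","_1"
}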
Example 196
Source File: TSNEStandardExample.scala From dl4scala with MIT License | 5 votes |
package org.dl4scala.examples.nlp.tsne

import java.io.File

import org.datavec.api.util.ClassPathResource
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.deeplearning4j.models.sequencevectors.sequence.SequenceElement
import org.deeplearning4j.models.word2vec.wordstore.VocabCache
import org.nd4j.linalg.api.buffer.DataBuffer
import org.nd4j.linalg.api.buffer.util.DataTypeUtil
import org.nd4j.linalg.primitives
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

object TSNEStandardExample {
  private val log = LoggerFactory.getLogger(TSNEStandardExample.getClass)

  def main(args: Array[String]): Unit = {
    // STEP 1: Initialization
    val iterations = 100
    // create an n-dimensional array of doubles
    DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE)
    val cacheList = new ArrayBuffer[String]() // cacheList is a dynamic array of strings used to hold all words

    // STEP 2: Turn text input into a list of words
    log.info("Load & Vectorize data....")
    val wordFile = new ClassPathResource("words.txt").getFile // Open the file
    // Get the data of all unique word vectors
    val vectors: primitives.Pair[InMemoryLookupTable[_ <: SequenceElement], VocabCache[_ <: SequenceElement]] =
      WordVectorSerializer.loadTxt(wordFile)
    val cache = vectors.getSecond
    val weights = vectors.getFirst.getSyn0
    // separate weights of unique words into their own list
    (0 until cache.numWords()).foreach(i => cacheList.append(cache.wordAtIndex(i)))

    import org.deeplearning4j.plot.BarnesHutTsne

    // STEP 3: build a dual-tree tsne to use later
    log.info("Build model....")
    val tsne = new BarnesHutTsne.Builder()
      .setMaxIter(iterations)
      .theta(0.5)
      .normalize(false)
      .learningRate(500)
      .useAdaGrad(false)
      .build

    // STEP 4: establish the tsne values and save them to a file
    log.info("Store TSNE Coordinates for Plotting....")
    val outputFile = "target/archive-tmp/tsne-standard-coords.csv"
    new File(outputFile).getParentFile.mkdirs
    tsne.fit(weights)
    tsne.saveAsFile(cacheList.asJava, outputFile)
  }
}
Example 197
Source File: MNISTVisualizer.scala From dl4scala with MIT License | 5 votes |
package org.dl4scala.examples.feedforward.anomalydetection import java.awt.{GridLayout, Image} import java.awt.image.BufferedImage import javax.swing.{ImageIcon, JFrame, JLabel, JPanel} import org.nd4j.linalg.api.ndarray.INDArray import scala.collection.mutable.ArrayBuffer class MNISTVisualizer(imageScale: Double, digits: ArrayBuffer[INDArray], title: String, gridWidth: Int) { def this(imageScale: Double, digits: ArrayBuffer[INDArray], title: String) = { this(imageScale, digits, title, 5) } def visualize(): Unit = { val frame = new JFrame frame.setTitle(title) frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE) val panel = new JPanel panel.setLayout(new GridLayout(0, gridWidth)) val list = getComponents for (image <- list) { panel.add(image) } frame.add(panel) frame.setVisible(true) frame.pack() } def getComponents: ArrayBuffer[JLabel] = { val images = new ArrayBuffer[JLabel]() for (arr <- digits) { val bi = new BufferedImage(28, 28, BufferedImage.TYPE_BYTE_GRAY) for(i <- 0 until 784) { bi.getRaster.setSample(i % 28, i / 28, 0, (255 * arr.getDouble(i)).asInstanceOf[Int]) } val orig = new ImageIcon(bi) val imageScaled = orig.getImage.getScaledInstance((imageScale * 28).asInstanceOf[Int], (imageScale * 28).asInstanceOf[Int], Image.SCALE_REPLICATE) val scaled = new ImageIcon(imageScaled) images.append(new JLabel(scaled)) } images } }
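A hypothetical driver for the visualizer above: it fills an ArrayBuffer with random 1x784 row vectors (values in [0,1]) that the visualizer renders as 28x28 images, then opens the Swing window. Nd4j.rand and its two-argument (rows, columns) form are assumed to be available from the ND4J dependency already used in this project.

// Hypothetical usage of MNISTVisualizer with random data instead of real MNIST digits.
object MNISTVisualizerDemo extends App {
  import org.dl4scala.examples.feedforward.anomalydetection.MNISTVisualizer
  import org.nd4j.linalg.api.ndarray.INDArray
  import org.nd4j.linalg.factory.Nd4j

  import scala.collection.mutable.ArrayBuffer

  // two random "digits", each a 1x784 row vector
  val digits: ArrayBuffer[INDArray] = ArrayBuffer(Nd4j.rand(1, 784), Nd4j.rand(1, 784))

  new MNISTVisualizer(2.0, digits, "Random digits").visualize()
}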
Example 198
Source File: GeneralNetwork.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.network import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ParSeq class GeneralNetwork[In, Out](var inputLayer: InputLayer[In, _]) extends Network[In, Out] { @deprecated(message = "This is for kryo deserialization. Please use this(inputlayer)") def this() = this(null) override def NOut: Int = layerSeq.lastOption match { case Some(x) ⇒ x.NOut case None if inputLayer != null ⇒ inputLayer.NOut case None ⇒ 0 } override def backward(error: ParSeq[DataVec]): ArrayBuffer[() ⇒ Unit] = { val (upper, fseq) = backwardSeq(error) val (x, f) = inputLayer backward upper fseq ++= f.seq fseq } override def broadcast(sc: SparkContext): Unit = { inputLayer.broadcast(sc) super.broadcast(sc) } override def forward(in: In) = { val out = inputLayer.forward(in) forwardSingle(out) } override def forward(in: ParSeq[In]): ParSeq[DataVec] = { val out = inputLayer.forward(in) forwardSeq(out) } override def forward(in: RDD[(Long, In)]): RDD[(Long, DataVec)] = { val out = inputLayer.forward(in) broadcast(in.context) forwardRDD(out) } override def initiateBy(builder: WeightBuilder): this.type = { inputLayer.initiateBy(builder) super.initiateBy(builder) this } override def loss: Double = super.loss + inputLayer.loss override def read(kryo: Kryo, input: Input): Unit = { inputLayer = kryo.readClassAndObject(input).asInstanceOf[InputLayer[In, _]] super.read(kryo, input) } override def setUpdatable(bool: Boolean): Network[In, Out] = { inputLayer.setUpdatable(bool) super.setUpdatable(bool) } override def unbroadcast(): Unit = { inputLayer.unbroadcast() super.unbroadcast() } override def write(kryo: Kryo, output: Output): Unit = { kryo.writeClassAndObject(output, inputLayer) super.write(kryo, output) } }
Example 199
Source File: ExtractStageHelpers.scala From akka-xml-parser with Apache License 2.0 | 5 votes |
package uk.gov.hmrc.akka.xml import com.fasterxml.aalto.{AsyncByteArrayFeeder, AsyncXMLStreamReader} import scala.collection.mutable.ArrayBuffer trait ExtractStageHelpers { def update(xmlElementsLst: scala.collection.mutable.Set[XMLGroupElement], path: ArrayBuffer[String], newValue: Some[String]): Unit = { val elementsWithoutAnyValueForGivenPath = xmlElementsLst.collect { case e: XMLGroupElement if (e.xPath == path.toList) && e.value.isEmpty => e } elementsWithoutAnyValueForGivenPath.map((ele: XMLGroupElement) => { xmlElementsLst.remove(ele) val newElement = ele.copy(value = newValue) xmlElementsLst.add(newElement) }) } def getCompletedXMLElements(xmlElementsLst: scala.collection.mutable.Set[XMLGroupElement]): scala.collection.mutable.Set[XMLGroupElement] = { val completedElements = xmlElementsLst.collect { case e if !(e.xPath.nonEmpty && e.value.isEmpty) => e } completedElements.foreach({ xmlElementsLst -= _ }) completedElements } }
Example 200
Source File: StreamHelper.scala From akka-xml-parser with Apache License 2.0 | 5 votes |
package uk.gov.hmrc.akka.xml import com.fasterxml.aalto.{AsyncByteArrayFeeder, AsyncXMLStreamReader} import scala.collection.mutable.ArrayBuffer trait StreamHelper { def update(xmlElementsLst: scala.collection.mutable.Set[XMLElement], path: ArrayBuffer[String], newValue: Some[String]): Unit = { val elementsWithoutAnyValueForGivenPath = xmlElementsLst.collect { case e: XMLElement if (e.xPath == path.toList) && e.value.isEmpty => e } elementsWithoutAnyValueForGivenPath.map((ele: XMLElement) => { xmlElementsLst.remove(ele) val newElement = ele.copy(value = newValue) xmlElementsLst.add(newElement) }) } def getCompletedXMLElements(xmlElementsLst: scala.collection.mutable.Set[XMLElement]): scala.collection.mutable.Set[XMLElement] = { val completedElements = xmlElementsLst.collect { case e if !(e.xPath.nonEmpty && (e.value.isEmpty && e.attributes.isEmpty)) => e } completedElements.foreach({ xmlElementsLst -= _ }) completedElements } def getUpdatedElement(xPath: Seq[String], attributes: Map[String, String], elemText: String) (implicit reader: AsyncXMLStreamReader[AsyncByteArrayFeeder]): String = { val prefix = getPrefix val startElement = attributes.foldLeft(s"<$prefix${xPath.last}") { case (s, (k, v)) => s"""$s $k="$v"""" } + ">" val value = elemText val endElement = getEndElement(xPath, prefix) s"$startElement$value$endElement" } private def getPrefix(implicit reader: AsyncXMLStreamReader[AsyncByteArrayFeeder]): String = Option(reader.getPrefix) match { case Some(pre) if pre.nonEmpty => s"$pre:" case _ => "" } private def getEndElement(xPath: Seq[String], prefix: String) = s"</$prefix${xPath.last}>" }