Example 1
Source File: IntegrationTest.scala From kmq with Apache License 2.0 | 6 votes |
package com.softwaremill.kmq.redelivery

import java.time.Duration
import java.util.Random

import akka.actor.ActorSystem
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerMessage, ProducerSettings, Subscriptions}
import akka.stream.ActorMaterializer
import akka.testkit.TestKit
import com.softwaremill.kmq._
import com.softwaremill.kmq.redelivery.infrastructure.KafkaSpec
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.scalatest.concurrent.Eventually
import org.scalatest.time.{Seconds, Span}
import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers}

import scala.collection.mutable.ArrayBuffer

/**
 * End-to-end test of the redelivery tracker: a stream consumes the message topic, writes a
 * "start" marker for every message, randomly skips processing of some messages (so no "end"
 * marker is written for them) and the test then expects every message to be processed
 * eventually.
 */
class IntegrationTest
    extends TestKit(ActorSystem("test-system"))
    with FlatSpecLike
    with KafkaSpec
    with BeforeAndAfterAll
    with Eventually
    with Matchers {

  implicit val materializer = ActorMaterializer()
  import system.dispatcher

  "KMQ" should "resend message if not committed" in {
    val bootstrapServer = s"localhost:${testKafkaConfig.kafkaPort}"
    val kmqConfig =
      new KmqConfig("queue", "markers", "kmq_client", "kmq_redelivery", Duration.ofSeconds(1).toMillis, 1000)

    val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer)
      .withBootstrapServers(bootstrapServer)
      .withGroupId(kmqConfig.getMsgConsumerGroupId)
      .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

    val markerProducerSettings =
      ProducerSettings(system, new MarkerKey.MarkerKeySerializer(), new MarkerValue.MarkerValueSerializer())
        .withBootstrapServers(bootstrapServer)
        .withProperty(ProducerConfig.PARTITIONER_CLASS_CONFIG, classOf[ParititionFromMarkerKey].getName)
    val markerProducer = markerProducerSettings.createKafkaProducer()

    val random = new Random()

    // Filled by the stream stages below, read by the assertions at the end of the test.
    val processedMessages = ArrayBuffer[String]()
    val receivedMessages = ArrayBuffer[String]()

    val control = Consumer
      .committableSource(consumerSettings, Subscriptions.topics(kmqConfig.getMsgTopic)) // 1. get messages from topic
      .map { msg =>
        ProducerMessage.Message(
          new ProducerRecord[MarkerKey, MarkerValue](
            kmqConfig.getMarkerTopic,
            MarkerKey.fromRecord(msg.record),
            new StartMarker(kmqConfig.getMsgTimeoutMs)),
          msg)
      }
      .via(Producer.flow(markerProducerSettings, markerProducer)) // 2. write the "start" marker
      .map(_.message.passThrough)
      .mapAsync(1) { msg =>
        msg.committableOffset.commitScaladsl().map(_ => msg.record) // this should be batched
      }
      .map { msg =>
        receivedMessages += msg.value
        msg
      }
      // Skip roughly one message in five: no "end" marker is written for it.
      .filter(_ => random.nextInt(5) != 0)
      .map { processedMessage =>
        processedMessages += processedMessage.value
        new ProducerRecord[MarkerKey, MarkerValue](
          kmqConfig.getMarkerTopic,
          MarkerKey.fromRecord(processedMessage),
          EndMarker.INSTANCE)
      }
      .to(Producer.plainSink(markerProducerSettings, markerProducer)) // 5. write "end" markers
      .run()

    val redeliveryHook = RedeliveryTracker.start(new KafkaClients(bootstrapServer), kmqConfig)

    // Release the tracker and the stream even when the assertions below time out; otherwise
    // a failing run leaves both running.
    try {
      val messages = (0 to 20).map(_.toString)
      messages.foreach(msg => sendToKafka(kmqConfig.getMsgTopic, msg))

      eventually {
        receivedMessages.size should be > processedMessages.size
        processedMessages.sortBy(_.toInt).distinct shouldBe messages
      }(PatienceConfig(timeout = Span(15, Seconds)), implicitly)
    } finally {
      redeliveryHook.close()
      control.shutdown()
    }
  }

  override def afterAll(): Unit = {
    super.afterAll()
    TestKit.shutdownActorSystem(system)
  }
}
Example 2
Source File: UndoSnackbarManager.scala From shadowsocksr-android with GNU General Public License v3.0 | 5 votes |
package com.github.shadowsocks.widget

// NOTE(review): the Snackbar import was garbled in this copy (a bare `import`); the support
// design library path is assumed because the code uses Snackbar.Callback/setCallback — confirm.
import android.support.design.widget.Snackbar
import android.view.View
import com.github.shadowsocks.R

import scala.collection.mutable.ArrayBuffer

/**
 * Collects removed list items in a "recycle bin" and offers an "undo" snackbar for them.
 *
 * @param view   view the snackbar is attached to
 * @param undo   invoked with the removed (index, item) pairs, most recently removed first,
 *               when the user taps the undo action
 * @param commit invoked with the removed (index, item) pairs when the snackbar goes away by
 *               swipe, manual dismissal or timeout; may be null, in which case nothing is called
 */
class UndoSnackbarManager[T](view: View, undo: Iterator[(Int, T)] => Unit,
                             commit: Iterator[(Int, T)] => Unit = null) {
  private val recycleBin = new ArrayBuffer[(Int, T)]

  private val removedCallback = new Snackbar.Callback {
    override def onDismissed(snackbar: Snackbar, event: Int) = {
      event match {
        case Snackbar.Callback.DISMISS_EVENT_SWIPE | Snackbar.Callback.DISMISS_EVENT_MANUAL |
             Snackbar.Callback.DISMISS_EVENT_TIMEOUT =>
          if (commit != null) commit(recycleBin.iterator)
          recycleBin.clear
        case _ =>
      }
      // Only forget the snackbar that was actually dismissed. When a newer snackbar has already
      // replaced it, `last` points at the newer one and must stay set so that `flush` still works.
      if (last eq snackbar) last = null
    }
  }

  private var last: Snackbar = _

  /** Records the removal of `item` at `index` and shows a snackbar offering to undo it. */
  def remove(index: Int, item: T): Unit = {
    recycleBin.append((index, item))
    val count = recycleBin.length
    last = Snackbar
      .make(view, view.getResources.getQuantityString(R.plurals.removed, count, count: Integer),
        Snackbar.LENGTH_LONG)
      .setCallback(removedCallback)
      .setAction(R.string.undo, (_ => {
        undo(recycleBin.reverseIterator)
        recycleBin.clear
      }): View.OnClickListener)
    // Without this call the snackbar is built but never displayed, so neither the undo action
    // nor the dismissal callback can ever fire.
    last.show()
  }

  /** Dismisses the snackbar currently tracked, if any. */
  def flush: Unit = if (last != null) last.dismiss
}
Example 3
Source File: SinkRouteHandler.scala From ohara with Apache License 2.0 | 5 votes |
package oharastream.ohara.shabondi.sink

import java.time.{Duration => JDuration}
import java.util.concurrent.TimeUnit

import akka.actor.ActorSystem
import akka.http.scaladsl.model.{ContentTypes, HttpEntity, StatusCodes}
import akka.http.scaladsl.server.{ExceptionHandler, Route}
import com.typesafe.scalalogging.Logger
// NOTE(review): this import was garbled in this copy (a bare `import`); `Row` is assumed to
// live in oharastream.ohara.common.data — confirm.
import oharastream.ohara.common.data.Row
import oharastream.ohara.common.util.Releasable
import oharastream.ohara.shabondi.common.{JsonSupport, RouteHandler, ShabondiUtils}
import org.apache.commons.lang3.StringUtils

import scala.collection.mutable.ArrayBuffer
import scala.compat.java8.DurationConverters._
import scala.concurrent.ExecutionContextExecutor
import scala.concurrent.duration.Duration
import scala.util.control.NonFatal
import spray.json.DefaultJsonProtocol._
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._

private[shabondi] object SinkRouteHandler {
  /** Creates a handler for the given sink configuration. */
  def apply(config: SinkConfig)(implicit actorSystem: ActorSystem) =
    new SinkRouteHandler(config)
}

/** HTTP route handler that serves the rows queued for a data group under `groups/<groupId>`. */
private[shabondi] class SinkRouteHandler(config: SinkConfig)(implicit actorSystem: ActorSystem) extends RouteHandler {
  implicit private val contextExecutor: ExecutionContextExecutor = actorSystem.dispatcher

  private val log = Logger(classOf[SinkRouteHandler])

  private[sink] val dataGroups = SinkDataGroups(config)

  /**
   * Schedules a recurring task on the actor system's scheduler that calls
   * `dataGroups.freeIdleGroup(idleTime)`.
   *
   * @param interval delay between two runs; the first run happens after one second
   * @param idleTime passed through to `freeIdleGroup`
   */
  def scheduleFreeIdleGroups(interval: JDuration, idleTime: JDuration): Unit =
    actorSystem.scheduler.scheduleWithFixedDelay(Duration(1, TimeUnit.SECONDS), interval.toScala) { () =>
      {
        log.trace("scheduled free group, total group: {} ", dataGroups.size)
        dataGroups.freeIdleGroup(idleTime)
      }
    }

  // Only non-fatal exceptions are turned into a 500 response; fatal errors (OutOfMemoryError,
  // InterruptedException, ...) keep propagating instead of being swallowed here.
  private val exceptionHandler = ExceptionHandler {
    case NonFatal(ex) =>
      // getMessage may be null; fall back to toString so neither the log call nor the
      // response entity receives a null string.
      val message = Option(ex.getMessage).getOrElse(ex.toString)
      log.error(message, ex)
      complete((StatusCodes.InternalServerError, message))
  }

  /** Polls `queue` until `poll()` returns null and returns the polled rows in order. */
  private def fullyPollQueue(queue: RowQueue): Seq[Row] = {
    val buffer = ArrayBuffer.empty[Row]
    var item: Row = queue.poll()
    while (item != null) {
      buffer += item
      item = queue.poll()
    }
    buffer.toSeq
  }

  private def apiUrl = ShabondiUtils.apiUrl

  /**
   * Route layout:
   *  - GET `groups/<groupId>` with an alphanumeric id: all rows currently queued for the group
   *  - GET `groups/<groupId>` with any other id: 406 Not Acceptable
   *  - any other method on `groups/<groupId>`: 405 Method Not Allowed
   *  - any other path: 404 Not Found
   */
  def route(): Route = handleExceptions(exceptionHandler) {
    path("groups" / Segment) { groupId =>
      get {
        if (StringUtils.isAlphanumeric(groupId)) {
          val group = dataGroups.createIfAbsent(groupId)
          val result = fullyPollQueue(group.queue).map(row => JsonSupport.toRowData(row))
          complete(result)
        } else {
          val entity =
            HttpEntity(ContentTypes.`text/plain(UTF-8)`, "Illegal group name, only accept alpha and numeric.")
          complete(StatusCodes.NotAcceptable -> entity)
        }
      } ~ {
        complete(StatusCodes.MethodNotAllowed -> s"Unsupported method, please reference: $apiUrl")
      }
    } ~ {
      complete(StatusCodes.NotFound -> s"Please reference: $apiUrl")
    }
  }

  override def close(): Unit = {
    Releasable.close(dataGroups)
  }
}
Example 4
Source File: CSVConverter.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake

import org.apache.spark.sql.types.StructType

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

/** Splits delimiter-separated, optionally quoted text lines into fields and converts them to rows. */
object CSVConverter {

  private final val delimiter = '|'
  private final val quoteChar = '"'

  /**
   * Converts every line of `partition` into a `T` via the row converter built for `resultSchema`.
   *
   * Parsing rules, as implemented below:
   *  - fields are separated by `|`;
   *  - a field starting with `"` is quoted: it runs until a `"` that is directly followed by
   *    `|` (or until the end of the line), and a `"` followed by any other character emits
   *    that character (so `""` yields a single `"`);
   *  - an empty field is passed on as `null`;
   *  - one extra field is always appended after the last delimiter-terminated one.
   *
   * @param partition    input lines
   * @param resultSchema schema handed to `Conversions.createRowConverter`
   * @return a lazily evaluated iterator of converted rows
   */
  private[snowflake] def convert[T: ClassTag](
    partition: Iterator[String],
    resultSchema: StructType
  ): Iterator[T] = {
    val converter = Conversions.createRowConverter[T](resultSchema)
    partition.map(s => {
      val fields = ArrayBuffer.empty[String]
      var buff = new StringBuilder

      // Moves the accumulated characters into `fields`; an empty accumulator becomes null.
      def addField(): Unit = {
        if (buff.isEmpty) fields.append(null)
        else {
          val field = buff.toString()
          buff = new StringBuilder
          fields.append(field)
        }
      }

      var escaped = false
      var index = 0

      while (index < s.length) {
        escaped = false
        if (s(index) == quoteChar) {
          // Quoted field: skip the opening quote, then read up to the closing quote + delimiter.
          index += 1
          while (index < s.length && !(escaped && s(index) == delimiter)) {
            if (escaped) {
              escaped = false
              buff.append(s(index))
            } else if (s(index) == quoteChar) escaped = true
            else buff.append(s(index))
            index += 1
          }
          addField()
        } else {
          // Plain field: read up to the next delimiter.
          while (index < s.length && s(index) != delimiter) {
            buff.append(s(index))
            index += 1
          }
          addField()
        }
        // Step over the delimiter that ended the field.
        index += 1
      }
      addField()
      converter(fields.toArray)
    })
  }
}
Example 5
Source File: InterfaceTreeSpec.scala From daml with Apache License 2.0 | 5 votes |
// Copyright (c) 2020 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package com.daml.lf.codegen

// NOTE(review): the three com.daml.lf.data imports were garbled in this copy; the paths below
// are assumed from the names the spec uses (ImmArray, ImmArraySeq, DottedName, ...) — confirm.
import com.daml.lf.data.ImmArray
import com.daml.lf.data.ImmArray.ImmArraySeq
import com.daml.lf.data.Ref.{DottedName, QualifiedName, PackageId}
import com.daml.lf.iface.{DefDataType, Interface, InterfaceType, Record, Variant}
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.mutable.ArrayBuffer

/** Checks the traversal order of `InterfaceTree.bfs` and tree construction via `fromInterface`. */
class InterfaceTreeSpec extends FlatSpec with Matchers {

  behavior of "InterfaceTree.bfs"

  it should "traverse an empty tree" in {
    val emptyInterface = Interface(PackageId.assertFromString("packageid"), Map.empty)
    val interfaceTree = InterfaceTree(Map.empty, emptyInterface)

    interfaceTree.bfs(0)((x, _) => x + 1) shouldEqual 0
  }

  it should "traverse a tree with n elements in bfs order" in {
    // foo:bar
    val qualifiedName1 = QualifiedName(
      DottedName.assertFromSegments(ImmArray("foo").toSeq),
      DottedName.assertFromSegments(ImmArray("bar").toSeq))
    val record1 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq())))

    // foo:bar.baz
    val qualifiedName2 = QualifiedName(
      DottedName.assertFromSegments(ImmArray("foo").toSeq),
      DottedName.assertFromSegments(ImmArray("bar", "baz").toSeq))
    val variant1 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Variant(ImmArraySeq())))

    // foo:qux
    val qualifiedName3 = QualifiedName(
      DottedName.assertFromSegments(ImmArray("foo").toSeq),
      DottedName.assertFromSegments(ImmArray("qux").toSeq))
    val record2 = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq())))

    val typeDecls =
      Map(qualifiedName1 -> record1, qualifiedName2 -> variant1, qualifiedName3 -> record2)
    val interface = new Interface(PackageId.assertFromString("packageId2"), typeDecls)
    val tree = InterfaceTree.fromInterface(interface)

    // Collect the types in visiting order; module nodes contribute nothing.
    val visited = tree.bfs(ArrayBuffer.empty[InterfaceType])((collected, node) =>
      node match {
        case _: ModuleWithContext => collected
        case TypeWithContext(_, _, _, _, typ) => collected ++= typ.typ.toList
      })

    visited should contain theSameElementsInOrderAs Seq(record1, record2, variant1)
  }

  behavior of "InterfaceTree.fromInterface"

  it should "permit standalone types with multi-component names" in {
    val bazQuux = QualifiedName(
      DottedName.assertFromSegments(ImmArray("foo", "bar").toSeq),
      DottedName.assertFromSegments(ImmArray("baz", "quux").toSeq)
    )
    val record = InterfaceType.Normal(DefDataType(ImmArraySeq(), Record(ImmArraySeq())))

    val interface = new Interface(PackageId.assertFromString("pkgid"), Map(bazQuux -> record))
    val tree = InterfaceTree.fromInterface(interface)

    val visited = tree.bfs(ArrayBuffer.empty[InterfaceType])((types, node) =>
      node match {
        case _: ModuleWithContext => types
        case TypeWithContext(_, _, _, _, tpe) => types ++= tpe.typ.toList
      })

    visited.toList shouldBe List(record)
  }
}
Example 6
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Replaces every value of `X` by its rank within its column (tied values share their
   * average rank) and delegates to `PearsonCorrelation.computeCorrelationMatrix` on the
   * resulting rank vectors.
   *
   * @param X rows of the input matrix
   * @return whatever `PearsonCorrelation.computeCorrelationMatrix` returns for the rank vectors
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      // Row ids of the current run of equal (column, value) pairs.
      val cachedUids = ArrayBuffer.empty[Long]
      // Emits the cached row ids with the average rank of their run and empties the cache.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush() // the last run is still cached when the partition is exhausted
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 7
Source File: KPLBasedKinesisTestUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

// NOTE(review): the package prefixes of the next two imports were garbled in this copy; the
// KPL and Guava packages are assumed from the imported class names — confirm.
import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

/** Test utilities that can produce records through the Kinesis Producer Library. */
private[kinesis] class KPLBasedKinesisTestUtils extends KinesisTestUtils {
  /** Returns the KPL-based generator when `aggregate` is set, the simple generator otherwise. */
  override protected def getProducer(aggregate: Boolean): KinesisDataGenerator =
    if (aggregate) new KPLDataGenerator(regionName)
    else new SimpleDataGenerator(kinesisClient)
}

/** Sends integers through a lazily created KPL producer configured for `regionName`. */
private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator {

  private lazy val producer: KPLProducer = {
    val conf = new KinesisProducerConfiguration()
      .setRecordMaxBufferedTime(1000)
      .setMaxConnections(1)
      .setRegion(regionName)
      .setMetricsLevel("none")

    new KPLProducer(conf)
  }

  /**
   * Sends every number of `data` as its decimal string (also used as the partition key) and,
   * after a synchronous flush, returns the (number, sequence number) pairs reported by the
   * success callbacks, grouped by shard id. Failed sends are ignored.
   */
  override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = {
    val seqNumbersByShard = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()

    data.foreach { num =>
      val text = num.toString
      val payload = ByteBuffer.wrap(text.getBytes(StandardCharsets.UTF_8))
      val pending = producer.addUserRecord(streamName, text, payload)

      val recordCallback = new FutureCallback[UserRecordResult]() {
        override def onFailure(t: Throwable): Unit = {} // do nothing

        override def onSuccess(result: UserRecordResult): Unit = {
          val shardId = result.getShardId
          val seqNumber = result.getSequenceNumber()
          val sent = seqNumbersByShard.getOrElseUpdate(shardId, new ArrayBuffer[(Int, String)]())
          sent += ((num, seqNumber))
        }
      }
      Futures.addCallback(pending, recordCallback)
    }
    producer.flushSync()
    seqNumbersByShard.toMap
  }
}
Example 8
Source File: Exchange.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// NOTE(review): the package name was garbled in this copy (a bare `package`); restored from the
// file name Exchange.scala and the imports below — confirm.
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
 * Replaces an exchange with a `ReusedExchangeExec` pointing at an earlier exchange when
 * `sameResult` reports that both produce the same result. Does nothing when
 * `conf.exchangeReuseEnabled` is off.
 */
case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan =
    if (!conf.exchangeReuseEnabled) {
      plan
    } else {
      // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
      val seenBySchema = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
      plan.transformUp {
        case exchange: Exchange =>
          // the exchanges that have same results usually also have same schemas (same column names).
          val candidates = seenBySchema.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
          candidates.find(e => exchange.sameResult(e)) match {
            case Some(existing) =>
              // Keep the output of this exchange, the following plans require that to resolve
              // attributes.
              ReusedExchangeExec(exchange.output, existing)
            case None =>
              candidates += exchange
              exchange
          }
      }
    }
}
Example 9
Source File: subquery.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{BooleanType, DataType, StructType}

/**
 * Points a subquery expression at the plan of an earlier subquery when `sameResult` reports
 * that both plans produce the same result. Does nothing when `conf.exchangeReuseEnabled` is off.
 */
case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan =
    if (!conf.exchangeReuseEnabled) {
      plan
    } else {
      // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls.
      val seenBySchema = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]()
      plan transformAllExpressions {
        case sub: ExecSubqueryExpression =>
          val candidates = seenBySchema.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]())
          candidates.find(_.sameResult(sub.plan)) match {
            case Some(existing) =>
              sub.withNewPlan(existing)
            case None =>
              candidates += sub.plan
              sub
          }
      }
    }
}
Example 10
Source File: ApplicationMasterArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import scala.annotation.tailrec
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.util.{IntParam, MemoryParam}

/**
 * Command-line arguments of the YARN application master. Parsing happens in the constructor;
 * an unrecognised argument prints the usage text and exits the JVM.
 */
class ApplicationMasterArguments(val args: Array[String]) {
  var userJar: String = null
  var userClass: String = null
  var primaryPyFile: String = null
  var primaryRFile: String = null
  var userArgs: Seq[String] = Nil
  var propertiesFile: String = null

  parseArgs(args.toList)

  /** Consumes `inputArgs` as "--flag value" pairs and stores the values in the fields above. */
  private def parseArgs(inputArgs: List[String]): Unit = {
    val collectedUserArgs = new ArrayBuffer[String]()

    // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0,
    // the properties with executor in their names are preferred.
    @tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil =>
      case "--jar" :: value :: tail =>
        userJar = value
        consume(tail)
      case "--class" :: value :: tail =>
        userClass = value
        consume(tail)
      case "--primary-py-file" :: value :: tail =>
        primaryPyFile = value
        consume(tail)
      case "--primary-r-file" :: value :: tail =>
        primaryRFile = value
        consume(tail)
      case "--arg" :: value :: tail =>
        collectedUserArgs += value
        consume(tail)
      case "--properties-file" :: value :: tail =>
        propertiesFile = value
        consume(tail)
      case unknown =>
        printUsageAndExit(1, unknown)
    }

    consume(inputArgs)

    if (primaryPyFile != null && primaryRFile != null) {
      // scalastyle:off println
      System.err.println("Cannot have primary-py-file and primary-r-file at the same time")
      // scalastyle:on println
      System.exit(-1)
    }

    userArgs = collectedUserArgs.toList
  }

  /**
   * Prints the usage text (preceded by the offending parameter, when given) to stderr and
   * exits the JVM with `exitCode`.
   */
  def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit = {
    // scalastyle:off println
    if (unknownParam != null) {
      System.err.println("Unknown/unsupported param " + unknownParam)
    }
    System.err.println("""
      |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options]
      |Options:
      | --jar JAR_PATH Path to your application's JAR file
      | --class CLASS_NAME Name of your application's main class
      | --primary-py-file A main Python file
      | --primary-r-file A main R file
      | --arg ARG Argument to be passed to your application's main class.
      | Multiple invocations are possible, each will be passed in order.
      | --properties-file FILE Path to a custom Spark properties file.
      """.stripMargin)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

object ApplicationMasterArguments {
  val DEFAULT_NUMBER_EXECUTORS = 2
}
Example 11
Source File: ClientArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import scala.annotation.tailrec
import scala.collection.mutable.ArrayBuffer

// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
/**
 * Command-line arguments of the YARN client. Parsing happens in the constructor.
 *
 * @throws IllegalArgumentException for an unrecognised argument, a flag without a value, or
 *                                  when both a primary Python file and a primary R file are given
 */
private[spark] class ClientArguments(args: Array[String]) {

  var userJar: String = null
  var userClass: String = null
  var primaryPyFile: String = null
  var primaryRFile: String = null
  var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()

  parseArgs(args.toList)

  /** Consumes `inputArgs` as "--flag value" pairs and stores the values in the fields above. */
  private def parseArgs(inputArgs: List[String]): Unit = {
    @tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil =>
      case "--jar" :: value :: tail =>
        userJar = value
        consume(tail)
      case "--class" :: value :: tail =>
        userClass = value
        consume(tail)
      case "--primary-py-file" :: value :: tail =>
        primaryPyFile = value
        consume(tail)
      case "--primary-r-file" :: value :: tail =>
        primaryRFile = value
        consume(tail)
      case "--arg" :: value :: tail =>
        userArgs += value
        consume(tail)
      case unknown =>
        throw new IllegalArgumentException(getUsageMessage(unknown))
    }

    consume(inputArgs)

    if (primaryPyFile != null && primaryRFile != null) {
      throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
        " at the same time")
    }
  }

  /** Builds the usage text, preceded by the offending parameters when `unknownParam` is given. */
  private def getUsageMessage(unknownParam: List[String] = null): String = {
    val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
    message +
      s"""
      |Usage: org.apache.spark.deploy.yarn.Client [options]
      |Options:
      | --jar JAR_PATH Path to your application's JAR file (required in yarn-cluster
      | mode)
      | --class CLASS_NAME Name of your application's main class (required)
      | --primary-py-file A main Python file
      | --primary-r-file A main R file
      | --arg ARG Argument to be passed to your application's main class.
      | Multiple invocations are possible, each will be passed in order.
      """.stripMargin
  }
}
Example 12
Source File: YarnClientSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.yarn.api.records.YarnApplicationState

import org.apache.spark.{SparkContext, SparkException}
import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.SparkAppHandle
import org.apache.spark.scheduler.TaskSchedulerImpl

/** Scheduler backend for YARN client mode; only the shutdown path is visible in this excerpt. */
private[spark] class YarnClientSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext)
  extends YarnSchedulerBackend(scheduler, sc)
  with Logging {

  private var client: Client = null
  private var monitorThread: MonitorThread = null

  /**
   * Stops the monitor thread (when one exists), reports FINISHED to the launcher, then stops
   * the parent backend, the credential updater and finally the YARN client, in that order.
   */
  override def stop(): Unit = {
    assert(client != null, "Attempted to stop this scheduler before starting it!")
    Option(monitorThread).foreach(_.stopMonitor())

    // Report a final state to the launcher if one is connected. This is needed since in client
    // mode this backend doesn't let the app monitor loop run to completion, so it does not report
    // the final state itself.
    //
    // Note: there's not enough information at this point to provide a better final state,
    // so assume the application was successful.
    client.reportLauncherState(SparkAppHandle.State.FINISHED)

    super.stop()
    YarnSparkHadoopUtil.get.stopCredentialUpdater()
    client.stop()
    logInfo("Stopped")
  }
}
Example 13
Source File: UnionDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

/**
 * DStream whose RDD for a batch is the union of the RDDs of all `parents` for that batch.
 * All parents must share one streaming context and one slide duration.
 *
 * NOTE(review): `parents.head.ssc` in the superclass call is evaluated before the first
 * `require`, so an empty `parents` array fails there rather than with the message below.
 */
private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to union is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  /**
   * Unions the RDDs every parent produces for `validTime`.
   *
   * @throws SparkException when any parent has no RDD for `validTime`
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.nonEmpty) {
      Some(ssc.sc.union(rdds))
    } else {
      None
    }
  }
}
Example 14
Source File: QueueInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

/**
 * Input DStream fed from an in-memory queue of RDDs.
 *
 * @param queue      source of RDDs; accessed under `queue.synchronized`
 * @param oneAtATime when true, one RDD is taken from the queue per batch; otherwise the whole
 *                   queue is drained and unioned
 * @param defaultRDD returned for a batch when nothing was taken from the queue; may be null,
 *                   in which case an empty RDD is returned instead
 */
private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start(): Unit = { }

  override def stop(): Unit = { }

  // Deserialisation is rejected outright: a queue stream cannot be restored from a checkpoint.
  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  // Serialisation writes nothing and only logs a warning.
  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        // The context argument was lost in this copy; the same context as the empty-RDD
        // fallback below is used.
        Some(new UnionRDD(ssc.sparkContext, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
Example 15
Source File: LocalSparkCluster.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 16
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 17
Source File: Schedulable.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

/**
 * Interface of a node in a tree of schedulable entities: every node has a parent `Pool` and a
 * queue of child `Schedulable`s. All members are abstract; their exact semantics are defined
 * by the implementing classes, which are not part of this excerpt.
 */
private[spark] trait Schedulable {
  // The pool this entity belongs to; mutable so that it can be (re)assigned.
  var parent: Pool
  // child queues
  def schedulableQueue: ConcurrentLinkedQueue[Schedulable]
  def schedulingMode: SchedulingMode
  def weight: Int
  def minShare: Int
  def runningTasks: Int
  def priority: Int
  def stageId: Int
  def name: String

  // Child management.
  def addSchedulable(schedulable: Schedulable): Unit
  def removeSchedulable(schedulable: Schedulable): Unit
  def getSchedulableByName(name: String): Schedulable
  // Notification that the executor `executorId` on `host` was lost for `reason`.
  def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit
  // NOTE(review): presumably returns true when speculatable tasks were found — confirm
  // against the implementations.
  def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean
  def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager]
}
Example 18
Source File: ChunkedByteBufferOutputStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package import import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import private[this] var position = chunkSize private[this] var _size = 0 private[this] var closed: Boolean = false def size: Long = _size override def close(): Unit = { if (!closed) { super.close() closed = true } } override def write(b: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") allocateNewChunkIfNeeded() chunks(lastChunkIndex).put(b.toByte) position += 1 _size += 1 } override def write(bytes: Array[Byte], off: Int, len: Int): Unit = { require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream") var written = 0 while (written < len) { allocateNewChunkIfNeeded() val thisBatch = math.min(chunkSize - position, len - written) chunks(lastChunkIndex).put(bytes, written + off, thisBatch) written += thisBatch position += thisBatch } _size += len } @inline private def allocateNewChunkIfNeeded(): Unit = { if (position == chunkSize) { chunks += allocator(chunkSize) lastChunkIndex += 1 position = 0 } } def toChunkedByteBuffer: ChunkedByteBuffer = { require(closed, "cannot call toChunkedByteBuffer() unless close() has been called") require(!toChunkedByteBufferWasCalled, "toChunkedByteBuffer() can only be called once") toChunkedByteBufferWasCalled = true if (lastChunkIndex == -1) { new ChunkedByteBuffer(Array.empty[ByteBuffer]) } else { // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk. // An alternative would have been returning an array of ByteBuffers, with the last buffer // bounded to only the last chunk's position. However, given our use case in Spark (to put // the chunks in block manager), only limiting the view bound of the buffer would still // require the block manager to store the whole chunk. 
val ret = new Array[ByteBuffer](chunks.size) for (i <- 0 until chunks.size - 1) { ret(i) = chunks(i) ret(i).flip() } if (position == chunkSize) { ret(lastChunkIndex) = chunks(lastChunkIndex) ret(lastChunkIndex).flip() } else { ret(lastChunkIndex) = allocator(position) chunks(lastChunkIndex).flip() ret(lastChunkIndex).put(chunks(lastChunkIndex)) ret(lastChunkIndex).flip() StorageUtils.dispose(chunks(lastChunkIndex)) } new ChunkedByteBuffer(ret) } } }
Example 19
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( @transient var rdd1: RDD[_ <: Product2[K, V]], @transient var rdd2: RDD[_ <: Product2[K, W]], part: Partitioner) extends RDD[(K, V)](rdd1.context, Nil) { override def getDependencies: Seq[Dependency[_]] = { def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency[T1, T2, Any](rdd, part) } } Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.length) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2) { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => None case _ => Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case 
oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split oneToOneDependency.rdd.iterator(dependencyPartition, context) .asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case shuffleDependency: ShuffleDependency[_, _, _] => val iter = SparkEnv.get.shuffleManager .getReader( shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } } // the first dep is rdd1; add all values to the map integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) =>, _))).flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
Example 20
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition]( var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new 
ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 21
Source File: TaskContextImpl.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.util._ private[spark] class TaskContextImpl( val stageId: Int, val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, var _taskMemoryManager: TaskMemoryManager, localProperties: Properties, @transient private val metricsSystem: MetricsSystem, // The default value is only used in tests. override val taskMetrics: TaskMetrics = TaskMetrics.empty, var batchId: Int = 0) extends TaskContext with Logging { private[spark] def markInterrupted(): Unit = { interrupted = true } override def isCompleted(): Boolean = completed override def isRunningLocally(): Boolean = false override def isInterrupted(): Boolean = interrupted override def getLocalProperty(key: String): String = localProperties.getProperty(key) override def getMetricsSources(sourceName: String): Seq[Source] = metricsSystem.getSourcesByName(sourceName) private[spark] override def registerAccumulator(a: AccumulatorV2[_, _]): Unit = { taskMetrics.registerAccumulator(a) } }
Example 22
Source File: TimeStampedHashMapSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import org.apache.spark.SparkFunSuite

/**
 * Suite exercising `TimeStampedHashMap`: generic map behaviour (through the `testMap` helper),
 * clearing of entries by timestamp, and concurrent access from many threads.
 */
class TimeStampedHashMapSuite extends SparkFunSuite {

  // Test the testMap function - a Scala HashMap should obviously pass
  testMap(new mutable.HashMap[String, String]())

  // Test TimeStampedHashMap basic functionality
  testMap(new TimeStampedHashMap[String, String]())
  testMapThreadSafety(new TimeStampedHashMap[String, String]())

  test("TimeStampedHashMap - clearing by timestamp") {
    // clearing by insertion time
    val map = new TimeStampedHashMap[String, String](updateTimeStampOnGet = false)
    map("k1") = "v1"
    assert(map("k1") === "v1")
    Thread.sleep(10)
    val threshTime = System.currentTimeMillis
    assert(map.getTimestamp("k1").isDefined)
    assert(map.getTimestamp("k1").get < threshTime)
    map.clearOldValues(threshTime)
    assert(map.get("k1") === None)

    // clearing by modification time
    val map1 = new TimeStampedHashMap[String, String](updateTimeStampOnGet = true)
    map1("k1") = "v1"
    map1("k2") = "v2"
    assert(map1("k1") === "v1")
    Thread.sleep(10)
    val threshTime1 = System.currentTimeMillis
    Thread.sleep(10)
    assert(map1("k2") === "v2") // access k2 to update its access time to > threshTime
    assert(map1.getTimestamp("k1").isDefined)
    assert(map1.getTimestamp("k1").get < threshTime1)
    assert(map1.getTimestamp("k2").isDefined)
    assert(map1.getTimestamp("k2").get >= threshTime1)
    map1.clearOldValues(threshTime1) // should only clear k1
    assert(map1.get("k1") === None)
    assert(map1.get("k2").isDefined)
  }

  /**
   * Registers a test that hammers one map instance from 25 threads, each performing 1000
   * randomly chosen put / get / remove operations, and asserts that no thread threw.
   *
   * @param hashMapConstructor by-name factory; evaluated once to derive the test name and once
   *                           more to create the map shared by all threads
   */
  def testMapThreadSafety(hashMapConstructor: => mutable.Map[String, String]): Unit = {
    def newMap() = hashMapConstructor
    val name = newMap().getClass.getSimpleName
    val testMap = newMap()
    // Written by worker threads, read by the test thread after join().
    @volatile var error = false

    /** Picks a random key currently present in `m`, or None when `m` has no keys. */
    def getRandomKey(m: mutable.Map[String, String]): Option[String] = {
      // Use the map that was passed in rather than silently reading the captured `testMap`.
      val keys = m.keysIterator.toSeq
      if (keys.nonEmpty) {
        Some(keys(Random.nextInt(keys.size)))
      } else {
        None
      }
    }

    val threads = (1 to 25).map(i => new Thread() {
      override def run(): Unit = {
        try {
          for (j <- 1 to 1000) {
            Random.nextInt(3) match {
              case 0 =>
                testMap(Random.nextString(10)) = Random.nextDouble().toString // put
              case 1 =>
                getRandomKey(testMap).foreach(testMap.get) // get
              case 2 =>
                getRandomKey(testMap).foreach(testMap.remove) // remove
            }
          }
        } catch {
          // Deliberately broad: any failure in a worker must fail the test, so it is recorded
          // in the flag and then rethrown unchanged.
          case t: Throwable =>
            error = true
            throw t
        }
      }
    })

    test(name + " - threading safety test") {
      threads.foreach(_.start())
      threads.foreach(_.join())
      assert(!error)
    }
  }
}
Example 23
Source File: Predict.scala From BigDL with Apache License 2.0 | 5 votes |
package import{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample} import import import import import org.apache.log4j.{Level, Logger} import scala.collection.mutable.ArrayBuffer object Predict { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).foreach { param => System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", (param.coreNumber.toString)) Engine.init val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val rawData = load(validationData, validationLabel) val iter = rawData.iterator val sampleIter = GreyImgToSample()( GreyImgNormalizer(trainMean, trainStd)( BytesToGreyImg(28, 28)(iter))) var samplesBuffer = ArrayBuffer[Sample[Float]]() while (sampleIter.hasNext) { val elem = samplesBuffer += elem } val samples = samplesBuffer.toArray val model = Module.load[Float](param.model) val localPredictor = LocalPredictor(model) val result = localPredictor.predict(samples) val result_class = localPredictor.predictClass(samples) result_class.foreach(r => println(s"${r}")) } } }
Example 24
Source File: BatchSampler.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{BboxUtil, BoundingBox} import import scala.collection.mutable.ArrayBuffer def generateBatchSamples(label: RoiLabel, batchSamplers: Array[BatchSampler], sampledBoxes: ArrayBuffer[BoundingBox]): Unit = { sampledBoxes.clear() var i = 0 val unitBox = BoundingBox(0, 0, 1, 1) while (i < batchSamplers.length) { batchSamplers(i).sample(unitBox, label, sampledBoxes) i += 1 } } }
Example 25
Source File: RandomSampler.scala From BigDL with Apache License 2.0 | 5 votes |
package import{FeatureTransformer, ImageFeature} import import{BoundingBox} import import org.opencv.core.Mat import scala.collection.mutable.ArrayBuffer

/**
 * Crop transformer whose region of interest is chosen at random among the boxes proposed by a
 * fixed set of [[BatchSampler]]s.
 */
class RandomSampler extends Crop {
  // random cropping samplers
  val batchSamplers = {
    // One sampler limited to a single trial, with every other setting left at its default.
    val singleTrial = new BatchSampler(maxTrials = 1)
    // Same scale / aspect-ratio limits, increasing minimum-overlap requirement.
    val withMinOverlap = Array(0.1, 0.3, 0.5, 0.7, 0.9).map { overlap =>
      new BatchSampler(minScale = 0.3, minAspectRatio = 0.5, maxAspectRatio = 2,
        minOverlap = Some(overlap))
    }
    // Same limits again, but bounded by a maximum overlap instead.
    val withMaxOverlap = new BatchSampler(minScale = 0.3, minAspectRatio = 0.5,
      maxAspectRatio = 2, maxOverlap = Some(1.0))
    (singleTrial +: withMinOverlap) :+ withMaxOverlap
  }

  /**
   * Samples candidate boxes for the feature's RoiLabel and returns one of them at random.
   *
   * @param feature image feature; its `ImageFeature.label` entry is cast to RoiLabel
   * @return a randomly chosen sampled box, or `BoundingBox(0, 0, 1, 1)` when the samplers
   *         produced no candidate
   */
  def generateRoi(feature: ImageFeature): BoundingBox = {
    val label = feature(ImageFeature.label).asInstanceOf[RoiLabel]
    val candidates = new ArrayBuffer[BoundingBox]()
    BatchSampler.generateBatchSamples(label, batchSamplers, candidates)
    if (candidates.isEmpty) {
      BoundingBox(0, 0, 1, 1)
    } else {
      // Randomly pick one sampled bbox.
      // NOTE(review): assumes RNG.uniform(0, 1) never returns exactly 1 - confirm.
      val picked = (RNG.uniform(0, 1) * candidates.length).toInt
      candidates(picked)
    }
  }
}

object RandomSampler {
  /** Builds the sampler chained with a default `RoiProject()`. */
  def apply(): FeatureTransformer = {
    val sampler = new RandomSampler()
    sampler -> RoiProject()
  }
}
Example 26
Source File: RoiTransformer.scala From BigDL with Apache License 2.0 | 5 votes |
package import{BboxUtil, BoundingBox} import{FeatureTransformer, ImageFeature} import scala.collection.mutable.ArrayBuffer

/**
 * Projects the ground-truth boxes of a feature's RoiLabel onto the feature's bounding box and
 * rewrites the label tensors with the boxes that survive the projection.
 *
 * @param needMeetCenterConstraint when true, a ground-truth box is only considered if
 *                                 `meetEmitCenterConstraint` accepts it for the image boundary
 */
case class RoiProject(needMeetCenterConstraint: Boolean = true) extends FeatureTransformer {
  // Scratch buffer reused across calls; cleared at the start of every transformMat.
  val transformedAnnot = new ArrayBuffer[BoundingBox]()

  override def transformMat(feature: ImageFeature): Unit = {
    val boundary = feature[BoundingBox](ImageFeature.boundingBox)
    if (!boundary.normalized) {
      // Scale in place by the reciprocal of the image height / width.
      boundary.scaleBox(1.0f / feature.getHeight(), 1f / feature.getWidth(), boundary)
    }
    val target = feature[RoiLabel](ImageFeature.label)
    transformedAnnot.clear()

    // Transform the annotation according to bounding box (tensor indices start at 1).
    for (row <- 1 to target.size()) {
      val groundTruth = BoundingBox(
        target.bboxes.valueAt(row, 1),
        target.bboxes.valueAt(row, 2),
        target.bboxes.valueAt(row, 3),
        target.bboxes.valueAt(row, 4))
      if (!needMeetCenterConstraint || boundary.meetEmitCenterConstraint(groundTruth)) {
        val projected = new BoundingBox()
        if (boundary.projectBbox(groundTruth, projected)) {
          // classes: row 1 carries the label, row 2 the difficult flag.
          projected.setLabel(target.classes.valueAt(1, row))
          projected.setDifficult(target.classes.valueAt(2, row))
          transformedAnnot += projected
        }
      }
    }

    // write the transformed annotation back to target
    target.bboxes.resize(transformedAnnot.length, 4)
    target.classes.resize(2, transformedAnnot.length)
    transformedAnnot.zipWithIndex.foreach { case (box, idx) =>
      val row = idx + 1
      target.bboxes.setValue(row, 1, box.x1)
      target.bboxes.setValue(row, 2, box.y1)
      target.bboxes.setValue(row, 3, box.x2)
      target.bboxes.setValue(row, 4, box.y2)
      target.classes.setValue(1, row, box.label)
      target.classes.setValue(2, row, box.difficult)
    }
  }
}
Example 27
Source File: Mean.scala From BigDL with Apache License 2.0 | 5 votes |
package import java.nio.ByteOrder import import{AbstractModule, Activity} import import import import import import org.tensorflow.framework.{DataType, NodeDef} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/**
 * TensorFlow ops loader for `Mean` nodes: reads the element type (`T` attribute) and the
 * `keep_dims` flag from the node and defers construction to [[MeanLoadTF]].
 */
class Mean extends TensorflowOpsLoader {

  import Utils._

  /**
   * @throws UnsupportedOperationException when the node's `T` attribute is not one of the
   *                                       integer, float or double types handled below
   */
  override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder
    , context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = {
    val attr = nodeDef.getAttrMap
    val dataType = getType(attr, "T")
    // keep_dims = true means the reduced dimensions are retained, i.e. no squeeze.
    val squeeze = !getBoolean(attr, "keep_dims")
    val dt = dataType match {
      case DataType.DT_INT8 | DataType.DT_INT16 | DataType.DT_UINT8 | DataType.DT_UINT16 |
           DataType.DT_INT32 => "Int"
      case DataType.DT_INT64 => "Long"
      case DataType.DT_FLOAT => "Float"
      case DataType.DT_DOUBLE => "Double"
      case _ =>
        throw new UnsupportedOperationException(s"Data Type: $dataType is not supported yet.")
    }
    new MeanLoadTF[T](dt, squeeze)
  }
}

/**
 * Builds a Sequential of Mean layers, one per reduction dimension.
 *
 * @param dataType element type name: "Int", "Long", "Float" or "Double"
 * @param squeeze  passed through to every Mean layer as its `squeeze` argument
 */
class MeanLoadTF[T: ClassTag](val dataType: String,
  val squeeze: Boolean)(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) {

  /**
   * @param tensorArrays element 0 is read as a 1-D Int tensor holding the reduction dimensions
   * @throws UnsupportedOperationException when `dataType` is not one of the four names above
   */
  override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = {
    val dims = tensorArrays(0).asInstanceOf[Tensor[Int]]
    // Each stored index is shifted by one.
    // NOTE(review): presumably converts TensorFlow's zero-based axes to one-based dims - confirm.
    val reduceDims = (1 to dims.size(1)).map(i => dims.valueAt(i) + 1)
    val mean = Sequential[T]()
    dataType match {
      case "Int" =>
        reduceDims.foreach(d => mean.add(Mean[T, Int](d, squeeze = squeeze)))
      case "Long" =>
        reduceDims.foreach(d => mean.add(Mean[T, Long](d, squeeze = squeeze)))
      case "Float" =>
        reduceDims.foreach(d => mean.add(Mean[T, Float](d, squeeze = squeeze)))
      case "Double" =>
        reduceDims.foreach(d => mean.add(Mean[T, Double](d, squeeze = squeeze)))
      case _ =>
        throw new UnsupportedOperationException(s"Data Type: $dataType is not supported yet.")
    }
    mean
  }
}
Example 28
Source File: Transpose.scala From BigDL with Apache License 2.0 | 5 votes |
package import java.nio.ByteOrder import import{AbstractModule, Activity} import{Contiguous, Sequential, Transpose => TransposeLayer} import import import import org.tensorflow.framework.NodeDef import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Transpose extends TensorflowOpsLoader { import Utils._ override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder , context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = { new TransposeLoadTF[T]() } } object TransposeLoadTF { def permToPair(perm: Array[Int]): Array[(Int, Int)] = { val numToRank = perm.zipWithIndex.toMap val arr = perm.indices.toArray val pairs = ArrayBuffer[(Int, Int)]() def sort(arr: Array[Int], low: Int, high: Int): Unit = { var i = low var j = high val pivot = arr(low + (high - low)/2) while (i <= j) { while (arr(i) < pivot) i += 1 while (arr(j) > pivot) j -= 1 if (i <= j) { exchangeNumbers(arr, i, j) i += 1 j -= 1 } } if (low < j) sort(arr, low, j) if (i < high) sort(arr, i, high) } def exchangeNumbers(arr: Array[Int], i: Int, j: Int): Unit = { val temp = arr(i) arr(i) = arr(j) arr(j) = temp pairs += ((i, j)) } sort(, 0, arr.length-1) pairs.filter(pair => pair._1 != pair._2).toArray } } class TransposeLoadTF[T: ClassTag]()(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) { import TransposeLoadTF._ override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = { val perm = tensorArrays(0).asInstanceOf[Tensor[Int]].storage().array() val paris = permToPair(perm) val layer = Sequential() layer.add(TransposeLayer[T]( => (x._1 + 1, x._2 + 1)))) layer.add(Contiguous()) layer } }
Example 29
package import java.nio.ByteOrder import import{AbstractModule, Activity} import{Padding, Sequential} import import import{Context, TFUtils} import org.tensorflow.framework.NodeDef import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/** TensorFlow ops loader for `Pad` nodes; all construction is deferred to [[PadLoadTF]]. */
class Pad extends TensorflowOpsLoader {

  import Utils._

  override def build[T: ClassTag](nodeDef: NodeDef, byteOrder: ByteOrder,
    context: Context[T])(implicit ev: TensorNumeric[T]): Module[T] = {
    new PadLoadTF[T]()
  }
}

/** Builds a Sequential of Padding layers from a paddings tensor. */
class PadLoadTF[T: ClassTag]()(implicit ev: TensorNumeric[T]) extends Adapter[T](Array(2)) {

  /**
   * @param tensorArrays element 0 is read as a 2-D Int tensor with one row per dimension;
   *                     column 1 and column 2 hold the two pad amounts for that dimension
   * @return a Sequential with one Padding layer per non-zero pad amount, in row order
   */
  override def build(tensorArrays: Array[Tensor[_]]): AbstractModule[Activity, Activity, T] = {
    val paddings = tensorArrays(0).asInstanceOf[Tensor[Int]]
    val padding = Sequential[T]()

    for (dim <- 1 to paddings.size(1)) {
      val first = paddings.valueAt(dim, 1)
      val second = paddings.valueAt(dim, 2)
      // Column 1 is passed negated, column 2 as is.
      // NOTE(review): presumably a negative pad means "pad in front" for Padding - confirm.
      // NOTE(review): the third argument is hard-coded to 4 - confirm it suits every input.
      if (first != 0) {
        padding.add(Padding[T](dim, -first, 4))
      }
      if (second != 0) {
        padding.add(Padding[T](dim, second, 4))
      }
    }

    padding
  }
}
Example 30
Source File: IRConverter.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{AbstractModule, Activity} import import{FloatType, Tensor} import import{Module, utils} import{Engine, MklBlas, MklDnn, Node} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag private[bigdl] class IRConverter[T: ClassTag](IRgraph: IRGraph[T])(implicit ev: TensorNumeric[T]) { private val allNodes = new ArrayBuffer[Node[IRElement[T]]] private val irInputs = IRgraph.inputs.toArray private val irOutputs = IRgraph.outputs.toArray init() private def init() : Unit = { getNodes(irInputs, allNodes) // reminder: some output nodes may not be searched from inputs irOutputs.foreach(node => { if (!allNodes.contains(node)) allNodes.append(node) }) } private def getNodes(inputs: Seq[Node[IRElement[T]]], nodesBuffer: ArrayBuffer[Node[IRElement[T]]]): Unit = { if (inputs.length == 0) return inputs.foreach(node => { if (!nodesBuffer.contains(node)) { nodesBuffer.append(node) getNodes(node.nextNodes, nodesBuffer) } }) } def toGraph() : Graph[T] = { if (utils.Engine.getEngineType() == MklBlas) { require(IRToBlas[T].convertingCheck(allNodes.toArray), "IR graph can not be converted to Blas layer") toBlasGraph() } else if (utils.Engine.getEngineType() == MklDnn) { require(ev.getType() == FloatType, "Mkldnn engine only supports float data") require(IRToDnn[Float].convertingCheck( allNodes.toArray.asInstanceOf[Array[Node[IRElement[Float]]]]), "IR graph can not be converted to Dnn layer") toDnnGraph() } else throw new UnsupportedOperationException( s"Only support engineType mkldnn/mklblas, but get ${Engine.getEngineType()}") } private def toDnnGraph(): Graph[T] = { val nodeMap = IRToDnn[Float].convert( allNodes.toArray.asInstanceOf[Array[Node[IRElement[Float]]]]) val inputs = n => nodeMap.get(n.asInstanceOf[Node[IRElement[Float]]]).get) val outputs = n => nodeMap.get(n.asInstanceOf[Node[IRElement[Float]]]).get) // add input node for dnn graph val realInputs = => { val node = new Node[Module[Float]](new InputWrapper()) n.from(node) node }) // add 
output node for graph val realOutputs = { case (model: Node[Module[Float]], index: Int) => val node = if (model.element.isInstanceOf[BlasWrapper]) { model } else { model.add(new Node[Module[Float]](Output(IRgraph.outputFormats(index)))) } node } DnnGraph(realInputs, realOutputs, IRgraph.variables.asInstanceOf[Option[(Array[Tensor[Float]], Array[Tensor[Float]])]], IRgraph.generateBackward).asInstanceOf[Graph[T]] } private def toBlasGraph(): Graph[T] = { val nodeMap = IRToBlas[T].convert(allNodes.toArray) val inputs = => nodeMap.get(n).get) val outputs = => nodeMap.get(n).get) Graph.dynamic(inputs, outputs, IRgraph.variables, IRgraph.generateBackward) } }
Example 31
Source File: FileReader.scala From BigDL with Apache License 2.0 | 5 votes |
package import{BufferedInputStream} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.tensorflow.util.Event import scala.collection.mutable.ArrayBuffer import scala.util.matching.Regex private[bigdl] object FileReader { val fileNameRegex = """bigdl.tfevents.*""".r def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = { require(fs.isFile(file), s"FileReader: ${file} should be a file") val bis = new BufferedInputStream( val longBuffer = new Array[Byte](8) val crcBuffer = new Array[Byte](4) val bf = new ArrayBuffer[(Long, Float, Double)] while ( > 0) { val l = ByteBuffer.wrap(longBuffer.reverse).getLong() // TODO: checksum // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt() val eventBuffer = new Array[Byte](l.toInt) val e = Event.parseFrom(eventBuffer) if (e.getSummary.getValueCount == 1 && tag.equals(e.getSummary.getValue(0).getTag())) { bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime)) } // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt() } bis.close() bf.toArray.sortWith(_._1 < _._1) } }
Example 32
Source File: Permute.scala From BigDL with Apache License 2.0 | 5 votes |
package import import import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Permute[T: ClassTag]( val dims: Array[Int], val inputShape: Shape = null)(implicit ev: TensorNumeric[T]) extends KerasLayer[Tensor[T], Tensor[T], T](KerasLayer.addBatch(inputShape)) { private def permToPair(perm: Array[Int]): Array[(Int, Int)] = { val numToRank = perm.zipWithIndex.toMap val arr = perm.indices.toArray val pairs = ArrayBuffer[(Int, Int)]() def sort(arr: Array[Int], low: Int, high: Int): Unit = { var i = low var j = high val pivot = arr(low + (high - low)/2) while (i <= j) { while (arr(i) < pivot) i += 1 while (arr(j) > pivot) j -= 1 if (i <= j) { exchangeNumbers(arr, i, j) i += 1 j -= 1 } } if (low < j) sort(arr, low, j) if (i < high) sort(arr, i, high) } def exchangeNumbers(arr: Array[Int], i: Int, j: Int): Unit = { val temp = arr(i) arr(i) = arr(j) arr(j) = temp pairs += ((i, j)) } sort(, 0, arr.length-1) pairs.filter(pair => pair._1 != pair._2).toArray } override def computeOutputShape(inputShape: Shape): Shape = { val input = inputShape.toSingle().toArray val outputShape = input.clone() var i = 0 while (i < dims.length) { outputShape(i + 1) = input(dims(i)) i += 1 } Shape(outputShape) } override def doBuild(inputShape: Shape): AbstractModule[Tensor[T], Tensor[T], T] = { val swaps = permToPair( => x - 1)).map(pair => (pair._1 + 2, pair._2 + 2)) val layer = Transpose(swaps) layer.asInstanceOf[AbstractModule[Tensor[T], Tensor[T], T]] } } object Permute { def apply[@specialized(Float, Double) T: ClassTag]( dims: Array[Int], inputShape: Shape = null)(implicit ev: TensorNumeric[T]): Permute[T] = { new Permute[T](dims, inputShape) } }
Example 33
Source File: FrameManager.scala From BigDL with Apache License 2.0 | 5 votes |
package import java.util.concurrent.atomic.AtomicInteger import import{Exit, MergeOps, NextIteration} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class Frame[T] private[FrameManager] ( val name: String, val parent: Option[Frame[T]] ) { // Sync all next iteration nodes execution private[bigdl] var barrier: AtomicInteger = new AtomicInteger(0) // User can use NextIteration to sync execution. This is a list of those type of nodes private[bigdl] val waitingNodes: ArrayBuffer[ModuleNode[T]] = new ArrayBuffer[ModuleNode[T]]() // Nodes should be refreshed in a iteration of the frame private[bigdl] val nodes: ArrayBuffer[ModuleNode[T]] = new ArrayBuffer[ModuleNode[T]]() } }
Example 34
Source File: TimeDistributedCriterion.scala From BigDL with Apache License 2.0 | 5 votes |
package import import import import import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future import scala.reflect.ClassTag require(input.size(dimension) == target.size(dimension), s"target should have as many elements as input, " + s"input ${input.size(dimension)}, target ${target.size(dimension)}") gradInput.resizeAs(input).zero() val nstep = input.size(dimension) var i = 0 while (i < nstep) { val _i = i + 1 results(i) = Engine.model.invoke(() => { fInput =, _i) fTarget =, _i) _gradInput =, _i) _gradInput.copy(cells(_i - 1).updateGradInput(fInput, fTarget).toTensor[T]) if (sizeAverage) { _gradInput = _gradInput.div(ev.fromType[Int](nstep)) } }) i += 1 } Engine.model.sync(results) gradInput } override def canEqual(other: Any): Boolean = other.isInstanceOf[TimeDistributedCriterion[T]] } object TimeDistributedCriterion { def apply[@specialized(Float, Double) T: ClassTag]( critrn: TensorCriterion[T] = null, sizeAverage: Boolean = false, dimension: Int = 2) (implicit ev: TensorNumeric[T]) : TimeDistributedCriterion[T] = { new TimeDistributedCriterion[T](critrn, sizeAverage, dimension) } }
Example 35
Source File: ExpandSize.scala From BigDL with Apache License 2.0 | 5 votes |
package import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class ExpandSize[T: ClassTag](targetSizes: Array[Int]) (implicit ev: TensorNumeric[T]) extends AbstractModule[Tensor[T], Tensor[T], T] { override def updateOutput(input: Tensor[T]): Tensor[T] = { require(targetSizes.length == input.dim(), s"the number of dimensions provided must equal ${input.dim()}") val tensorDim = input.dim() val tensorStride = input.stride() val tensorSize = input.size() var i = 0 while (i < tensorDim) { if (targetSizes(i) != -1) { if (tensorSize(i) == 1) { tensorSize(i) = targetSizes(i) tensorStride(i) = 0 } else if (tensorSize(i) != targetSizes(i)) { throw new UnsupportedOperationException( "incorrect size: only supporting singleton expansion (size=1)") } } i += 1 } output.set(, input.storageOffset(), tensorSize, tensorStride) output } override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = { val tensorDim = input.dim() val tensorSize = input.size() gradInput = Tensor[T](tensorSize) val expandDim = new ArrayBuffer[Int]() var i = 0 while (i < tensorDim) { if (targetSizes(i) != -1) { if (tensorSize(i) == 1 && targetSizes(i) != 1) { expandDim.append(i + 1) } } i += 1 } i = expandDim.size - 1 val sizes = gradOutput.size() var _gradOutput = gradOutput while (i >= 0) { var start = 1 sizes(expandDim(i) - 1) = 1 val _gradInput = Tensor[T](sizes) while (start <= gradOutput.size(expandDim(i))) { val x = _gradOutput.narrow(expandDim(i), start, 1) _gradInput.add(x) start += 1 } _gradOutput = _gradInput i -= 1 } gradInput = _gradOutput gradInput } override def toString: String = s"ExpandSize" } object ExpandSize { def apply[@specialized(Float, Double) T: ClassTag](targetSizes: Array[Int]) (implicit ev: TensorNumeric[T]) : ExpandSize[T] = { new ExpandSize[T](targetSizes) } }
Example 36
Source File: Utils.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{AbstractModule, Activity, TensorModule} import import{Cell, Container, Graph, Input, TimeDistributed, Linear => NNLinear, SpatialConvolution => NNConv, SpatialDilatedConvolution => NNDilatedConv} import{QuantizedTensor, Tensor} import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object Utils { type ModuleNode[R] = AbstractModule[Activity, Activity, R] type SeqNodes[R] = Seq[Node[ModuleNode[R]]] type ArrayNodes[R] = Array[Node[ModuleNode[R]]] type ANode[R] = Node[ModuleNode[R]] type AbsModule[R] = AbstractModule[Activity, Activity, R] def reorganizeParameters[T: ClassTag](parameters: Array[Tensor[T]])( implicit ev: TensorNumeric[T]): Tensor[T] = { var length = 0 for (i <- parameters.indices) { if (!parameters(i).isInstanceOf[QuantizedTensor[T]]) { length += parameters(i).nElement() } } val result = Tensor[T](length) var offset = 0 for (i <- parameters.indices) { val parameter = parameters(i) if (!parameter.isInstanceOf[QuantizedTensor[T]]) { val length = parameter.nElement() val (src, srcOffset) = (, parameter.storageOffset() - 1) val (dst, dstOffset) = (, offset) val (size, stride) = (parameter.size(), parameter.stride()) System.arraycopy(src, srcOffset, dst, dstOffset, length) parameter.set(, offset + 1, size, stride) offset += length } } result } }
Example 37
package import import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/**
 * Logical-OR reduction of a boolean tensor over selected dimensions.
 *
 * The input Table holds the boolean data tensor at key 1 and an Int tensor naming the
 * dimensions to reduce (a scalar or a 1D tensor) at key 2.
 *
 * @param keepDim       when true, reduced dimensions stay in the output with size 1;
 *                      when false they are removed from the output shape
 * @param startFromZero when true, non-negative dimension indices are shifted by one before
 *                      being used as 1-based dimensions; negative indices always count back
 *                      from the last dimension
 */
class Any[T: ClassTag](keepDim : Boolean = false, startFromZero : Boolean = false)
  (implicit ev: TensorNumeric[T]) extends Operation[Table, Tensor[Boolean], T] {

  output = Tensor[Boolean]()

  // Holds the running result between successive single-dimension reductions.
  private var buffer = Tensor[Boolean]()

  /** Converts a user-supplied dimension index into the 1-based index used by `reduce`. */
  private def toOneBasedDim(raw: Int, nDim: Int): Int =
    if (raw < 0) nDim + raw + 1
    else if (startFromZero) raw + 1
    else raw

  override def updateOutput(input: Table): Tensor[Boolean] = {
    val data = input[Tensor[Boolean]](1)
    val indices = input[Tensor[Int]](2)
    require(indices.nDimension() == 1 || indices.isScalar,
      "indices must be 1D tensor or scalar")

    output.resizeAs(data)
    buffer.resizeAs(data).copy(data)

    // Scalar and 1D index tensors are handled by the same loop.
    val requested: Seq[Int] =
      if (indices.isScalar) Seq(indices.value())
      else (1 to indices.size(1)).map(i => indices.valueAt(i))

    val reduceDims = new ArrayBuffer[Int]()
    val size = output.size()
    var reduced = false
    requested.foreach { raw =>
      val dim = toOneBasedDim(raw, data.nDimension())
      // Record every requested dimension, including those already of size 1, so that it is
      // dropped from the output shape below when keepDim is false.
      if (!reduceDims.contains(dim)) reduceDims += dim
      if (size(dim - 1) != 1) {
        size(dim - 1) = 1
        output.resize(size)
        buffer.reduce(dim, output, (a, b) => a || b)
        buffer.resizeAs(output).copy(output)
        reduced = true
      }
    }

    // When every requested dimension already had size 1 nothing was written to `output`;
    // the reduction is then the identity, so the data is copied through.
    if (!reduced) output.copy(data)

    if (!keepDim) {
      val keptSizes = (1 to data.nDimension())
        .filter(d => !reduceDims.contains(d))
        .map(d => data.size(d))
      output.resize(keptSizes.toArray)
    }
    output
  }

  override def clearState(): this.type = {
    super.clearState()
    buffer.set()
    this
  }
}

object Any {
  def apply[T: ClassTag](keepDim: Boolean = false, startFromZero : Boolean = false)
    (implicit ev: TensorNumeric[T]): Any[T] = new Any[T](keepDim, startFromZero)
}
Example 38
Source File: CategoricalColVocaList.scala From BigDL with Apache License 2.0 | 5 votes |
package import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/**
 * Maps delimiter-separated string features to integer ids from a fixed vocabulary and emits
 * them as a sparse Int tensor of shape (rows, cols).
 *
 * cols is vocaLen + numOovBuckets when numOovBuckets is non-zero, vocaLen + 1 when a default
 * id is enabled, and vocaLen otherwise.
 *
 * @param vocaList      vocabulary; must be non-empty and free of duplicates
 * @param strDelimiter  passed to String.split to break each row into tokens
 * @param isSetDefault  when true, unknown tokens map to the id vocaList.length
 * @param numOovBuckets when non-zero, unknown tokens are hashed into this many extra ids
 */
class CategoricalColVocaList[T: ClassTag](
  val vocaList: Array[String],
  val strDelimiter: String = ",",
  val isSetDefault: Boolean = false,
  val numOovBuckets: Int = 0
) (implicit ev: TensorNumeric[T])
  extends Operation[Tensor[String], Tensor[Int], T]{

  private val vocaLen = vocaList.length
  private val vocaMap = vocaList.zipWithIndex.toMap

  require(numOovBuckets >= 0,
    "numOovBuckets is a negative integer")
  require(!(isSetDefault && numOovBuckets != 0),
    "defaultValue and numOovBuckets are both specified")
  require(vocaLen > 0,
    "the vocabulary list is empty")
  require(vocaLen == vocaMap.size,
    "the vocabulary list contains duplicate keys")

  output = Tensor[Int]()

  override def updateOutput(input: Tensor[String]): Tensor[Int] = {
    input.squeeze()
    val rows = input.size(dim = 1)
    val cols =
      if (numOovBuckets != 0) vocaLen + numOovBuckets
      else if (isSetDefault) vocaLen + 1
      else vocaLen
    val shape = Array(rows, cols)

    val rowIdx = new ArrayBuffer[Int]()
    val colIdx = new ArrayBuffer[Int]()
    val ids = new ArrayBuffer[Int]()

    // With neither a default id nor OOV buckets, unknown tokens are dropped before indexing.
    val dropUnknown = !isSetDefault && numOovBuckets == 0

    for (row <- 1 to rows) {
      val tokens = input.valueAt(row).split(strDelimiter)
      val kept = if (dropUnknown) tokens.filter(t => vocaMap.contains(t)) else tokens
      for (col <- kept.indices) {
        val token = kept(col)
        val id =
          if (numOovBuckets == 0) {
            vocaMap.getOrElse(token, vocaMap.size)
          } else {
            vocaMap.getOrElse(token,
              HashFunc.stringHashBucket32(token, numOovBuckets) + vocaLen)
          }
        rowIdx += row - 1
        colIdx += col
        ids += id
      }
    }

    val indices = Array(rowIdx.toArray, colIdx.toArray)
    output = Tensor.sparse(indices, ids.toArray, shape)
    output
  }
}

object CategoricalColVocaList {
  def apply[T: ClassTag](
    vocaList: Array[String],
    strDelimiter: String = ",",
    isSetDefault: Boolean = false,
    numOovBuckets: Int = 0
  ) (implicit ev: TensorNumeric[T]): CategoricalColVocaList[T] =
    new CategoricalColVocaList[T](
      vocaList = vocaList,
      strDelimiter = strDelimiter,
      isSetDefault = isSetDefault,
      numOovBuckets = numOovBuckets
    )
}
Example 39
Source File: CategoricalColHashBucket.scala From BigDL with Apache License 2.0 | 5 votes |
package import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.util.hashing.MurmurHash3

/**
 * Hashes delimiter-separated string features into `hashBucketSize` integer buckets.
 *
 * Each input row is split with `strDelimiter`; every token is hashed with MurmurHash3 and
 * reduced to a bucket id in [0, hashBucketSize). The result has shape
 * (rows, longest token count over all rows).
 *
 * @param hashBucketSize number of buckets; must be positive
 * @param strDelimiter   passed to String.split to break each row into tokens
 * @param isSparse       when true the sparse tensor is returned, otherwise its dense conversion
 */
class CategoricalColHashBucket[T: ClassTag](
  val hashBucketSize: Int,
  val strDelimiter: String = ",",
  val isSparse: Boolean = true
)(implicit ev: TensorNumeric[T])
  extends Operation[Tensor[String], Tensor[Int], T] {

  // Fail at construction instead of with a modulo-by-zero on the first forward pass.
  require(hashBucketSize > 0, "hashBucketSize must be a positive integer")

  output = Tensor[Int]()

  override def updateOutput(input: Tensor[String]): Tensor[Int] = {
    val rows = input.size(dim = 1)
    val indices0 = new ArrayBuffer[Int]()
    val indices1 = new ArrayBuffer[Int]()
    val values = new ArrayBuffer[Int]()
    var maxFeaLen = 0
    var i = 1
    while (i <= rows) {
      val feaStrArr = input.valueAt(i, 1).split(strDelimiter)
      maxFeaLen = math.max(maxFeaLen, feaStrArr.length)
      var j = 0
      while (j < feaStrArr.length) {
        indices0 += i - 1
        indices1 += j
        // floorMod keeps the bucket id non-negative for negative hash values.
        values += Math.floorMod(MurmurHash3.stringHash(feaStrArr(j)), hashBucketSize)
        j += 1
      }
      i += 1
    }
    val indices = Array(indices0.toArray, indices1.toArray)
    val shape = Array(rows, maxFeaLen)
    val sparse = Tensor.sparse(indices, values.toArray, shape)
    output = if (isSparse) sparse else Tensor.dense(sparse)
    output
  }
}

object CategoricalColHashBucket{
  def apply[T: ClassTag](
    hashBucketSize: Int,
    strDelimiter: String = ",",
    isSparse: Boolean = true)
    (implicit ev: TensorNumeric[T])
  : CategoricalColHashBucket[T] = new CategoricalColHashBucket[T](
    hashBucketSize = hashBucketSize,
    strDelimiter = strDelimiter,
    isSparse = isSparse
  )
}
Example 40
package import import import import{Sum => SumLayer} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/**
 * Sums a tensor over the dimensions named by a second tensor.
 *
 * The input Table holds the data tensor at key 1 and an Int tensor of dimensions at key 2.
 * An empty dimension tensor returns a copy of the data; a scalar names one dimension; a 1D
 * tensor names several, which are reduced from the highest dimension down.
 *
 * @param keepDims      forwarded, negated, as the `squeeze` flag of the wrapped sum layer
 * @param startFromZero when true, every dimension index is shifted by one before use
 */
class Sum[T: ClassTag, D: ClassTag](val keepDims: Boolean, val startFromZero: Boolean = false)
  (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D])
  extends Operation[Table, Tensor[D], T] {

  private val sum: SumLayer[D] = SumLayer[D](squeeze = !keepDims)

  output = Tensor[D]()

  override def updateOutput(input: Table): Tensor[D] = {
    val data = input[Tensor[D]](1)
    val dims = input[Tensor[Int]](2)

    output.resizeAs(data).copy(data)

    if (dims.isEmpty) {
      // Nothing to reduce: the copy of the data is the result.
      output
    } else {
      val sumDims: Array[Int] =
        if (dims.isScalar) {
          val single = dims.value()
          Array(if (startFromZero) single + 1 else single)
        } else {
          require(dims.nDimension() == 1,
            s"Only accept 1D as dims, but now is ${dims.nDimension()}")
          val collected = new ArrayBuffer[Int]()
          // apply1 is used only to visit each element; every value is returned unchanged.
          dims.apply1 { d =>
            collected.append(if (startFromZero) d + 1 else d)
            d
          }
          // Highest dimension first.
          collected.toArray.sorted(Ordering[Int].reverse)
        }

      sumDims.foreach { dim =>
        sum.changeSumDims(dim)
        val partial = sum.updateOutput(output)
        output.resizeAs(partial).copy(partial)
      }
      output
    }
  }

  override def getClassTagNumerics() : (Array[ClassTag[_]], Array[TensorNumeric[_]]) = {
    (Array[ClassTag[_]](scala.reflect.classTag[T], scala.reflect.classTag[D]),
      Array[TensorNumeric[_]](ev, ev2))
  }
}

object Sum {
  def apply[T: ClassTag, D: ClassTag](keepDims: Boolean = false, startFromZero: Boolean = false)
    (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]): Sum[T, D] =
    new Sum(keepDims, startFromZero)
}
Example 41
Source File: Kv2Tensor.scala From BigDL with Apache License 2.0 | 5 votes |
// Kv2Tensor: turns rows of "key<itemDelimiter>value" pairs, joined by kvDelimiter, into a tensor
// of shape (rows, feaLen). The input Table holds the string tensor at key 1 and a scalar Int
// tensor (feaLen) at key 2. For each pair the key is parsed with toInt as the column index, and
// the value with toDouble or toFloat according to ev2.getType(); any other numeric type throws
// NotImplementedError. transType 0 returns the dense conversion of the sparse tensor and
// transType 1 the sparse tensor itself; the match has no other case.
// NOTE(review): `val kvFeaString =, i).valueAt(1)` has lost the start of its right-hand side and
// the import paths are gone; this looks like extraction damage. Restore from the upstream file.
// NOTE(review): each pair is split with itemDelimiter up to three times per iteration.
package import import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag class Kv2Tensor[T: ClassTag, D: ClassTag]( val kvDelimiter: String, val itemDelimiter: String, val transType: Int )(implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]) extends Operation[Table, Tensor[D], T]{ output = Activity.allocate[Tensor[D], D]() override def updateOutput(input: Table): Tensor[D] = { val kvTensor = input[Tensor[String]](1) val feaLen = input[Tensor[Int]](2).value() val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[D]() val rows = kvTensor.size(dim = 1) val shape = Array(rows, feaLen) var i = 1 while(i<=rows) { val kvFeaString =, i).valueAt(1) kvFeaString.split(kvDelimiter).foreach { kv => indices0 += i-1 indices1 += kv.split(itemDelimiter)(0).toInt ev2.getType() match { case DoubleType => values += kv.split(itemDelimiter)(1).toDouble.asInstanceOf[D] case FloatType => values += kv.split(itemDelimiter)(1).toFloat.asInstanceOf[D] case t => throw new NotImplementedError(s"$t is not supported") } } i += 1 } val indices = Array(indices0.toArray, indices1.toArray) val resTensor = transType match { case 0 => Tensor.dense(Tensor.sparse(indices, values.toArray, shape)) case 1 => Tensor.sparse(indices, values.toArray, shape) } output = resTensor output } override def getClassTagNumerics() : (Array[ClassTag[_]], Array[TensorNumeric[_]]) = { (Array[ClassTag[_]](scala.reflect.classTag[T], scala.reflect.classTag[D]), Array[TensorNumeric[_]](ev, ev2)) } } object Kv2Tensor{ def apply[T: ClassTag, D: ClassTag]( kvDelimiter: String = ",", itemDelimiter: String = ":", transType: Int = 0) (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]): Kv2Tensor[T, D] = new Kv2Tensor[T, D]( kvDelimiter = kvDelimiter, itemDelimiter = itemDelimiter, transType = transType ) }
Example 42
package import import import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag

/**
 * Logical-AND reduction of a boolean tensor over selected dimensions.
 *
 * The input Table holds the boolean data tensor at key 1 and an Int tensor naming the
 * dimensions to reduce (a scalar or a 1D tensor) at key 2.
 *
 * @param keepDim       when true, reduced dimensions stay in the output with size 1;
 *                      when false they are removed from the output shape
 * @param startFromZero when true, non-negative dimension indices are shifted by one before
 *                      being used as 1-based dimensions; negative indices always count back
 *                      from the last dimension
 */
class All[T: ClassTag](keepDim : Boolean = false, startFromZero : Boolean = false)
  (implicit ev: TensorNumeric[T]) extends Operation[Table, Tensor[Boolean], T] {

  output = Tensor[Boolean]()

  // Holds the running result between successive single-dimension reductions.
  private var buffer = Tensor[Boolean]()

  /** Converts a user-supplied dimension index into the 1-based index used by `reduce`. */
  private def toOneBasedDim(raw: Int, nDim: Int): Int =
    if (raw < 0) nDim + raw + 1
    else if (startFromZero) raw + 1
    else raw

  override def updateOutput(input: Table): Tensor[Boolean] = {
    val data = input[Tensor[Boolean]](1)
    val indices = input[Tensor[Int]](2)
    require(indices.nDimension() == 1 || indices.isScalar,
      "indices must be 1D tensor or scalar")

    output.resizeAs(data)
    buffer.resizeAs(data).copy(data)

    // Scalar and 1D index tensors are handled by the same loop.
    val requested: Seq[Int] =
      if (indices.isScalar) Seq(indices.value())
      else (1 to indices.size(1)).map(i => indices.valueAt(i))

    val reduceDims = new ArrayBuffer[Int]()
    val size = output.size()
    var reduced = false
    requested.foreach { raw =>
      val dim = toOneBasedDim(raw, data.nDimension())
      // Record every requested dimension, including those already of size 1, so that it is
      // dropped from the output shape below when keepDim is false.
      if (!reduceDims.contains(dim)) reduceDims += dim
      if (size(dim - 1) != 1) {
        size(dim - 1) = 1
        output.resize(size)
        buffer.reduce(dim, output, (a, b) => a && b)
        buffer.resizeAs(output).copy(output)
        reduced = true
      }
    }

    // When every requested dimension already had size 1 nothing was written to `output`;
    // the reduction is then the identity, so the data is copied through.
    if (!reduced) output.copy(data)

    if (!keepDim) {
      val keptSizes = (1 to data.nDimension())
        .filter(d => !reduceDims.contains(d))
        .map(d => data.size(d))
      output.resize(keptSizes.toArray)
    }
    output
  }

  override def clearState(): this.type = {
    super.clearState()
    buffer.set()
    this
  }
}

object All {
  def apply[T: ClassTag](keepDim: Boolean = false, startFromZero : Boolean = false)
    (implicit ev: TensorNumeric[T]): All[T] = new All[T](keepDim, startFromZero)
}
Example 43
Source File: ParallelTable.scala From BigDL with Apache License 2.0 | 5 votes |
// ParallelTable: container that applies its i-th module (0-based in `modules`) to the i-th
// element of the input Table (1-based keys) and stores each result under the same key.
// updateOutput, updateGradInput, accGradParameters and backward all walk input.length()
// entries in that pairing; backward also adds its elapsed nanoseconds to backwardTime.
// getEndNodes requires exactly one start node per module and concatenates the end nodes
// returned by each module. toString renders one line per module between "input" and "output".
// NOTE(review): the getEndNodes require checks `startNodes.length == modules.length` but its
// message says "is more than modules length"; the message should describe a length mismatch.
// NOTE(review): the `val last = "...` string literal is broken across a line boundary, the
// whitespace inside the tree-drawing literals looks collapsed, and the import paths are gone;
// this looks like extraction damage. Restore from the upstream file before compiling.
package import import{AbstractModule, Activity} import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @SerialVersionUID(- 1197848941394786045L) class ParallelTable[T: ClassTag] (implicit ev: TensorNumeric[T]) extends DynamicContainer[Table, Table, T] { override def updateOutput(input: Table): Table = { var i = 0 while (i < input.length()) { output.update(i + 1, modules(i).forward(input(i + 1))) i += 1 } output } override def updateGradInput(input: Table, gradOutput: Table): Table = { var i = 0 while (i < input.length()) { gradInput.update(i + 1, modules(i).updateGradInput(input(i + 1), gradOutput(i + 1))) i += 1 } gradInput } override def accGradParameters(input: Table, gradOutput: Table): Unit = { var i = 0 while (i < input.length()) { modules(i).accGradParameters(input(i + 1), gradOutput(i + 1)) i += 1 } } override def backward(input: Table, gradOutput: Table): Table = { val before = System.nanoTime() var i = 0 while (i < input.length()) { gradInput.update(i + 1, modules(i).backward(input(i + 1), gradOutput(i + 1))) i += 1 } backwardTime += System.nanoTime() - before gradInput } override def getEndNodes(startNodes: Array[ModuleNode[T]]): Array[ModuleNode[T]] = { val outputs = ArrayBuffer[ModuleNode[T]]() var outputTuple: Array[ModuleNode[T]] = null require(startNodes.length == modules.length, s"ParallelTable: " + s"startNodes length ${startNodes.length} is more than modules length ${modules.length}") for (i <- 0 to modules.size - 1) { outputTuple = modules(i).getEndNodes(Array(startNodes(i))) outputs ++= outputTuple } outputs.toArray } override def toString: String = { val tab = "\t" val line = "\n" val next = " |`-> " val lastNext = " `-> " val ext = " | " val extlast = " " val last = " ... 
-> " var str = "nn.ParallelTable" str = str + " {" + line + tab + "input" var i = 1 while (i <= modules.length) { if (i == modules.length) { str = str + line + tab + lastNext + "(" + i + "): " + modules(i-1).toString.replace(line, line + tab + extlast) } else { str = str + line + tab + next + "(" + i + "): " + modules(i-1).toString.replace(line, line + tab + ext) } i += 1 } str = str + line + tab + last + "output" str = str + line + "}" str } } object ParallelTable { def apply[@specialized(Float, Double) T: ClassTag]() (implicit ev: TensorNumeric[T]) : ParallelTable[T] = { new ParallelTable[T]() } }
Example 44
Source File: MultiCriterion.scala From BigDL with Apache License 2.0 | 5 votes |
// MultiCriterion: criterion that combines several criterions, each paired with a Double weight.
// add() appends a criterion to the `criterions` table and its weight to `weights`.
// updateGradInput resizes gradInput like the input, fills it with 0, then adds each criterion's
// gradInput scaled by its weight. equals compares super.equals and the weights buffer.
// NOTE(review): `output =, ev.times(...)` in updateOutput and the
// `val state = Seq(super.hashCode(), weights), b) => 31 * a + b)` expression in hashCode have
// lost tokens, and the import paths are gone; this looks like extraction damage. Restore from
// the upstream file before compiling.
// NOTE(review): the local getHashCode helper in hashCode is not referenced in the visible text.
package import{Activity, AbstractCriterion} import import import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @SerialVersionUID(- 8679064077837483164L) class MultiCriterion[@specialized(Float, Double) T: ClassTag] (implicit ev: TensorNumeric[T]) extends AbstractCriterion[Activity, Activity, T] { private val weights = new ArrayBuffer[Double] private val criterions = T() def add(criterion: AbstractCriterion[Activity, Activity, T], weight: Double = 1): Unit = { criterions.insert(criterions.length() + 1, criterion) weights.append(weight) } override def updateOutput(input: Activity, target: Activity): T = { var i = 1 while (i <= criterions.length) { output =, ev.times(ev.fromType(weights(i-1)), criterions[AbstractCriterion[Activity, Activity, T]](i).updateOutput(input, target))) i +=1 } output } override def updateGradInput(input: Activity, target: Activity): Activity = { gradInput = Utils.recursiveResizeAs[T](gradInput, input) Utils.recursiveFill[T](gradInput, 0) var i = 1 while (i <= criterions.length) { Utils.recursiveAdd(gradInput, weights(i - 1), criterions[AbstractCriterion[Activity, Activity, T]](i).updateGradInput(input, target)) i += 1 } gradInput } override def canEqual(other: Any): Boolean = other.isInstanceOf[MultiCriterion[T]] override def equals(other: Any): Boolean = other match { case that: MultiCriterion[T] => super.equals(that) && (that canEqual this) && weights == that.weights case _ => false } override def hashCode(): Int = { def getHashCode(a: Any): Int = if (a == null) 0 else a.hashCode() val state = Seq(super.hashCode(), weights), b) => 31 * a + b) } override def toString(): String = { s"nn.MultiCriterion" } } object MultiCriterion { def apply[@specialized(Float, Double) T: ClassTag]() (implicit ev: TensorNumeric[T]) : MultiCriterion[T] = { new MultiCriterion[T]() } }
Example 45
Source File: Metrics.scala From BigDL with Apache License 2.0 | 5 votes |
// Metrics: named metric registry backed by three mutable maps: local entries (AtomicDouble
// plus a `parallel` count), aggregate distributed entries (Spark Accumulator[Double] plus
// `parallel`) and distributed entries (Spark Accumulable over ArrayBuffer[Double]).
//  - add(name, value) requires the name to exist in at least one map and adds the value to
//    every map that contains it.
//  - the three set(...) overloads create or overwrite an entry in one map and require that the
//    name is not already registered in the maps checked by their require calls.
//  - get(name) returns (value, parallel) from the local map if present, otherwise from the
//    aggregate map; get(name, number) returns the distributed values with the last `number`
//    elements dropped.
// NOTE(review): summary() has lost the collection expressions in front of each `entry =>`
// lambda, a string literal ("duplicated distribute metric") is split across a line boundary,
// and the import paths are gone; this looks like extraction damage. Restore from the upstream
// file before compiling.
package import import org.apache.spark.{Accumulable, Accumulator, SparkContext} import scala.collection.mutable.{ArrayBuffer, Map} class Metrics extends Serializable { private val localMetricsMap: Map[String, LocalMetricsEntry] = Map() private val aggregateDistributeMetricsMap: Map[String, AggregateDistributeMetricsEntry] = Map() private val distributeMetricsMap: Map[String, DistributeMetricsEntry] = Map() def add(name: String, value: Double): this.type = { require(localMetricsMap.contains(name) || aggregateDistributeMetricsMap.contains(name) || distributeMetricsMap.contains(name)) if (localMetricsMap.contains(name)) { localMetricsMap(name).value.addAndGet(value) } if (aggregateDistributeMetricsMap.contains(name)) { aggregateDistributeMetricsMap(name).value += value } if (distributeMetricsMap.contains(name)) { distributeMetricsMap(name).value += value } this } def set(name: String, value: Double, parallel: Int = 1): this.type = { require(!aggregateDistributeMetricsMap.contains(name), "duplicated distribute metric") require(!distributeMetricsMap.contains(name), "duplicated distribute metric2") if (localMetricsMap.contains(name)) { localMetricsMap(name).value.set(value) localMetricsMap(name).parallel = parallel } else { localMetricsMap(name) = LocalMetricsEntry(new AtomicDouble(value), parallel) } this } def set(name: String, value: Double, sc: SparkContext, parallel: Int): this.type = { require(!localMetricsMap.contains(name), "duplicated local metric") if (aggregateDistributeMetricsMap.contains(name)) { aggregateDistributeMetricsMap(name).value.setValue(value) aggregateDistributeMetricsMap(name).parallel = parallel } else { aggregateDistributeMetricsMap(name) = AggregateDistributeMetricsEntry(sc.accumulator(value, name), parallel) } this } def set(name: String, value: ArrayBuffer[Double], sc: SparkContext): this.type = { require(!localMetricsMap.contains(name), "duplicated local metric") require(!aggregateDistributeMetricsMap.contains(name), "duplicated distribute 
metric") if (distributeMetricsMap.contains(name)) { distributeMetricsMap(name).value.setValue(value) } else { distributeMetricsMap(name) = DistributeMetricsEntry(sc.accumulableCollection(value)) } this } def get(name: String): (Double, Int) = { require(localMetricsMap.contains(name) || aggregateDistributeMetricsMap.contains(name)) if (localMetricsMap.contains(name)) { (localMetricsMap(name).value.get(), localMetricsMap(name).parallel) } else { (aggregateDistributeMetricsMap(name).value.value, aggregateDistributeMetricsMap(name).parallel) } } def get(name: String, number: Int): Array[Double] = { require(distributeMetricsMap.contains(name)) distributeMetricsMap(name).value.value.toArray.dropRight(number) } def summary(unit: String = "s", scale: Double = 1e9): String = { "========== Metrics Summary ==========\n" + entry => s"${entry._1} : ${entry._2.value.get() / entry._2.parallel / scale} $unit\n") .mkString("") + entry => s"${entry._1} : ${entry._2.value.value / entry._2.parallel / scale} $unit\n") .mkString("") + { entry => s"${entry._1} : ${ / scale).mkString(" ")} \n" }.mkString("") + "=====================================" } } private case class LocalMetricsEntry(value: AtomicDouble, var parallel: Int) private case class AggregateDistributeMetricsEntry(value: Accumulator[Double], var parallel: Int) private case class DistributeMetricsEntry(value: Accumulable[ArrayBuffer[Double], Double])
Example 46
Source File: BatchSamplerSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import{Storage, Tensor} import import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer

/** Tests for BatchSampler: sampling, the overlap constraint, and batch sample generation. */
class BatchSamplerSpec extends FlatSpec with Matchers {

  // Box coordinates shared by the constraint tests and the batch-sampler test (4 boxes x 4).
  private val sharedBoxCoords = Array(0.418, 0.396396, 0.55, 0.666667,
    0.438, 0.321321, 0.546, 0.561562,
    0.93, 0.81982, 0.966, 0.972973,
    0.872, 0.837838, 0.912, 0.981982)

  /** Builds a 4-box RoiLabel from 16 coordinates, with randomly drawn class values. */
  private def roiTarget(coords: Array[Double]): RoiLabel = {
    val boxes = Tensor(Storage(coords.map(x => x.toFloat))).resize(4, 4)
    val classes = Tensor[Float](4).randn()
    RoiLabel(classes, boxes)
  }

  /** A sampler with the scale and aspect-ratio settings common to the constrained cases. */
  private def overlapSampler(minOverlap: Double): BatchSampler =
    new BatchSampler(minScale = 0.3,
      minAspectRatio = 0.5,
      maxAspectRatio = 2,
      minOverlap = Some(minOverlap))

  "batch sampler with no change" should "work properly" in {
    val sampler = new BatchSampler(maxTrials = 1)
    val unitBox = BoundingBox(0, 0, 1, 1)
    val target = roiTarget(Array(0.582296, 0.334719, 0.673582, 0.52183,
      0.596127, 0.282744, 0.670816, 0.449064,
      0.936376, 0.627859, 0.961272, 0.733888,
      0.896266, 0.640333, 0.923928, 0.740125))
    val sampledBoxes = new ArrayBuffer[BoundingBox]()
    sampler.sample(unitBox, target, sampledBoxes)

    sampledBoxes.length should be(1)
    sampledBoxes(0) should be(unitBox)
  }

  "satisfySampleConstraint with minOverlap 0.1" should "work properly" in {
    val target = roiTarget(sharedBoxCoords)
    val sampledBox = BoundingBox(0.114741f, 0.248062f, 0.633665f, 0.763736f)
    val sampler = overlapSampler(0.1)

    sampler.satisfySampleConstraint(sampledBox, target) should be(true)
  }

  "satisfySampleConstraint with minOverlap 0.3" should "work properly" in {
    val target = roiTarget(sharedBoxCoords)
    val sampledBox = BoundingBox(0.266885f, 0.416113f, 0.678256f, 0.67208f)
    val sampler = overlapSampler(0.3)

    sampler.satisfySampleConstraint(sampledBox, target) should be(true)
  }

  "batch samplers" should "work properly" in {
    val target = roiTarget(sharedBoxCoords)
    val sampledBoxes = new ArrayBuffer[BoundingBox]()
    val batchSamplers = Array(
      new BatchSampler(maxTrials = 1),
      overlapSampler(0.1),
      overlapSampler(0.3),
      overlapSampler(0.5),
      overlapSampler(0.7),
      overlapSampler(0.9),
      new BatchSampler(minScale = 0.3,
        minAspectRatio = 0.5,
        maxAspectRatio = 2,
        maxOverlap = Some(1.0)))
    BatchSampler.generateBatchSamples(target, batchSamplers, sampledBoxes)

    // No expectation is asserted here; the sampled boxes are only printed.
    sampledBoxes.foreach(box => println(box))
  }
}
Example 47
Source File: BigDLSpecHelper.scala From BigDL with Apache License 2.0 | 5 votes |
// BigDLSpecHelper: abstract FlatSpec base class that tracks temporary files for cleanup.
// Files returned by createTmpFile() are appended to tmpFiles; the `after` hook calls doAfter()
// and then deletes every tracked file that still exists, requiring each to be a regular file.
// The `before` hook calls doBefore(). getFileFolder and getFileName split a path at the last
// JFile.separator.
// NOTE(review): `val file ="UnitTest", "BigDLSpecBase")"created file $file")` and
// `f.delete()"deleted file $f")` have lost the calls in front of their arguments, and the
// import path is gone; this looks like extraction damage. Restore from the upstream file.
// NOTE(review): getFileFolder calls substring(0, lastIndexOf(...)), which would throw for a
// path that contains no separator (lastIndexOf returns -1).
package import{File => JFile} import org.apache.log4j.Logger import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer abstract class BigDLSpecHelper extends FlatSpec with Matchers with BeforeAndAfter { protected val logger = Logger.getLogger(getClass) private val tmpFiles : ArrayBuffer[JFile] = new ArrayBuffer[JFile]() protected def createTmpFile(): JFile = { val file ="UnitTest", "BigDLSpecBase")"created file $file") tmpFiles.append(file) file } protected def getFileFolder(path: String): String = { path.substring(0, path.lastIndexOf(JFile.separator)) } protected def getFileName(path: String): String = { path.substring(path.lastIndexOf(JFile.separator) + 1) } def doAfter(): Unit = {} def doBefore(): Unit = {} before { doBefore() } after { doAfter() tmpFiles.foreach(f => { if (f.exists()) { require(f.isFile, "cannot clean folder") f.delete()"deleted file $f") } }) } }
Example 48
Source File: Kv2TensorSpec.scala From BigDL with Apache License 2.0 | 5 votes |
// Kv2TensorSpec: builds batchLen (3) random key:value strings over feaLen (8) columns with
// 2, 3 and 5 active entries, records the expected sparse indices and values while doing so,
// and checks that Kv2Tensor yields the matching sparse tensor (transType = 1) and dense tensor
// (transType = 0). Kv2TensorSerialTest runs the module through runSerializationTest on a fixed
// three-row input.
// NOTE(review): randKVMap ends after `val values = ...` with no visible result expression, and
// `val kvStr = => ...`, `indices1 ++= => kv._1` and `values ++= => kv._2` have lost their
// left-hand collection expressions; the import paths are also gone. This looks like extraction
// damage. Restore from the upstream file before compiling.
package import{DenseType, SparseType, Tensor} import import{T, Table} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer import scala.util.Random class Kv2TensorSpec extends FlatSpec with Matchers { protected def randDoubles(length: Int, lp: Double = 0.0, up: Double = 1.0): Array[Double] = { (1 to length).map(_ => lp + (up - lp) * Random.nextDouble()).toArray } protected def randKVMap(size: Int, numActive: Int, lp: Double = 0.0, up: Double = 1.0): Map[Int, Double] = { require(numActive <= size) val keys = Random.shuffle((0 until size).toList).take(numActive) val values = randDoubles(numActive, lp, up) } val batchLen = 3 val numActive = Array(2, 3, 5) val feaLen = 8 val originData = new ArrayBuffer[String]() val originArr = new ArrayBuffer[Table]() val indices0 = new ArrayBuffer[Int]() val indices1 = new ArrayBuffer[Int]() val values = new ArrayBuffer[Double]() for (i <- 0 until batchLen) { val kvMap = randKVMap(feaLen, numActive(i)) val kvStr = => s"${data._1}:${data._2}").mkString(",") originData += kvStr originArr += T(kvStr) indices0 ++= ArrayBuffer.fill(numActive(i))(i) val kvArr = kvMap.toArray indices1 ++= => kv._1) values ++= => kv._2) } val originTable = T.array(originArr.toArray) val indices = Array(indices0.toArray, indices1.toArray) val shape = Array(batchLen, feaLen) "Kv2Tensor operation kvString to SparseTensor" should "work correctly" in { val input = T( Tensor[String](originTable), Tensor[Int](Array(feaLen), shape = Array[Int]()) ) val expectOutput = Tensor.sparse[Double]( indices = indices, values = values.toArray, shape = shape ) val output = Kv2Tensor[Double, Double](transType = 1) .forward(input) output should be(expectOutput) } "Kv2Tensor operation kvString to DenseTensor" should "work correctly" in { val input = T( Tensor[String](originTable), Tensor[Int](Array(feaLen), shape = Array[Int]()) ) val expectOutput = Tensor.dense(Tensor.sparse[Double]( indices = indices, values = values.toArray, shape = shape )) 
val output = Kv2Tensor[Double, Double](transType = 0) .forward(input) output should be(expectOutput) } } class Kv2TensorSerialTest extends ModuleSerializationTest { override def test(): Unit = { val kv2tensor = Kv2Tensor[Float, Float]( kvDelimiter = ",", itemDelimiter = ":", transType = 0 ).setName("kv2tensor") val input = T( Tensor[String]( T(T("0:0.1,1:0.2"), T("1:0.3,3:0.5"), T("2:0.15,4:0.25"))), Tensor[Int](Array(5), shape = Array[Int]()) ) runSerializationTest(kv2tensor, input) } }
Example 49
Source File: RMSpropSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer
//
/** Checks that RMSprop drives the Rosenbrock function close to its minimum at (1, 1). */
class RMSpropSpec extends FlatSpec with Matchers {

  "RMSprop" should "perform well on rosenbrock function" in {
    // Timed from the start of the test body, not from suite construction.
    val start = System.currentTimeMillis()
    val x = Tensor[Double](2).fill(0)
    val config = T("learningRate" -> 5e-4)
    val optm = new RMSprop[Double]
    // Loss sampled at iterations 1, 1001, 2001, ...
    val fx = new ArrayBuffer[Double]
    for (i <- 1 to 10001) {
      val result = optm.optimize(TestUtils.rosenBrock, x, config)
      if ((i - 1) % 1000 == 0) {
        fx += result._2(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    fx.zipWithIndex.foreach { case (value, idx) =>
      println(s"${idx * 1000 + 1}, $value")
    }

    val spend = System.currentTimeMillis() - start
    println("Time Cost: " + spend + "ms")

    // Comparing the value directly reports the actual loss on failure.
    fx.last should be < 1e-4
    x(Array(1)) should be(1.0 +- 0.01)
    x(Array(2)) should be(1.0 +- 0.01)
  }
}
Example 50
Source File: AdagradSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import{TestUtils, T} import org.scalatest.{FlatSpec, Matchers} import import scala.collection.mutable.ArrayBuffer

/** Checks that Adagrad drives the Rosenbrock function close to its minimum at (1, 1). */
class AdagradSpec extends FlatSpec with Matchers {

  "adagrad" should "perform well on rosenbrock function" in {
    val x = Tensor[Double](2).fill(0)
    val config = T("learningRate" -> 1e-1)
    val optm = new Adagrad[Double]
    // Loss sampled at iterations 1, 1001, 2001, ...
    val fx = new ArrayBuffer[Double]
    for (i <- 1 to 10001) {
      val result = optm.optimize(TestUtils.rosenBrock, x, config)
      if ((i - 1) % 1000 == 0) {
        fx += result._2(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    fx.zipWithIndex.foreach { case (value, idx) =>
      println(s"${idx * 1000 + 1}, $value")
    }

    // Comparing the value directly reports the actual loss on failure.
    fx.last should be < 1e-9
    x(Array(1)) should be(1.0 +- 0.01)
    x(Array(2)) should be(1.0 +- 0.01)
  }
}
Example 51
Source File: LBFGSSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import{TestUtils, T} import org.scalatest.{FlatSpec, Matchers} import import scala.collection.mutable.ArrayBuffer

/** Checks LBFGS on the Rosenbrock function, in one batch call and in repeated single steps. */
class LBFGSSpec extends FlatSpec with Matchers {

  /** Prints the final point and the loss trace, one numbered loss value per line. */
  private def printTrace(x: Tensor[Double], fx: Iterable[Double]): Unit = {
    println()
    println("Rosenbrock test")
    println()
    println(s"x = $x")
    println("fx = ")
    fx.zipWithIndex.foreach { case (value, idx) =>
      println(s"${idx + 1} $value")
    }
    println()
    println()
  }

  "torchLBFGS in regular batch test" should "perform well on rosenbrock function" in {
    val x = Tensor[Double](2).fill(0)
    val optm = new LBFGS[Double]
    val result = optm.optimize(TestUtils.rosenBrock, x,
      T("maxIter" -> 100, "learningRate" -> 1e-1))
    val fx = result._2

    printTrace(x, fx)

    // Comparing the value directly reports the actual loss on failure.
    fx.last should be < 1e-6
  }

  "torchLBFGS in stochastic test" should "perform well on rosenbrock function" in {
    val x = Tensor[Double](2).fill(0)
    val optm = new LBFGS[Double]
    val fx = new ArrayBuffer[Double]()

    // One LBFGS iteration per optimize call, repeated 100 times.
    val config = T("maxIter" -> 1, "learningRate" -> 1e-1)
    for (i <- 1 to 100) {
      val result = optm.optimize(TestUtils.rosenBrock, x, config)
      fx.append(result._2(0))
    }

    printTrace(x, fx)

    fx.last should be < 1e-6
  }
}
Example 52
Source File: AdadeltaSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer

/** Checks that Adadelta drives the Rosenbrock function close to its minimum at (1, 1). */
class AdadeltaSpec extends FlatSpec with Matchers {

  "adadelta" should "perform well on rosenbrock function" in {
    // Timed from the start of the test body, not from suite construction.
    val start = System.currentTimeMillis()
    val x = Tensor[Double](2).fill(0)
    val config = T("Epsilon" -> 1e-10)
    val optm = new Adadelta[Double]
    // Loss sampled at iterations 1, 1001, 2001, ...
    val fx = new ArrayBuffer[Double]
    for (i <- 1 to 10001) {
      val result = optm.optimize(TestUtils.rosenBrock, x, config)
      if ((i - 1) % 1000 == 0) {
        fx += result._2(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    fx.zipWithIndex.foreach { case (value, idx) =>
      println(s"${idx * 1000 + 1}, $value")
    }

    val spend = System.currentTimeMillis() - start
    println("Time Cost: " + spend + "ms")

    // Comparing the value directly reports the actual loss on failure.
    fx.last should be < 1e-4
    x(Array(1)) should be(1.0 +- 0.02)
    x(Array(2)) should be(1.0 +- 0.02)
  }
}
Example 53
Source File: AdamaxSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import import{T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer

/** Checks that Adamax drives the Rosenbrock function close to its minimum at (1, 1). */
class AdamaxSpec extends FlatSpec with Matchers {

  "adamax" should "perform well on rosenbrock function" in {
    // Timed from the start of the test body, not from suite construction.
    val start = System.currentTimeMillis()
    val x = Tensor[Double](2).fill(0)
    val config = T()
    val optm = new Adamax[Double]
    // Loss sampled at iterations 1, 1001, 2001, ...
    val fx = new ArrayBuffer[Double]
    for (i <- 1 to 10001) {
      val result = optm.optimize(TestUtils.rosenBrock, x, config)
      if ((i - 1) % 1000 == 0) {
        fx += result._2(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    fx.zipWithIndex.foreach { case (value, idx) =>
      println(s"${idx * 1000 + 1}, $value")
    }

    val spend = System.currentTimeMillis() - start
    println("Time Cost: " + spend + "ms")

    // Comparing the value directly reports the actual loss on failure.
    fx.last should be < 1e-9
    x(Array(1)) should be(1.0 +- 0.01)
    x(Array(2)) should be(1.0 +- 0.01)
  }
}
Example 54
Source File: AdamSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package import{CrossEntropyCriterion, Linear, Sequential} import import{Engine, RandomGenerator, T, TestUtils} import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer import scala.util.Random

/**
 * Runs Adam and ParallelAdam for 10001 steps each on `TestUtils.rosenBrock`
 * and checks the final sampled loss and both coordinates of `x`.
 */
class AdamSpec extends FlatSpec with Matchers with BeforeAndAfter {

  // Every test runs with these two system properties set and Engine.init called.
  before {
    System.setProperty("bigdl.localMode", "true")
    System.setProperty("spark.master", "local[2]")
    Engine.init
  }

  after {
    System.clearProperty("bigdl.localMode")
    System.clearProperty("spark.master")
  }

  val start = System.currentTimeMillis()

  "adam" should "perform well on rosenbrock function" in {
    val x = Tensor[Double](2).fill(0)
    val config = T("learningRate" -> 0.002)
    val optm = new Adam[Double]
    val fx = new ArrayBuffer[Double]

    // The loss of steps 1, 1001, ..., 10001 is recorded.
    (0 to 10000).foreach { step =>
      val loss = optm.optimize(TestUtils.rosenBrock, x, config)._2
      if (step % 1000 == 0) {
        fx += loss(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    for (idx <- 0 until fx.length) {
      println(s"${idx * 1000 + 1}, ${fx(idx)}")
    }

    val spend = System.currentTimeMillis() - start
    println("Time Cost: " + spend + "ms")

    (fx.last < 1e-9) should be(true)
    x(Array(1)) should be(1.0 +- 0.01)
    x(Array(2)) should be(1.0 +- 0.01)
  }

  "ParallelAdam" should "perform well on rosenbrock function" in {
    val x = Tensor[Double](2).fill(0)
    val optm = new ParallelAdam[Double](learningRate = 0.002, parallelNum = 2)
    val fx = new ArrayBuffer[Double]

    // Same sampling as above; the optimizer is configured through its constructor.
    (0 to 10000).foreach { step =>
      val loss = optm.optimize(TestUtils.rosenBrock, x)._2
      if (step % 1000 == 0) {
        fx += loss(0)
      }
    }

    println(s"x is \n$x")
    println("fx is")
    for (idx <- 0 until fx.length) {
      println(s"${idx * 1000 + 1}, ${fx(idx)}")
    }

    val spend = System.currentTimeMillis() - start
    println("Time Cost: " + spend + "ms")

    (fx.last < 1e-9) should be(true)
    x(Array(1)) should be(1.0 +- 0.01)
    x(Array(2)) should be(1.0 +- 0.01)
  }
}
Example 55
Source File: TrimmedIndependentPixelEvaluator.scala From scalismo-faces with Apache License 2.0 | 5 votes |
/*
 * Fragment of TrimmedIndependentPixelEvaluator. The class header and the signature of the
 * method that contains the pixel loop are not present in this listing, so `reference`,
 * `sample`, `alphaClamped` and `visualizationCallback` are defined outside the visible text.
 *
 * Visible pieces:
 *  - visualize: writes every (value, x, y) triple into an Option[Double] buffer sized by
 *    `domain` (initially None everywhere) and passes the resulting image to `callBack`.
 *  - pixel loop: for each pixel whose sample alpha exceeds 1e-4 it stores
 *    (pixelEvaluator.logValue(ref, smp.toRGB) - bgEvaluator.logValue(ref), x, y);
 *    the alpha of every pixel is added to `transparencySum`.
 *  - trimming: nCount = floor(values.length * alphaClamped). When both transparencySum and
 *    nCount are positive, the values are sorted ascending and the nCount largest are summed;
 *    otherwise Double.NegativeInfinity is the result.
 *  - toString and the companion object with two `apply` overloads (without / with a callback).
 *
 * NOTE(review): the sum reads indices size-nCount .. size-1, but the slice handed to
 * `visualize` is [size-1-nCount, size-1), i.e. shifted down by one — confirm which is intended.
 * NOTE(review): toString appends bgEvaluator.toString and "alpha=..." with no separator — confirm.
 */
package scalismo.faces.sampling.face.evaluators import scalismo.color.{RGB, RGBA} import scalismo.faces.image.{ImageBuffer, PixelImage, PixelImageDomain} import scalismo.sampling.DistributionEvaluator import scalismo.sampling.evaluators.PairEvaluator import scala.collection.mutable.ArrayBuffer def visualize(values: IndexedSeq[(Double, Int, Int)], domain: PixelImageDomain, callBack: PixelImage[Option[Double]] => Unit): Unit = { val buffer = ImageBuffer.makeConstantBuffer[Option[Double]](domain.width, domain.height, None) values.foreach { case (lh: Double, x: Int, y: Int) => buffer(x, y) = Some(lh) } callBack(buffer.toImage) } var transparencySum = 0.0 var values = ArrayBuffer[(Double, Int, Int)]() var x: Int = 0 while (x < reference.width) { var y: Int = 0 while (y < reference.height) { val smp = sample(x, y) if (smp.a > 1e-4f) { val ref = reference(x, y).toRGB val fg: Double = pixelEvaluator.logValue(ref, smp.toRGB) val bg: Double = bgEvaluator.logValue(ref) val entry = (fg - bg, x, y) values += entry } transparencySum += smp.a y += 1 } x += 1 } val nCount = math.floor(values.length.toFloat * alphaClamped).toInt if (transparencySum > 0 && nCount > 0) { //was something rendered on the image? val data = values.toIndexedSeq.sortBy { case (d: Double, x: Int, y: Int) => d } var sumTrimmed: Double = 0.0 for (i <- 0 until nCount) { sumTrimmed += data(data.size - 1 - i)._1 } if (visualizationCallback.isDefined) visualize(data.slice(data.size - 1 - nCount, data.size - 1), reference.domain, visualizationCallback.get) sumTrimmed } else { // nothing was rendered on the image! 
Double.NegativeInfinity } } override def toString: String = { val builder = new StringBuilder(128) builder ++= "TrimmedIndependentPixelEvaluator(" builder ++= pixelEvaluator.toString builder ++= "/" builder ++= bgEvaluator.toString builder ++= s"alpha=$alphaClamped" builder ++= ")" builder.mkString } } object TrimmedIndependentPixelEvaluator { def apply(pixelEvaluator: PairEvaluator[RGB], bgEvaluator: DistributionEvaluator[RGB], alpha: Double) = new TrimmedIndependentPixelEvaluator(pixelEvaluator, bgEvaluator, alpha, None) def apply(pixelEvaluator: PairEvaluator[RGB], bgEvaluator: DistributionEvaluator[RGB], alpha: Double, visualisationCallback: PixelImage[Option[Double]] => Unit) = new TrimmedIndependentPixelEvaluator(pixelEvaluator, bgEvaluator, alpha, Some(visualisationCallback)) }
Example 56
Source File: MorphologicalFilter.scala From scalismo-faces with Apache License 2.0 | 5 votes |
/*
 * Fragment of MorphologicalFilter. The class header and the signature of the method that
 * encloses `perPixel` are not present in this listing, so `image`, `structuringElement`,
 * `windowFilter`, `width` and `height` are defined outside the visible text.
 *
 * Visible pieces:
 *  - perPixel(x, y): walks a width x height window whose offsets are kx - width / 2 and
 *    ky - height / 2 relative to (x, y), collects image(ix, iy) wherever
 *    structuringElement(kx, ky) is true, and returns windowFilter(collected pixels);
 *    when nothing was collected it returns image(x, y) unchanged.
 *  - the enclosing method returns `image` itself when width or height is <= 0, otherwise a
 *    new PixelImage of the same size computed by perPixel with access mode Strict().
 *  - MorphologicalFilter.boxElement(size): a size x size Boolean view that is true for
 *    coordinates inside [0, size) on both axes.
 *
 * NOTE(review): ix / iy can fall outside the image near the borders; how that is handled
 * depends on the access mode of `image`, which is not visible here — confirm.
 */
package scalismo.faces.image.filter import scalismo.faces.image.AccessMode._ import scalismo.faces.image._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag def perPixel(x: Int, y: Int): A = { var kx = 0 var kernelPixels = new ArrayBuffer[A](width * height) while (kx < width) { val ix = x + kx - width / 2 var ky = 0 while (ky < height) { val iy = y + ky - height / 2 if (structuringElement(kx, ky)) kernelPixels += image(ix, iy) ky += 1 } kx += 1 } if (kernelPixels.nonEmpty) windowFilter(kernelPixels) else image(x, y) } if(width <= 0 || height <= 0) image else PixelImage(image.width, image.height, perPixel, Strict()) } } object MorphologicalFilter { def boxElement(size: Int): PixelImage[Boolean] = PixelImage.view(size, size, (x, y) => x >= 0 && x < size && y >= 0 && y < size) }
Example 57
Source File: ImmutableSelection.scala From hacktoberfest-scala-algorithms with GNU General Public License v3.0 | 5 votes |
package io.github.sentenza.hacktoberfest.algos

import scala.collection.mutable.ArrayBuffer
import scala.math.Ordered

/**
 * Selection on immutable lists.
 *
 * The enclosing object header was missing from the listing (one closing brace had no
 * opener); it is restored here under the name given by the file name.
 */
object ImmutableSelection {

  /**
   * Returns the element that would sit at position `idx` (0-based) if `list` were sorted
   * in ascending order, using quickselect with the head of the list as pivot.
   *
   * @param list the values to select from; it is not modified
   * @param idx  0-based rank of the wanted element
   * @return `Some(element)` when `0 <= idx < list.size`, otherwise `None`
   */
  @scala.annotation.tailrec
  def quickSelect(list: List[Int], idx: Int): Option[Int] =
    list match {
      case _ if idx < 0 => None
      // Also reached when idx was beyond the end: the index outlives the shrinking list.
      case Nil => None
      case pivot :: rest =>
        val (smaller, larger) = rest.partition(_ <= pivot)
        // In sorted order the pivot comes right after everything that is <= pivot.
        val pivotIdx = smaller.size
        if (idx < pivotIdx) quickSelect(smaller, idx)
        else if (idx == pivotIdx) Some(pivot)
        else quickSelect(larger, idx - pivotIdx - 1)
    }
}
Example 58
Source File: RocksEdgeFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package import com.typesafe.config.Config import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.Label import{SKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.{HBaseType, VertexId} import org.rocksdb.RocksDB import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future}

/**
 * Edge fetcher that reads key-values from RocksDB and decodes them through `serDe` / `io`.
 */
class RocksEdgeFetcher(val graph: S2GraphLike,
                       val config: Config,
                       val db: RocksDB,
                       val vdb: RocksDB,
                       val serDe: StorageSerDe,
                       val io: StorageIO) extends EdgeFetcher {
  import RocksStorage._

  /**
   * Runs every query request and keeps only the edges whose timestamp lies inside the
   * request's duration window `[from, until)`; without a duration every timestamp except
   * Long.MaxValue passes.
   *
   * @param queryRequests requests to execute, one StepResult is produced per request
   * @param prevStepEdges edges of the previous step, looked up per request vertex
   * @return the step results in request order
   */
  override def fetches(queryRequests: Seq[QueryRequest],
                       prevStepEdges: Map[VertexId, Seq[EdgeWithScore]])(implicit ec: ExecutionContext): Future[Seq[StepResult]] = {
    val futures = queryRequests.map { queryRequest =>
      // NOTE(review): the lookup key was lost in the listing; the request vertex's id is the
      // only VertexId reachable from the request here — confirm against the upstream source.
      val parentEdges = prevStepEdges.getOrElse(queryRequest.vertex.id, Nil)
      val edge = graph.elementBuilder.toRequestEdge(queryRequest, parentEdges)
      val rpc = buildRequest(graph, serDe, queryRequest, edge)

      fetchKeyValues(vdb, db, rpc).map { kvs =>
        val duration = queryRequest.queryParam.durationOpt.getOrElse((Long.MinValue, Long.MaxValue))
        val stepResult = io.toEdges(kvs, queryRequest, queryRequest.prevStepScore, false, parentEdges)
        val inWindow = stepResult.edgeWithScores.filter { edgeWithScore =>
          val ts = edgeWithScore.edge.ts
          ts >= duration._1 && ts < duration._2
        }
        stepResult.copy(edgeWithScores = inWindow)
      }
    }

    Future.sequence(futures)
  }

  /**
   * Scans `db` from the first key once per distinct HBase table name and collects every
   * decoded index edge that belongs to one of that table's labels, has direction "out"
   * and is not a degree edge.
   */
  override def fetchEdgesAll()(implicit ec: ExecutionContext) = {
    val edges = new ArrayBuffer[S2EdgeLike]()

    Label.findAll().groupBy(_.hbaseTableName).toSeq.foreach { case (_, labels) =>
      val distinctLabels = labels.toSet
      val iter = db.newIterator()
      try {
        iter.seekToFirst()
        while (iter.isValid) {
          val kv = SKeyValue(table, iter.key(), SKeyValue.EdgeCf, qualifier, iter.value, System.currentTimeMillis())

          serDe.indexEdgeDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(Seq(kv), None)
            .filter(e => distinctLabels(e.innerLabel) && e.getDirection() == "out" && !e.isDegree)
            .foreach { edge => edges += edge }

          // Without this the loop would re-read the first entry forever.
          iter.next()
        }
      } finally {
        iter.close()
      }
    }

    Future.successful(edges)
  }
}
Example 59
Source File: RocksVertexFetcher.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package import com.typesafe.config.Config import org.apache.hadoop.hbase.util.Bytes import org.apache.s2graph.core._ import org.apache.s2graph.core.schema.ServiceColumn import{qualifier, table} import{SKeyValue, StorageIO, StorageSerDe} import org.apache.s2graph.core.types.HBaseType import org.rocksdb.RocksDB import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future}

/**
 * Vertex fetcher that reads key-values from RocksDB and decodes them through `serDe`.
 */
class RocksVertexFetcher(val graph: S2GraphLike,
                         val config: Config,
                         val db: RocksDB,
                         val vdb: RocksDB,
                         val serDe: StorageSerDe,
                         val io: StorageIO) extends VertexFetcher {

  /** Builds the storage request for `vertex` and fetches its raw key-values. */
  private def fetchKeyValues(queryRequest: QueryRequest, vertex: S2VertexLike)(implicit ec: ExecutionContext): Future[Seq[SKeyValue]] = {
    val rpc = RocksStorage.buildRequest(queryRequest, vertex)

    RocksStorage.fetchKeyValues(vdb, db, rpc)
  }

  /**
   * Fetches the vertices named by `vertexQueryParam` and keeps those accepted by its
   * `where` filter. A vertex whose lookup fails contributes nothing to the result.
   */
  override def fetchVertices(vertexQueryParam: VertexQueryParam)(implicit ec: ExecutionContext): Future[Seq[S2VertexLike]] = {
    def fromResult(kvs: Seq[SKeyValue], version: String): Seq[S2VertexLike] = {
      if (kvs.isEmpty) Nil
      else serDe.vertexDeserializer(version).fromKeyValues(kvs, None).toSeq.filter(vertexQueryParam.where.get.filter)
    }

    // NOTE(review): the receiver of this map was lost in the listing; `vertexIds` is assumed
    // to be the id collection of VertexQueryParam — confirm against the upstream source.
    val vertices = vertexQueryParam.vertexIds.map(vId => graph.elementBuilder.newVertex(vId))

    val futures = vertices.map { vertex =>
      val queryParam = QueryParam.Empty
      val q = Query.toQuery(Seq(vertex), Seq(queryParam))
      val queryRequest = QueryRequest(q, stepIdx = -1, vertex, queryParam)

      fetchKeyValues(queryRequest, vertex).map { kvs =>
        fromResult(kvs, vertex.serviceColumn.schemaVersion)
      } recover {
        // Best effort: a failed lookup yields no vertices instead of failing the whole call.
        case scala.util.control.NonFatal(_) => Nil
      }
    }

    Future.sequence(futures).map(_.flatten)
  }

  /**
   * Scans `vdb` from the first key once per distinct HBase table name, groups consecutive
   * rows that share a vertex-id prefix, decodes each group and collects the vertices whose
   * service column belongs to that table.
   */
  override def fetchVerticesAll()(implicit ec: ExecutionContext) = {
    import scala.collection.mutable

    val vertices = new ArrayBuffer[S2VertexLike]()

    ServiceColumn.findAll().groupBy(_.service.hTableName).toSeq.foreach { case (_, columns) =>
      val distinctColumns = columns.toSet
      val iter = vdb.newIterator()
      val buffer = mutable.ListBuffer.empty[SKeyValue]
      var oldVertexIdBytes = Array.empty[Byte]
      // 0 until the first row has been seen, 1 afterwards: the last byte of a row key is
      // excluded from the prefix comparison.
      var minusPos = 0

      // Decodes the rows gathered for the current vertex and keeps the matching vertices.
      def flush(): Unit =
        if (buffer.nonEmpty)
          serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None)
            .filter(v => distinctColumns(v.serviceColumn))
            .foreach { vertex => vertices += vertex }

      try {
        iter.seekToFirst()
        while (iter.isValid) {
          val row = iter.key()
          if (!Bytes.equals(oldVertexIdBytes, 0, oldVertexIdBytes.length - minusPos, row, 0, row.length - 1)) {
            flush()
            oldVertexIdBytes = row
            minusPos = 1
            buffer.clear()
          }
          buffer += SKeyValue(table, row, SKeyValue.VertexCf, qualifier, iter.value(), System.currentTimeMillis())

          // Without this the loop would re-read the first entry forever.
          iter.next()
        }
        flush()
      } finally {
        iter.close()
      }
    }

    Future.successful(vertices)
  }
}
Example 60
Source File: BytesUtilV1.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.core.v1

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit
import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil}
import org.apache.s2graph.counter.models.Counter.ItemType
import org.apache.s2graph.counter.util.Hashes
import scala.collection.mutable.ArrayBuffer

/**
 * Version 1 byte layout for counter row keys and qualifiers.
 *
 * ExactKey: [hash(2b)][policy(4b)][item(variable)]
 * Qualifier: [interval][timestamp(8b)][dimension(variable)]
 */
object BytesUtilV1 extends BytesUtil {
  val BUCKET_BYTE_SIZE = Bytes.SIZEOF_SHORT
  val POLICY_ID_SIZE = Bytes.SIZEOF_INT
  val INTERVAL_SIZE = Bytes.SIZEOF_BYTE
  val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG
  val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE

  /** Row key prefix of a policy: the policy id as bytes. */
  override def getRowKeyPrefix(id: Int): Array[Byte] = {
    Bytes.toBytes(id)
  }

  /** Encodes a key as hash bucket, policy prefix and item key (typed by `key.itemType`). */
  override def toBytes(key: ExactKeyTrait): Array[Byte] = {
    // hash key (2 byte)
    val bucket = Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE)
    val item = key.itemType match {
      case ItemType.INT => Bytes.toBytes(key.itemKey.toInt)
      case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong)
      case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey)
    }
    bucket ++ getRowKeyPrefix(key.policyId) ++ item
  }

  /**
   * Encodes the timed qualifier followed by the dimension string.
   *
   * The dimension goes through `Bytes.toBytes` so that it uses the same charset as
   * `Bytes.toString` in [[toExactQualifier]], independent of the platform default.
   */
  override def toBytes(eq: ExactQualifier): Array[Byte] = {
    toBytes(eq.tq) ++ Bytes.toBytes(eq.dimension)
  }

  /** Encodes the interval name followed by the timestamp. */
  override def toBytes(tq: TimedQualifier): Array[Byte] = {
    // NOTE(review): decoding reads exactly INTERVAL_SIZE bytes for the interval, so this
    // presumably relies on interval names being one byte long — confirm.
    Bytes.toBytes(tq.q.toString) ++ Bytes.toBytes(tq.ts)
  }

  /** Decodes a qualifier laid out in the order: interval, ts, dimension. */
  override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = {
    val tq = toTimedQualifier(bytes)
    val dimension = Bytes.toString(bytes, TIMED_QUALIFIER_SIZE, bytes.length - TIMED_QUALIFIER_SIZE)
    ExactQualifier(tq, dimension)
  }

  /** Decodes the interval and timestamp at the start of `bytes`. */
  override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = {
    val interval = Bytes.toString(bytes, 0, INTERVAL_SIZE)
    val ts = Bytes.toLong(bytes, INTERVAL_SIZE)
    TimedQualifier(IntervalUnit.withName(interval), ts)
  }
}
Example 61
Source File: BytesUtilV2.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.core.v2

import org.apache.hadoop.hbase.util._
import org.apache.s2graph.counter
import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit
import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil}
import org.apache.s2graph.counter.models.Counter.ItemType
import org.apache.s2graph.counter.util.Hashes
import scala.collection.mutable.ArrayBuffer

/**
 * Version 2 byte layout for counter row keys and qualifiers.
 *
 * ExactKey: [hash(1b)][version(1b)][policy(4b)][item(variable)]
 * Qualifiers are written with HBase OrderedBytes.
 */
object BytesUtilV2 extends BytesUtil {
  val BUCKET_BYTE_SIZE = Bytes.SIZEOF_BYTE
  val VERSION_BYTE_SIZE = Bytes.SIZEOF_BYTE
  val POLICY_ID_SIZE = Bytes.SIZEOF_INT
  val INTERVAL_SIZE = Bytes.SIZEOF_BYTE
  val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG
  val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE

  /** Row key prefix of a policy: the version byte followed by the policy id. */
  override def getRowKeyPrefix(id: Int): Array[Byte] = {
    Array(counter.VERSION_2) ++ Bytes.toBytes(id)
  }

  /** Encodes a key as hash byte, row key prefix and item key (typed by `key.itemType`). */
  override def toBytes(key: ExactKeyTrait): Array[Byte] = {
    // hash byte
    val bucket = Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE)
    val item = key.itemType match {
      case ItemType.INT => Bytes.toBytes(key.itemKey.toInt)
      case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong)
      case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey)
    }
    // row key prefix = version + policy id
    bucket ++ getRowKeyPrefix(key.policyId) ++ item
  }

  /** Encodes the timed qualifier followed by the sorted dimension strings. */
  override def toBytes(eq: ExactQualifier): Array[Byte] = {
    // NOTE(review): the receiver of this size computation was lost in the listing;
    // `eq.dimKeyValues` is the key/value collection used just below — confirm.
    // Two extra bytes are reserved per encoded string.
    // NOTE(review): sizes are counted in chars, which presumably assumes single-byte
    // characters in dimension keys and values — confirm.
    val len = eq.dimKeyValues.map { case (k, v) => k.length + 2 + v.length + 2 }.sum
    val pbr = new SimplePositionedMutableByteRange(len)
    for {
      v <- ExactQualifier.makeSortedDimension(eq.dimKeyValues)
    } {
      OrderedBytes.encodeString(pbr, v, Order.ASCENDING)
    }
    toBytes(eq.tq) ++ pbr.getBytes
  }

  /** Encodes the interval name (ascending) followed by the timestamp (descending). */
  override def toBytes(tq: TimedQualifier): Array[Byte] = {
    val pbr = new SimplePositionedMutableByteRange(INTERVAL_SIZE + 2 + TIMESTAMP_SIZE + 1)
    OrderedBytes.encodeString(pbr, tq.q.toString, Order.ASCENDING)
    OrderedBytes.encodeInt64(pbr, tq.ts, Order.DESCENDING)
    pbr.getBytes
  }

  /** Lazily decodes strings from `pbr` until no bytes remain. */
  private def decodeString(pbr: PositionedByteRange): Stream[String] = {
    if (pbr.getRemaining > 0) {
      Stream.cons(OrderedBytes.decodeString(pbr), decodeString(pbr))
    } else {
      Stream.empty
    }
  }

  /** Decodes a timed qualifier followed by the dimension strings. */
  override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = {
    val pbr = new SimplePositionedByteRange(bytes)
    ExactQualifier(toTimedQualifier(pbr), {
      val seqStr = decodeString(pbr).toSeq
      // First half of the decoded strings are keys, second half the matching values.
      val (keys, values) = seqStr.splitAt(seqStr.length / 2)
      // NOTE(review): the result expression was lost in the listing; pairing keys with
      // values into a map is assumed — confirm against the upstream source.
      keys.zip(values).toMap
    })
  }

  /** Decodes the timed qualifier at the start of `bytes`. */
  override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = {
    val pbr = new SimplePositionedByteRange(bytes)
    toTimedQualifier(pbr)
  }

  /** Decodes interval name and timestamp from the current position of `pbr`. */
  def toTimedQualifier(pbr: PositionedByteRange): TimedQualifier = {
    TimedQualifier(IntervalUnit.withName(OrderedBytes.decodeString(pbr)), OrderedBytes.decodeInt64(pbr))
  }
}
Example 62
Source File: AccountStorage.scala From matcher with MIT License | 5 votes |
package com.wavesplatform.dex.db import{File, FileInputStream, FileOutputStream} import java.nio.file.Files import java.util.Base64 import cats.syntax.either._ import{Bytes, Ints} import com.wavesplatform.dex.crypto.Enigma import com.wavesplatform.dex.db.AccountStorage.Settings.EncryptedFile import com.wavesplatform.dex.domain.account.KeyPair import com.wavesplatform.dex.domain.bytes.ByteStr import com.wavesplatform.dex.domain.crypto import net.ceedubs.ficus.readers.ValueReader import scala.collection.mutable.ArrayBuffer

/** Holds the key pair of the account. */
case class AccountStorage(keyPair: KeyPair)

object AccountStorage {

  /** Where the account seed comes from. */
  sealed trait Settings

  object Settings {

    /** Seed given directly in the configuration. */
    case class InMem(seed: ByteStr) extends Settings

    /** Seed stored encrypted in `path`, protected by `password`. */
    case class EncryptedFile(path: File, password: String) extends Settings

    /** Reads [[Settings]] from a config section, selected by its "type" entry. */
    implicit val valueReader: ValueReader[Settings] = ValueReader.relative[Settings] { config =>
      config.getString("type") match {
        case "in-mem" =>
          InMem(Base64.getDecoder.decode(config.getString("in-mem.seed-in-base64")))
        case "encrypted-file" =>
          EncryptedFile(
            path = new File(config.getString("encrypted-file.path")),
            password = config.getString("encrypted-file.password")
          )
        case x =>
          throw new IllegalArgumentException(s"The type of account storage '$x' is unknown. Please update your settings.")
      }
    }
  }

  /**
   * Builds the storage described by `settings`.
   *
   * @return the storage, or a message when the encrypted file does not exist
   */
  def load(settings: Settings): Either[String, AccountStorage] = settings match {
    case Settings.InMem(seed) => Right(AccountStorage(KeyPair(seed)))
    case Settings.EncryptedFile(file, password) =>
      if (file.isFile) {
        val encryptedSeedBytes = readFile(file)
        val key = Enigma.prepareDefaultKey(password)
        val decryptedBytes = Enigma.decrypt(key, encryptedSeedBytes)
        AccountStorage(KeyPair(decryptedBytes)).asRight
      } else s"A file '${file.getAbsolutePath}' doesn't exist".asLeft
  }

  /** Encrypts `seed` with the password of `to` and writes it to its path, creating parent directories. */
  def save(seed: ByteStr, to: EncryptedFile): Unit = {
    Files.createDirectories(to.path.getParentFile.toPath)
    val key = Enigma.prepareDefaultKey(to.password)
    val encryptedSeedBytes = Enigma.encrypt(key, seed.arr)
    writeFile(to.path, encryptedSeedBytes)
  }

  /** Derives a seed from `baseSeed` and `nonce`: secure hash of the nonce bytes followed by the base seed. */
  def getAccountSeed(baseSeed: ByteStr, nonce: Int): ByteStr =
    ByteStr(crypto.secureHash(Bytes.concat(Ints.toByteArray(nonce), baseSeed)))

  /**
   * Reads the whole content of `file`.
   *
   * The listing had lost the actual read call of the hand-written buffer loop; reading the
   * file in one call gives the same result without the loop.
   *
   * @throws java.io.IOException when the file cannot be read
   */
  def readFile(file: File): Array[Byte] =
    Files.readAllBytes(file.toPath)

  /** Replaces the content of `file` with `bytes`. */
  def writeFile(file: File, bytes: Array[Byte]): Unit = {
    val writer = new FileOutputStream(file, false)
    try writer.write(bytes)
    finally writer.close()
  }
}
Example 63
Source File: WordSpliter.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
/*
 * WordSpliter: a configurable stop that segments text with JiebaSegmenter and writes the
 * tokens as a one-column ("words") DataFrame.
 *
 *  - segmenter(str): removes the characters matched by the regex below, runs
 *    jiebaSegmenter.process in SegMode.SEARCH and appends every token's word to tokenARR.
 *  - perform: segments the first string of the first row of `strDF`, turns collected tokens
 *    into Rows and writes the resulting DataFrame to `out`.
 *  - setProperties / getPropertyDescriptor: a single required "path" property.
 *
 * NOTE(review): two expressions were lost in this listing and the code does not compile as
 * shown: the right-hand side of `val strDF =` and the collection that is mapped into
 * `val rows`. Restore them from the upstream source before use.
 * NOTE(review): tokenARR is an instance field that is only ever appended to, so tokens
 * presumably accumulate across repeated perform calls — confirm.
 */
package cn.piflow.bundle.nlp import cn.piflow._ import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import com.huaban.analysis.jieba.JiebaSegmenter.SegMode import com.huaban.analysis.jieba._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer class WordSpliter extends ConfigurableStop { val authorEmail: String = "[email protected]" val description: String = "Word segmentation" val inportList: List[String] = List(Port.AnyPort.toString) val outportList: List[String] = List(Port.DefaultPort.toString) var path:String = _ val jiebaSegmenter = new JiebaSegmenter() var tokenARR:ArrayBuffer[String]=ArrayBuffer() def segmenter(str:String): Unit ={ var strVar = str //delete symbol strVar = strVar.replaceAll( "[\\p{P}+~$`^=|<>~`$^+=|<>¥×+\\s]" , ""); val tokens = jiebaSegmenter.process(strVar,SegMode.SEARCH).asScala for (token: SegToken <- tokens){ tokenARR += token.word } } def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session: SparkSession = pec.get[SparkSession]() //read val strDF = //segmenter segmenter(strDF.head().getString(0)) //write df val rows: List[Row] = => { var arr:Array[String]=Array(each) val row: Row = Row.fromSeq(arr) row }).toList val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows) val schema: StructType = StructType(Array( StructField("words",StringType) )) val df: DataFrame = session.createDataFrame(rowRDD,schema) out.write(df) } def initialize(ctx: ProcessContext): Unit = { } def setProperties(map : Map[String, Any]) = { path = MapUtil.get(map,"path").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val path = new 
PropertyDescriptor().name("path").displayName("path").description("The path of text file").defaultValue("").required(true) descriptor = path :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/nlp/NLP.png") } override def getGroup(): List[String] = { List(StopGroup.Alg_NLPGroup.toString) } }
Example 64
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
/*
 * JsonUtil.ParserJsonDF(df, tag): selects the comma-separated fields named in `tag` from
 * `df`. A tag written as Master_Child marks `Master` as an array field to explode and
 * `Child` as a field to read from each exploded element.
 *
 * Steps visible in the code:
 *  1. split `tag` on ","; tags containing "_" set openArrField (part before "_") and append
 *     the part after "_" to ArrSchame; all other tags are appended to tagNew.
 *  2. openArrField is appended to tagNew and the trailing "," is cut from ArrSchame.
 *  3. when `tag` is non-empty the selected columns replace FinalDF.
 *  4. when an array field and its child names are present, the array column is exploded and
 *     the child fields are selected as "<array field>.<child>".
 *
 * NOTE(review): several expressions were lost in this listing (the mapping that builds
 * `strings`, and the select calls for `df00`, `selSEQ`, `df01` and the final `FinalDF`);
 * the code does not compile as shown. Restore them from the upstream source before use.
 * NOTE(review): `ArrSchame.substring(0, ArrSchame.length-1)` runs unconditionally, so with no
 * "_" tag ArrSchame is empty and the end index is -1 — looks like this throws; confirm.
 * NOTE(review): only the last Master_Child tag's master name is kept in openArrField — confirm
 * that a single array field per call is intended.
 */
package cn.piflow.bundle.util import org.apache.spark.sql.functions.explode import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession} import scala.collection.mutable.ArrayBuffer object JsonUtil extends Serializable{ // The tag you want to parse,If you want to open an array field,you have to write it like this:links_name(MasterField_ChildField) def ParserJsonDF(df:DataFrame,tag:String): DataFrame = { var openArrField:String="" var ArrSchame:String="" var tagARR: Array[String] = tag.split(",") var tagNew:String="" for(tt<-tagARR){ if(tt.indexOf("_")> -1){ // contains "." (sic; the code tests for "_") val openField: Array[String] = tt.split("_") openArrField=openField(0) ArrSchame+=(openField(1)+",") }else{ tagNew+=(tt+",") } } tagNew+=openArrField ArrSchame=ArrSchame.substring(0,ArrSchame.length-1) tagARR = tagNew.split(",") var FinalDF:DataFrame=df // if the user chose fields to return var strings: Seq[Column] =tagNew.split(",") => new Column(p)) if(tag.length>0){ val df00 = : _*) FinalDF=df00 } // if the user chose an array field to open and gave its schema if(openArrField.length>0&&ArrSchame.length>0){ val schames: Array[String] = ArrSchame.split(",") var selARR:ArrayBuffer[String]=ArrayBuffer()// names of the fields after opening, one by one // iterate over the array and wrap the entries in Column objects var coARR:ArrayBuffer[Column]=ArrayBuffer()// used by the select that opens the field val sss = tagNew.split(",")// used by toDF after the field is opened var co: Column =null for(each<-tagARR){ if(each==openArrField){ co = explode(FinalDF(openArrField)) for(x<-schames){ selARR+=(openArrField+"."+x) } }else{ selARR+=each co=FinalDF(each) } coARR+=co } println("###################") selARR.foreach(println(_)) var selSEQ: Seq[Column] = => new Column(q)) var df01: DataFrame = : _*).toDF(sss:_*) FinalDF = : _*) } FinalDF } }
Example 65
Source File: BufferListener.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding

import Binding.{PatchedEvent, ChangedEvent, PatchedListener, ChangedListener}
import com.thoughtworks.binding.Binding.{PatchedEvent, ChangedEvent, PatchedListener, ChangedListener}
import scala.collection.mutable.ArrayBuffer

/**
 * A buffer that records, in arrival order, every event delivered to its [[listener]].
 */
final class BufferListener extends ArrayBuffer[Any] {

  /** Appends one received event to this buffer. */
  private def record(event: Any): Unit = {
    this += event
  }

  /** Listener for both changed and patched events; each event is stored as-is. */
  val listener = new ChangedListener[Seq[Any]] with PatchedListener[Any] {
    override def changed(event: ChangedEvent[Seq[Any]]): Unit = record(event)

    override def patched(event: PatchedEvent[Any]): Unit = record(event)
  }
}
Example 66
Source File: FlatMapRemove.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding.regression

import com.thoughtworks.binding.Binding._
import com.thoughtworks.binding._
import org.scalatest.freespec.AnyFreeSpec
import org.scalatest.matchers.should.Matchers
import scala.collection.mutable.ArrayBuffer

/**
 * Regression test: removing the only element that passed a filter must make the dependent
 * binding report "does not has left" again.
 */
final class FlatMapRemove extends AnyFreeSpec with Matchers {
  "removed source of a flatMap" in {
    val data = Vars.empty[Either[String, String]]

    val left = for {
      s <- data
      if s.isLeft
    } yield s

    val events = ArrayBuffer.empty[String]
    val autoPrint = Binding {
      if (left.length.bind > 0) {
        events += "has left"
      } else {
        events += "does not has left"
      }
    }

    // Nothing has been recorded before the binding is watched.
    assert(events.forall(_ == "does not has left"))
    // The listing had two identical assertions in a row and never activated the binding,
    // so `events` stayed empty and `events.last` below could only throw.
    autoPrint.watch()
    assert(events.forall(_ == "does not has left"))

    data.value += Right("1")
    assert(events.forall(_ == "does not has left"))
    data.value += Right("2")
    assert(events.forall(_ == "does not has left"))
    data.value += Right("3")
    assert(events.forall(_ == "does not has left"))

    data.value(1) = Left("left 2")
    assert(events.last == "has left")

    data.value --= Seq(Left("left 2"))
    assert(events.last == "does not has left")
  }
}
Example 67
Source File: InsertThenClear.scala From Binding.scala with MIT License | 5 votes |
package com.thoughtworks.binding.regression

import com.thoughtworks.binding.Binding._
import com.thoughtworks.binding._
import org.scalatest.freespec.AnyFreeSpec
import org.scalatest.matchers.should.Matchers
import scala.collection.mutable.ArrayBuffer

/**
 * Regression test: a mapped sequence must follow an insertion in the middle of its source
 * and a subsequent clear.
 */
final class InsertThenClear extends AnyFreeSpec with Matchers {
  "insert then clear" in {
    val items = Vars(1 to 10: _*)

    // The mapping expression was lost in the listing; every expected value below is the
    // negation of the matching source element.
    val mapped = items.map(-_)
    // NOTE(review): watching keeps the mapped sequence subscribed to `items`; presumably the
    // upstream test does this too — confirm.
    mapped.watch()
    assert(mapped.get sameElements Seq(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10))

    items.value.insertAll(3, 100 to 103)
    assert(mapped.get sameElements Seq(-1, -2, -3, -100, -101, -102, -103, -4, -5, -6, -7, -8, -9, -10))

    items.value.clear()
    assert(mapped.get sameElements Seq.empty)
  }
}
Example 68
Source File: ProxyMessageHandler.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
/*
 * ProxyMessageHandler: forwards every incoming RiakMessage to the Riak node at `hostAndPort`
 * and returns that node's responses.
 *
 *  - handle: for MSG_CoverageReq the response is parsed as RpbCoverageResp and rebuilt so that
 *    entries whose ip/port equal `hostAndPort` carry a local socket address instead; every
 *    other message code is forwarded unchanged.
 *  - forwardMessage: writes the encoded request to a channel, then keeps reading until
 *    isDoneReceived is true; the channel is closed in `finally`.
 *  - readSocket: decodes messages from an accumulator buffer, growing it with further chunks
 *    while nothing has been decoded yet or bytes remain.
 *  - isDoneReceived: for MSG_IndexReq true once any response has its `done` flag set,
 *    otherwise true as soon as at least one response exists.
 *
 * NOTE(review): several expressions were lost in this listing and the code does not compile
 * as shown: some imports, the collection passed to addAllEntries and the channel in front of
 * `[NetworkChannel]`, the result of the coverage-entry lambda, the right-hand side of
 * `val channel =` in forwardMessage, and presumably the read from `channel` into `in` inside
 * readSocket. Restore them from the upstream source before use.
 */
package com.basho.riak.stub import import java.nio.ByteBuffer import java.nio.channels._ import com.basho.riak.client.core.RiakMessage import com.basho.riak.client.core.util.HostAndPort import import import import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer class ProxyMessageHandler(hostAndPort: HostAndPort) extends RiakMessageHandler { private final val riakAddress = new InetSocketAddress(hostAndPort.getHost, hostAndPort.getPort) override def handle(context: ClientHandler.Context, input: RiakMessage): Iterable[RiakMessage] = input.getCode match { // coverage plan received from real Riak node must be modified to replace real node's host and port with proxy case MSG_CoverageReq => forwardAndTransform(context, input) { output => val resp = RiakKvPB.RpbCoverageResp.parseFrom(output.getData) val modified = RiakKvPB.RpbCoverageResp.newBuilder(resp) .clearEntries() .addAllEntries( { ce => val ceBuilder = RiakKvPB.RpbCoverageEntry.newBuilder(ce) if (ce.getIp.toStringUtf8 == hostAndPort.getHost && ce.getPort == hostAndPort.getPort) { val localAddress =[NetworkChannel] .getLocalAddress.asInstanceOf[InetSocketAddress] ceBuilder.setIp(ByteString.copyFromUtf8(localAddress.getHostString)) ceBuilder.setPort(localAddress.getPort) } }).build() new RiakMessage(output.getCode, modified.toByteArray) } case _ => forwardMessage(context, input) } private def forwardMessage(context: ClientHandler.Context, input: RiakMessage): Iterable[RiakMessage] = { def readRiakResponse(channel: SocketChannel, out: List[RiakMessage] = Nil): Iterable[RiakMessage] = out match { case _ if !isDoneReceived(out, input) => readRiakResponse(channel, out ++ readSocket(channel)) case _ => out } val channel = try { // forward request to real Riak node assert(channel.write(RiakMessageEncoder.encode(input)) > 0) // read response for forwarded request from real Riak node readRiakResponse(channel) } finally { channel.close() } } private def readSocket(channel: SocketChannel): 
Iterable[RiakMessage] = { var accumulator = ByteBuffer.allocateDirect(0) var out = ArrayBuffer[RiakMessage]() while (out.isEmpty || accumulator.hasRemaining) { // try to parse riak message from bytes in accumulator buffer RiakMessageEncoder.decode(accumulator) match { case Some(x) => accumulator = accumulator.slice() out += x case None => // read next chunk of data from channel and add it into accumulator val in = ByteBuffer.allocateDirect(1024) // scalastyle:ignore accumulator = ByteBuffer .allocate(accumulator.rewind().limit() + in.flip().limit()) .put(accumulator) .put(in) accumulator.rewind() in.clear() } } out } private def isDoneReceived(out: Iterable[RiakMessage], input: RiakMessage): Boolean = input.getCode match { case MSG_IndexReq => out.foldLeft[Boolean](false)((a, m) => a || RiakKvPB.RpbIndexResp.parseFrom(m.getData).getDone) case _ => out.nonEmpty } private def forwardAndTransform(context: ClientHandler.Context, input: RiakMessage )(transform: RiakMessage => RiakMessage ): Iterable[RiakMessage] = forwardMessage(context, input).map(transform(_)) override def onRespond(input: RiakMessage, output: Iterable[RiakMessage]): Unit = {} }
Example 69
Source File: QueryBucketKeys.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query

import com.basho.riak.client.core.query.Location
import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{BucketDef, ReadConf}
import scala.collection.mutable.ArrayBuffer

/** Query over an explicit list of keys of one bucket. */
private case class QueryBucketKeys(bucket: BucketDef,
                                   readConf: ReadConf,
                                   riakConnector: RiakConnector,
                                   keys: Iterable[String]
                                  ) extends QuerySubsetOfKeys[String] {

  /**
   * Takes keys from the iterator and turns them into locations of this bucket's namespace
   * until the chunk holds `readConf.fetchSize` entries or the keys run out.
   *
   * @return `false` (this query never reports a continuation) paired with the chunk
   */
  override def locationsByKeys(keys: Iterator[String]): (Boolean, Iterable[Location]) = {
    val limit = readConf.fetchSize
    val chunk = new ArrayBuffer[Location](limit)
    val namespace = bucket.asNamespace()

    // The size is checked only after an element was added, so a first key is always taken.
    var continue = keys.hasNext
    while (continue) {
      chunk += new Location(namespace, keys.next())
      continue = chunk.size < limit && keys.hasNext
    }

    (false, chunk)
  }
}
Example 70
Source File: Query2iKeys.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query

import com.basho.riak.client.core.operations.CoveragePlanOperation.Response.CoverageEntry
import com.basho.riak.client.core.query.Location
import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{BucketDef, ReadConf}
import scala.collection.mutable.ArrayBuffer

/**
 * Secondary-index query over a list of keys: every key is resolved by its own
 * Query2iKeySingleOrRange, whose result pages are drained before the next key is started.
 */
private case class Query2iKeys[K](bucket: BucketDef,
                                  readConf: ReadConf,
                                  riakConnector: RiakConnector,
                                  index: String,
                                  keys: Iterable[K]
                                 ) extends QuerySubsetOfKeys[K] {
  // Query for the key currently being resolved.
  private var query2iKey: Option[Query2iKeySingleOrRange[K]] = None
  // Continuation token of the current query; defined while it has more pages.
  private var tokenNext: Option[Either[String, CoverageEntry]] = None

  // By default there should be an empty Serializable Iterator
  private var _iterator: Iterator[Location] = ArrayBuffer.empty[Location].iterator

  private def chunkIsCollected(chunk: Iterable[Location]) = chunk.size >= readConf.fetchSize

  /** Requests the next page of `query` and stores its continuation token and locations. */
  private def fetchNextPage(query: Query2iKeySingleOrRange[K]): Unit = {
    val r = query.nextLocationChunk(tokenNext)
    tokenNext = r._1
    _iterator = r._2.iterator
  }

  // scalastyle:off cyclomatic.complexity
  /**
   * Collects up to `readConf.fetchSize` locations, continuing from the state left by the
   * previous call.
   *
   * @param keys remaining keys; one is consumed whenever the current query is exhausted
   * @return whether the current query still has a further page, paired with the chunk
   */
  override def locationsByKeys(keys: Iterator[K]): (Boolean, Iterable[Location]) = {
    val dataBuffer = new ArrayBuffer[Location](readConf.fetchSize)

    while ((keys.hasNext || _iterator.hasNext || tokenNext.isDefined) && !chunkIsCollected(dataBuffer)) {
      // Previously gathered results should be returned at first, if any
      _iterator forall (location => {
        dataBuffer += location
        !chunkIsCollected(dataBuffer)
      })

      if (!chunkIsCollected(dataBuffer)) tokenNext match {
        case Some(_) =>
          // Fetch the next results page from the previously executed 2i query, if any
          assert(query2iKey.isDefined)
          fetchNextPage(query2iKey.get)

        case None if keys.hasNext =>
          // query data for the first/next key
          assert(_iterator.isEmpty && tokenNext.isEmpty)
          // The listing had lost this call, leaving `val key = query2iKey = ...`.
          val key = keys.next()
          val query = new Query2iKeySingleOrRange[K](bucket, readConf, riakConnector, index, key)
          query2iKey = Some(query)
          fetchNextPage(query)

        case _ => // There is nothing to do
      }
    }
    tokenNext.isDefined -> dataBuffer
  }
  // scalastyle:on cyclomatic.complexity
}
Example 71
Source File: Partitioner.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import import com.lucidworks.spark.rdd.SolrRDD import com.lucidworks.spark.util.SolrSupport import org.apache.solr.client.solrj.SolrQuery import org.apache.spark.Partition import scala.collection.mutable.ArrayBuffer // Is there a need to override {@code Partitioner.scala} and define our own partition id's object SolrPartitioner { def getShardPartitions(shards: List[SolrShard], query: SolrQuery) : Array[Partition] = {{ case (shard, i) => // Chose any of the replicas as the active shard to query SelectSolrRDDPartition(i, "*", shard, query, SolrRDD.randomReplica(shard))}.toArray } def getSplitPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { var splitPartitions = ArrayBuffer.empty[SelectSolrRDDPartition] var counter = 0 shards.foreach(shard => { val splits = SolrSupport.getShardSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += SelectSolrRDDPartition(counter, "*", shard, split.query, split.replica) counter = counter + 1 }) }) splitPartitions.toArray } // Workaround for SOLR-10490. TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery): Array[Partition] = {{ case (shard, i) => // Chose any of the replicas as the active shard to query ExportHandlerPartition(i, shard, query, SolrRDD.randomReplica(shard), 0, 0)}.toArray } // Workaround for SOLR-10490. 
TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { val splitPartitions = ArrayBuffer.empty[ExportHandlerPartition] var counter = 0 shards.foreach(shard => { // Form a continuous iterator list so that we can pick different replicas for different partitions in round-robin mode val splits = SolrSupport.getExportHandlerSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += ExportHandlerPartition(counter, shard, split.query, split.replica, split.numWorkers, split.workerId) counter = counter+1 }) }) splitPartitions.toArray } } case class SolrShard(shardName: String, replicas: List[SolrReplica]) case class SolrReplica( replicaNumber: Int, replicaName: String, replicaUrl: String, replicaHostName: String, locations: Array[InetAddress]) { def getHostAndPort(): String = {replicaHostName.substring(0, replicaHostName.indexOf('_'))} override def toString(): String = { return s"SolrReplica(${replicaNumber}) ${replicaName}: url=${replicaUrl}, hostName=${replicaHostName}, locations="+locations.mkString(",") } }
Example 72
Source File: GranularBigVector.scala From glint with MIT License | 5 votes |
package glint.models.client.granular import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} import scala.reflect.ClassTag import glint.models.client.BigVector override def push(keys: Array[Long], values: Array[V]) (implicit ec: ExecutionContext): Future[Boolean] = { var i = 0 val ab = new ArrayBuffer[Future[Boolean]](keys.length / maximumMessageSize) while (i < keys.length) { val end = Math.min(keys.length, i + maximumMessageSize) val future = underlying.push(keys.slice(i, end), values.slice(i, end)) ab.append(future) i += maximumMessageSize } Future.sequence(ab.toIterator).transform(x => x.forall(y => y), err => err) } }
Example 73
Source File: GranularBigMatrix.scala From glint with MIT License | 5 votes |
package glint.models.client.granular import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} import scala.reflect.ClassTag import breeze.linalg.Vector import glint.models.client.BigMatrix override def pull(rows: Array[Long], cols: Array[Int])(implicit ec: ExecutionContext): Future[Array[V]] = { if (rows.length <= maximumMessageSize) { underlying.pull(rows, cols) } else { var i = 0 val ab = new ArrayBuffer[Future[Array[V]]](rows.length / maximumMessageSize) while (i < rows.length) { val end = Math.min(rows.length, i + maximumMessageSize) val future = underlying.pull(rows.slice(i, end), cols.slice(i, end)) ab.append(future) i += maximumMessageSize } Future.sequence(ab.toIterator).map { case arrayOfValues => val finalValues = new ArrayBuffer[V](rows.length) arrayOfValues.foreach(x => finalValues.appendAll(x)) finalValues.toArray } } } }
Example 74
Source File: HiveQLProcessBuilder.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager.hive.process import java.nio.file.Paths import com.webank.wedatasphere.linkis.common.conf.Configuration import com.webank.wedatasphere.linkis.enginemanager.conf.EnvConfiguration.{DEFAULT_JAVA_OPTS, JAVA_HOME, engineGCLogPath} import com.webank.wedatasphere.linkis.enginemanager.hive.conf.HiveEngineConfiguration import com.webank.wedatasphere.linkis.enginemanager.impl.UserEngineResource import com.webank.wedatasphere.linkis.enginemanager.process.JavaProcessEngineBuilder import com.webank.wedatasphere.linkis.enginemanager.{AbstractEngineCreator, EngineResource} import com.webank.wedatasphere.linkis.protocol.engine.RequestEngine import org.apache.commons.lang.StringUtils import org.slf4j.LoggerFactory import scala.collection.mutable.ArrayBuffer override protected def classpathCheck(jarOrFiles: Array[String]): Unit = { for(jarOrFile <- jarOrFiles){ checkJarOrFile(jarOrFile) } } //todo Check the jar of the classpath(对classpath的jar进行检查) private def checkJarOrFile(jarOrFile:String):Unit = { } override def build(engineRequest: EngineResource, request: RequestEngine): Unit = { this.request = request userEngineResource = engineRequest.asInstanceOf[UserEngineResource] val javaHome = JAVA_HOME.getValue( if(StringUtils.isEmpty(javaHome)) { warn("We cannot find the java home, use java to run storage repl web server.") commandLine += "java" } else { commandLine += Paths.get(javaHome, "bin/java").toAbsolutePath.toFile.getAbsolutePath } if ({ val settingClientMemory = if (!settingClientMemory.toLowerCase().endsWith("g")){, settingClientMemory + "g") } //,"g") } val clientMemory = HiveEngineConfiguration.HIVE_CLIENT_MEMORY.getValue( if (clientMemory.toLowerCase().endsWith("g")){ commandLine += ("-Xmx" + clientMemory.toLowerCase()) commandLine += ("-Xms" + clientMemory.toLowerCase()) }else{ commandLine += ("-Xmx" + clientMemory + "g") commandLine += ("-Xms" + clientMemory + "g") } val javaOPTS = getExtractJavaOpts val alias = 
getAlias(request) if(StringUtils.isNotEmpty(DEFAULT_JAVA_OPTS.getValue)) DEFAULT_JAVA_OPTS.getValue.format(engineGCLogPath(port, userEngineResource.getUser, alias)).split("\\s+").foreach(commandLine += _) if(StringUtils.isNotEmpty(javaOPTS)) javaOPTS.split("\\s+").foreach(commandLine += _) //engineLogJavaOpts(port, alias).trim.split(" ").foreach(commandLine += _) if(Configuration.IS_TEST_MODE.getValue) { val port = AbstractEngineCreator.getNewPort info(s"$toString open debug mode with port $port.") commandLine += s"-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$port" } var classpath = getClasspath(, getExtractClasspath) classpath = classpath ++"jars").split(",") classpathCheck(classpath) commandLine += "-Djava.library.path=/appcom/Install/hadoop/lib/native" commandLine += "-cp" commandLine += classpath.mkString(":") commandLine += "com.webank.wedatasphere.linkis.engine.DataWorkCloudEngineApplication" } // override def build(engineRequest: EngineResource, request: RequestEngine): Unit = { // import scala.collection.JavaConversions._ // foreach {case (k, v) =>"request key is $k, value is $v")} // this.request = request //, request) // // } override protected val addApacheConfigPath: Boolean = true }
Example 75
Source File: JDBCSQLCodeParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.entrance.executer

import com.webank.wedatasphere.linkis.entrance.conf.JDBCConfiguration
import org.apache.commons.lang.StringUtils

import scala.collection.mutable.ArrayBuffer

/**
  * Splits a JDBC script into individual statements and appends the default row limit
  * to every `select` statement that does not already end with a `limit <n>` clause.
  */
object JDBCSQLCodeParser {

  val separator = ";"
  val defaultLimit: Int = JDBCConfiguration.ENGINE_DEFAULT_LIMIT.getValue

  // Trailing "limit <n>" clause, optionally followed by a semicolon. Scala regex patterns
  // are anchored when used in a pattern match, so the whole tail has to have this shape.
  private val LimitClause = "limit\\s+(\\d+)\\s*;?".r

  /**
    * Splits `code` on [[separator]] and returns the non-blank statements in order.
    * Select statements without a limit clause get `" limit " + defaultLimit` appended.
    *
    * @param code the script to parse; `null` or blank input yields an empty array
    * @return the statements to execute
    * @throws IllegalArgumentException if a select statement asks for more rows than [[defaultLimit]]
    */
  def parse(code: String): Array[String] = {
    val statements: Array[String] =
      if (StringUtils.contains(code, separator)) StringUtils.split(code, separator)
      else Array(code)
    statements.collect {
      case s if !StringUtils.isBlank(s) =>
        if (isSelectCmdNoLimit(s)) s + " limit " + defaultLimit else s
    }
  }

  /**
    * Tells whether `cmd` is a `select` statement that lacks a trailing `limit <n>` clause.
    *
    * The keyword checks are case-insensitive and ignore surrounding whitespace. Only the
    * text starting at the last occurrence of "limit" is inspected.
    *
    * @param cmd a single SQL statement
    * @return `true` if a limit should be appended; `false` for non-select statements and for
    *         select statements that already carry an acceptable limit
    * @throws IllegalArgumentException if the statement's limit exceeds [[defaultLimit]]
    */
  def isSelectCmdNoLimit(cmd: String): Boolean = {
    val statement = cmd.trim
    // Split the trimmed text: splitting the raw input yields an empty first token whenever
    // the statement starts with whitespace, which made such selects look like non-selects.
    if (!statement.split("\\s+")(0).equalsIgnoreCase("select")) {
      false
    } else {
      // Locale.ROOT keeps the lowering independent of the JVM default locale.
      val lowered = statement.toLowerCase(java.util.Locale.ROOT)
      val limitStart = lowered.lastIndexOf("limit")
      if (limitStart < 0) {
        true
      } else {
        lowered.substring(limitStart).trim match {
          case LimitClause(digits) =>
            // BigInt avoids a NumberFormatException on literals that do not fit in an Int.
            if (BigInt(digits) > defaultLimit) {
              throw new IllegalArgumentException("We at most allowed to limit " + defaultLimit + ", but your SQL has been over the max rows.")
            }
            false
          case _ =>
            true
        }
      }
    }
  }
}
Example 76
Source File: PythonEngineExecutor.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.executors import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.PythonSession import com.webank.wedatasphere.linkis.engine.exception.EngineException import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineExecutorContext} import import com.webank.wedatasphere.linkis.protocol.engine.JobProgressInfo import com.webank.wedatasphere.linkis.resourcemanager.{LoadInstanceResource, Resource} import com.webank.wedatasphere.linkis.rpc.Sender import com.webank.wedatasphere.linkis.scheduler.executer._ import import scala.collection.mutable.ArrayBuffer class PythonEngineExecutor(outputPrintLimit: Int) extends EngineExecutor(outputPrintLimit, false) with SingleTaskOperateSupport with SingleTaskInfoSupport with Logging { override def getName: String = Sender.getThisServiceInstance.getInstance private val lineOutputStream = new RsOutputStream private[executors] var engineExecutorContext: EngineExecutorContext = _ override def getActualUsedResources: Resource = { new LoadInstanceResource(Runtime.getRuntime.totalMemory() - Runtime.getRuntime.freeMemory(), 2, 1) } private val pySession = new PythonSession override protected def executeLine(engineExecutorContext: EngineExecutorContext, code: String): ExecuteResponse = { if(engineExecutorContext != this.engineExecutorContext){ this.engineExecutorContext = engineExecutorContext pySession.setEngineExecutorContext(engineExecutorContext) //lineOutputStream.reset(engineExecutorContext) info("Python executor reset new engineExecutorContext!") } engineExecutorContext.appendStdout(s"$getName >> ${code.trim}") pySession.execute(code) //lineOutputStream.flush() SuccessExecuteResponse() } override protected def executeCompletely(engineExecutorContext: EngineExecutorContext, code: String, completedLine: String): ExecuteResponse = { val newcode = completedLine + code info("newcode is " + newcode) executeLine(engineExecutorContext, 
newcode) } override def kill(): Boolean = true override def pause(): Boolean = true override def resume(): Boolean = true override def progress(): Float = { if (this.engineExecutorContext != null){ this.engineExecutorContext.getCurrentParagraph / this.engineExecutorContext.getTotalParagraph.asInstanceOf[Float] }else 0.0f } override def getProgressInfo: Array[JobProgressInfo] = { val jobProgressInfos = new ArrayBuffer[JobProgressInfo]() jobProgressInfos.toArray Array.empty } override def log(): String = "" override def close(): Unit = { IOUtils.closeQuietly(lineOutputStream) var isKill:Boolean = false try { pySession.close isKill = true; } catch { case e: Throwable => throw new EngineException(60004, "Engine shutdown exception(引擎关闭异常)") } } }
Example 77
Source File: SparkPostExecutionHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension

import com.webank.wedatasphere.linkis.common.utils.Logging
import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext
import com.webank.wedatasphere.linkis.scheduler.executer.ExecuteResponse

import scala.collection.mutable.ArrayBuffer

/** Callback invoked after a piece of code has been executed. */
trait SparkPostExecutionHook {

  /** Name used when the hook is logged at registration time. */
  def hookName: String

  /**
    * Called with the execution context, the response produced by the execution,
    * and the code that was run.
    */
  def callPostExecutionHook(engineExecutorContext: EngineExecutorContext,
                            executeResponse: ExecuteResponse,
                            code: String): Unit
}

/** Registry holding every [[SparkPostExecutionHook]] registered so far, in registration order. */
object SparkPostExecutionHook extends Logging {

  private val registered = new ArrayBuffer[SparkPostExecutionHook]()

  /** Logs the registration and adds the hook to the end of the registry. */
  def register(postExecutionHook: SparkPostExecutionHook): Unit = {
    info(s"Get a postExecutionHook of ${postExecutionHook.hookName} register")
    registered += postExecutionHook
  }

  /** Returns a snapshot array of the hooks registered so far. */
  def getSparkPostExecutionHooks(): Array[SparkPostExecutionHook] = registered.toArray
}
Example 78
Source File: SparkPreExecutionHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension

import com.webank.wedatasphere.linkis.common.utils.Logging
import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext

import scala.collection.mutable.ArrayBuffer

/** Callback invoked before a piece of code is executed. */
trait SparkPreExecutionHook {

  /** Name used when the hook is logged at registration time. */
  def hookName: String

  /**
    * Called with the execution context and the code about to run; returns a String.
    * NOTE(review): presumably the returned value is the (possibly rewritten) code to
    * execute -- confirm against the callers.
    */
  def callPreExecutionHook(engineExecutorContext: EngineExecutorContext, code: String): String
}

/** Registry holding every [[SparkPreExecutionHook]] registered so far, in registration order. */
object SparkPreExecutionHook extends Logging {

  private val registered = new ArrayBuffer[SparkPreExecutionHook]()

  /** Logs the registration and adds the hook to the end of the registry. */
  def register(preExecutionHook: SparkPreExecutionHook): Unit = {
    info(s"Get a preExecutionHook of ${preExecutionHook.hookName} register")
    registered += preExecutionHook
  }

  /** Returns a snapshot array of the hooks registered so far. */
  def getSparkPreExecutionHooks(): Array[SparkPreExecutionHook] = registered.toArray
}
Example 79
Source File: SparkSqlExtension.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.extension

import java.util.concurrent._

import com.webank.wedatasphere.linkis.common.conf.CommonVars
import com.webank.wedatasphere.linkis.common.utils.{Logging, Utils}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.duration._

/**
  * Base class for extensions that run after a SQL statement has been executed.
  *
  * Subclasses implement [[extensionRule]]; [[afterExecutingSQL]] runs it on a private
  * daemon thread pool and waits for completion for at most the given timeout.
  */
abstract class SparkSqlExtension extends Logging {

  private val maxPoolSize = CommonVars("wds.linkis.dws.ujes.spark.extension.max.pool", 5).getValue

  // A ThreadPoolExecutor only grows beyond its core size when the work queue rejects a task,
  // which an unbounded LinkedBlockingQueue never does. With a smaller core size the configured
  // maximum would never be reached, so the core size is set to the maximum and idle core
  // threads are allowed to time out instead.
  private val executor = {
    val daemonThreads = new ThreadFactory {
      override def newThread(r: Runnable): Thread = {
        val thread = new Thread(r)
        thread.setDaemon(true)
        thread
      }
    }
    val pool = new ThreadPoolExecutor(maxPoolSize, maxPoolSize, 2, TimeUnit.SECONDS,
      new LinkedBlockingQueue[Runnable](), daemonThreads)
    pool.allowCoreThreadTimeOut(true)
    pool
  }

  /**
    * Submits [[extensionRule]] to the pool and waits for it via `Utils.waitUntil`.
    * Anything thrown while submitting or waiting is logged and not propagated.
    *
    * @param sqlContext   context the statement ran in
    * @param command      the SQL text
    * @param dataFrame    result of the statement; its `queryExecution` is handed to the rule
    * @param timeout      maximum wait, in milliseconds
    * @param sqlStartTime start time of the statement, passed through unchanged
    */
  final def afterExecutingSQL(sqlContext: SQLContext, command: String, dataFrame: DataFrame, timeout: Long, sqlStartTime: Long): Unit = {
    try {
      val task = new Runnable {
        override def run(): Unit = extensionRule(sqlContext, command, dataFrame.queryExecution, sqlStartTime)
      }
      val future = executor.submit(task)
      Utils.waitUntil(future.isDone, timeout.milliseconds)
    } catch {
      // Best-effort hook: a failing extension must not fail the SQL execution itself.
      case e: Throwable => info("Failed to execute SparkSqlExtension: ", e)
    }
  }

  /** The extension logic; executed on a pool thread by [[afterExecutingSQL]]. */
  protected def extensionRule(sqlContext: SQLContext, command: String, queryExecution: QueryExecution, sqlStartTime: Long): Unit
}

/** Registry of the [[SparkSqlExtension]] instances registered so far, in registration order. */
object SparkSqlExtension extends Logging {

  private val extensions = ArrayBuffer[SparkSqlExtension]()

  /** Logs the registration and appends the extension to the registry. */
  def register(sqlExtension: SparkSqlExtension): Unit = {
    info("Get a sqlExtension register")
    extensions.append(sqlExtension)
  }

  /** Returns a snapshot array of the registered extensions. */
  def getSparkSqlExtensions(): Array[SparkSqlExtension] = {
    extensions.toArray
  }
}
Example 80
Source File: CSTableParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.cs import java.util.regex.Pattern import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.cs.client.service.CSTableService import com.webank.wedatasphere.linkis.cs.common.entity.metadata.CSTable import com.webank.wedatasphere.linkis.cs.common.utils.CSCommonUtils import com.webank.wedatasphere.linkis.engine.exception.ExecuteError import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import org.apache.commons.lang.StringUtils import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.csv.DolphinToSpark import scala.collection.mutable.ArrayBuffer def getCSTable(csTempTable:String, contextIDValueStr: String, nodeNameStr: String):CSTable = { CSTableService.getInstance().getUpstreamSuitableTable(contextIDValueStr, nodeNameStr, csTempTable) } def registerTempTable(csTable: CSTable):Unit = { val spark = SparkSession.builder().enableHiveSupport().getOrCreate() info(s"Start to create tempView to sparkSession viewName(${csTable.getName}) location(${csTable.getLocation})") DolphinToSpark.createTempView(spark, csTable.getName, csTable.getLocation, true) info(s"Finished to create tempView to sparkSession viewName(${csTable.getName}) location(${csTable.getLocation})") } }
Example 81
Source File: LogContainer.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.spark.common

import scala.collection.Iterable
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

/**
  * Fixed-size ring buffer of log lines.
  *
  * `flag` is the index of the oldest unread line and `tail` the index the next line is
  * written to. One slot is always left unused, so at most `logSize - 1` lines are
  * retained; when the buffer is full the oldest line is dropped. Every operation locks
  * the backing array, so readers and writers always see a consistent `flag`/`tail` pair.
  *
  * @param logSize number of slots in the backing array
  */
class LogContainer(val logSize: Int) {

  private final val logs = new Array[String](logSize)
  private var flag, tail = 0

  /** Appends one line, discarding the oldest unread line if the buffer is full. */
  def putLog(log: String): Unit = logs.synchronized {
    val next = (tail + 1) % logSize
    if (next == flag) {
      // Buffer full: advance the read position past the line about to be overwritten.
      flag = (flag + 1) % logSize
    }
    logs(tail) = log
    tail = next
  }

  /** Appends every line of `logs` in order, as one atomic step. */
  def putLogs(logs: Iterable[String]): Unit = this.logs.synchronized {
    // The parameter shadows the field, hence the explicit `this.logs` for the lock.
    logs.foreach(putLog)
  }

  /** Discards all unread lines. */
  def reset(): Unit = logs.synchronized {
    flag = 0
    tail = 0
  }

  /** Returns the unread lines, oldest first, and marks them as read. */
  def getLogs: List[String] = logs.synchronized {
    if (flag == tail) {
      List.empty[String]
    } else {
      // When the write position has wrapped around, read up to tail + logSize and
      // map the indices back into the array with a modulo.
      val end = if (flag > tail) tail + logSize else tail
      val unread = (flag until end).map(index => logs(index % logSize)).toList
      flag = tail
      unread
    }
  }

  /** Number of unread lines currently held. */
  def size: Int = logs.synchronized {
    if (flag == tail) 0
    else if (flag > tail) tail + logSize - flag
    else tail - flag
  }

  /** Same as [[getLogs]], exposed as a Java list. */
  def getLogList: java.util.List[String] = getLogs.asJava
}
Example 82
Source File: SparkConfiguration.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager.configuration

import com.webank.wedatasphere.linkis.common.conf.{CommonVars, Configuration}
import com.webank.wedatasphere.linkis.common.utils.{ClassUtils, Logging}
import com.webank.wedatasphere.linkis.engine.factory.SparkEngineExecutorFactory
import com.webank.wedatasphere.linkis.enginemanager.AbstractEngineCreator

import scala.collection.mutable.ArrayBuffer

/** Configuration entries (key plus default value) used by the Spark engine manager. */
object SparkConfiguration extends Logging {

  val SPARK_MAX_PARALLELISM_USERS = CommonVars[Int]("wds.linkis.engine.spark.user.parallelism", 100)
  val SPARK_USER_MAX_WAITING_SIZE = CommonVars[Int]("wds.linkis.engine.spark.user.waiting.max", 100)
  val SPARK_SESSION_HOOK = CommonVars[String]("wds.linkis.engine.spark.session.hook", "")
  val SPARK_LANGUAGE_REPL_INIT_TIME = CommonVars[String]("wds.linkis.engine.spark.language-repl.init.time", new String("30s"))
  val SPARK_ALLOW_REQUEST_ALL_YARN_MEMORY = CommonVars[String]("wds.linkis.engine.spark.allow.all-memory.when.queue", new String("60g"))
  val SPARK_ALLOW_REQUEST_ALL_YARN_CORES = CommonVars[Int]("wds.linkis.engine.spark.allow.all-cores.when.queue", 30)
  val SPARK_USER_MAX_ALLOCATE_SESSIONS = CommonVars[Int]("wds.linkis.engine.spark.user.sessions.max", 5)
  val SPARK_USER_MAX_ALLOCATE_YARN_MEMORY = CommonVars[String]("wds.linkis.engine.spark.user.yarn.memory.max", new String("100g"))
  val SPARK_USER_MAX_ALLOCATE_YARN_CORES = CommonVars[Int]("wds.linkis.engine.spark.user.cores.max", 50)
  val SPARK_USER_MAX_ALLOCATE_DRIVER_MEMORY = CommonVars[String]("wds.linkis.engine.spark.user.driver.memory.max", new String("15g"))
  // Shares the entry (key and default) of SPARK_USER_MAX_ALLOCATE_SESSIONS.
  val SPARK_USER_MAX_ALLOCATE_DRIVER_CORES = SPARK_USER_MAX_ALLOCATE_SESSIONS
  val SPARK_USER_MAX_RESOURCE_IN_QUEUE = CommonVars[Float]("wds.linkis.engine.spark.user.queue.resources.max", 0.6f)
  val SPARK_DANGER_QUEUE_USED_CAPACITY = CommonVars[Float]("wds.linkis.engine.spark.danger.queue.used", 0.2f)
  val SPARK_DANGER_QUEUE_USER_ALLOCATE_SESSION = CommonVars[Int]("wds.linkis.engine.spark.danger.user.sessions.max", 2)
  val SPARK_WARN_QUEUE_USED_CAPACITY = CommonVars[Float]("wds.linkis.engine.spark.warning.queue.used", 0.5f)
  val SPARK_WARN_QUEUE_USER_ALLOCATE_SESSION = CommonVars[Int]("wds.linkis.engine.spark.warning.user.sessions.max", 3)

  val PROXY_USER = CommonVars[String]("spark.proxy.user", "${UM}")
  val SPARK_CLIENT_MODE = "client"
  val SPARK_CLUSTER_MODE = "cluster"
  val SPARK_DEPLOY_MODE = CommonVars[String]("spark.submit.deployMode", SPARK_CLIENT_MODE)
  val SPARK_APPLICATION_JARS = CommonVars[String]("spark.application.jars", "", "User-defined jars, separated by English, must be uploaded to HDFS first, and must be full path to HDFS.(用户自定义jar包,多个以英文,隔开,必须先上传到HDFS,且需为HDFS全路径。)")

  val SPARK_EXTRA_JARS = CommonVars[String]("spark.jars", "", "Additional jar package, Driver and Executor take effect(额外的jar包,Driver和Executor生效)")

  val MAPRED_OUTPUT_COMPRESS = CommonVars[String]("mapred.output.compress", "true", "Whether the map output is compressed(map输出结果是否压缩)")
  val MAPRED_OUTPUT_COMPRESSION_CODEC = CommonVars[String]("mapred.output.compression.codec", "", "Map output compression method(map输出结果压缩方式)")
  val SPARK_MASTER = CommonVars[String]("spark.master", "yarn", "Default master(默认master)")
  val SPARK_OUTPUTDIR = CommonVars[String]("spark.outputDir", "/home/georgeqiao", "Default output path(默认输出路径)")

  val DWC_SPARK_USEHIVECONTEXT = CommonVars[Boolean]("wds.linkis.spark.useHiveContext", true)
  val ENGINE_JAR = CommonVars[String]("wds.linkis.enginemanager.core.jar", ClassUtils.jarOfClass(classOf[SparkEngineExecutorFactory]).head)
  val SPARK_DRIVER_CLASSPATH = CommonVars[String]("wds.linkis.spark.driver.conf.mainjar", "")
  // The default is computed once, when this object is initialised.
  val SPARK_DRIVER_EXTRA_JAVA_OPTIONS = CommonVars[String]("spark.driver.extraJavaOptions", "\" " + getJavaRemotePort + "\"")
  // "%s" is a format placeholder left in the -Xloggc option.
  val DEFAULT_JAVA_OPTS = CommonVars[String]("wds.linkis.engine.javaOpts.default", "-server -XX:+UseG1GC -XX:MaxPermSize=250m -XX:PermSize=128m " +
    "-Xloggc:%s -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps")
  val SPARK_ML_BUCKET_FIELDS = CommonVars[String]("", "age[0,18,30,60,100]")

  val SPARK_SUBMIT_CMD = CommonVars[String]("wds.linkis.engine.spark.submit.cmd", "spark-submit")

  // Ports already handed out by getAvailablePort; created lazily on first use.
  private var Ports: ArrayBuffer[Int] = _

  /**
    * Returns the JDWP agent option for a randomly chosen debug port when test mode is
    * enabled, or an empty string otherwise.
    */
  def getJavaRemotePort: String = {
    if (Configuration.IS_TEST_MODE.getValue) {
      val r = new scala.util.Random()
      // nextInt's bound is exclusive, so this yields 1024..65535; the previous bound of
      // (65536 - 1024) + 1 could produce 65536, which is not a valid TCP port.
      val port = 1024 + r.nextInt(65536 - 1024)
      info(s"open debug mode with port $port.")
      s"-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$port"
    } else {
      ""
    }
  }

  /**
    * Asks `AbstractEngineCreator.getNewPort` for ports until it returns one that has not
    * been handed out before, records it and returns it.
    */
  private def getAvailablePort: Int = synchronized {
    var port = AbstractEngineCreator.getNewPort
    info("Get new port " + port)
    if (Ports == null) {
      info("Get inInitPorts is null ")
      Ports = ArrayBuffer(0, 1)
      info("Current ports is " + Ports.toList.toString())
    }
    while (Ports.contains(port)) {
      port = AbstractEngineCreator.getNewPort
    }
    Ports += port
    port
  }
}
Example 83
Source File: CSResourceParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.cs import java.util import java.util.regex.Pattern import com.webank.wedatasphere.linkis.cs.client.service.CSResourceService import com.webank.wedatasphere.linkis.engine.PropertiesExecuteRequest import org.apache.commons.lang.StringUtils import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer class CSResourceParser { private val pb = Pattern.compile("cs://[^\\s\"]+[$\\s]{0,1}", Pattern.CASE_INSENSITIVE) private val PREFIX = "cs://" private def getPreFixResourceNames(code: String): Array[String] = { val bmlResourceNames = new ArrayBuffer[String]() val mb = pb.matcher(code) while (mb.find) bmlResourceNames.append( bmlResourceNames.toArray } def parse(executeRequest: PropertiesExecuteRequest, code: String, contextIDValueStr: String, nodeNameStr: String): String = { //TODO getBMLResource peaceWong val bmlResourceList = CSResourceService.getInstance().getUpstreamBMLResource(contextIDValueStr, nodeNameStr) val parsedResources = new util.ArrayList[util.Map[String, Object]]() val preFixResourceNames = getPreFixResourceNames(code) val preFixNames = new ArrayBuffer[String]() val parsedNames = new ArrayBuffer[String]() preFixResourceNames.foreach { preFixResourceName => val resourceName = preFixResourceName.replace(PREFIX, "").trim val bmlResourceOption = bmlResourceList.find(_.getDownloadedFileName.equals(resourceName)) if (bmlResourceOption.isDefined) { val bmlResource = bmlResourceOption.get val map = new util.HashMap[String, Object]() map.put("resourceId", bmlResource.getResourceId) map.put("version", bmlResource.getVersion) map.put("fileName", resourceName) parsedResources.add(map) preFixNames.append(preFixResourceName) parsedNames.append(resourceName) } }"resources", parsedResources) StringUtils.replaceEach(code, preFixNames.toArray, parsedNames.toArray) } }
Example 84
Source File: RsOutputStream.scala From Linkis with Apache License 2.0 | 5 votes |
package import import import{MetaData, Record} import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import import scala.collection.mutable.ArrayBuffer class RsOutputStream extends OutputStream with Logging{ private val line = ArrayBuffer[Byte]() private var isReady = false private var writer: ResultSetWriter[_ <: MetaData, _ <: Record] = _ override def write(b: Int) = if(isReady) synchronized { if(writer != null) { if (b == '\n') { val outStr = new String(line.toArray,"UTF-8") writer.addRecord(new LineRecord(outStr)) //info("output line:" + outStr) line.clear() } else line += b.toByte }else{ warn("writer is null") } } def reset(engineExecutorContext: EngineExecutorContext) = { writer = engineExecutorContext.createDefaultResultSetWriter() writer.addMetaData(null) } def ready() = isReady = true override def flush(): Unit = if(writer != null && line.nonEmpty) { val outStr = new String(line.toArray,"UTF-8") writer.addRecord(new LineRecord(outStr)) //info("flush line:" + outStr) line.clear() } override def toString = if(writer != null) writer.toString() else null override def close() = if(writer != null) { flush() writer.close() writer = null } }
Example 85
Source File: CodeGeneratorEngineHook.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.execute.hook import import com.webank.wedatasphere.linkis.common.utils.Logging import com.webank.wedatasphere.linkis.engine.execute.{EngineExecutor, EngineHook} import com.webank.wedatasphere.linkis.scheduler.executer.{ExecuteRequest, RunTypeExecuteRequest} import com.webank.wedatasphere.linkis.server.JMap import import org.apache.commons.lang.StringUtils import scala.collection.mutable.ArrayBuffer @Deprecated //changed to UdfLoadEngineHook abstract class CodeGeneratorEngineHook extends EngineHook with Logging{ self => val udfPathProp = "udf.paths" protected var creator: String = _ protected var user: String = _ protected var initSpecialCode: String = _ protected val runType: String protected def acceptCodeType(line: String): Boolean protected def generateCode(): Array[String] = { val codeBuffer = new ArrayBuffer[String] val statementBuffer = new ArrayBuffer[String] var accept = true initSpecialCode.split("\n").foreach{ case "" => case l if l.startsWith("%") => if(acceptCodeType(l)){ accept = true codeBuffer.append(statementBuffer.mkString("\n")) statementBuffer.clear() }else{ accept = false } case l if accept => statementBuffer.append(l) case _ => } if(statementBuffer.nonEmpty) codeBuffer.append(statementBuffer.mkString("\n")) codeBuffer.toArray } override def beforeCreateEngine(params: JMap[String, String]): JMap[String, String] = { creator = params.get("creator") user = params.get("user") initSpecialCode = StringUtils.split(params.get(udfPathProp), ",").map(readFile).mkString("\n") params } override def afterCreatedEngine(executor: EngineExecutor): Unit = { generateCode().foreach { case "" => case c: String => info("Submit udf registration to engine, code: " + c) executor.execute(new ExecuteRequest with RunTypeExecuteRequest{ override val code: String = c override val runType: String = self.runType }) info("executed code: " + c) } } protected def readFile(path: String): String = { info("read file: " + path) val file 
= new File(path) if(file.exists()){ FileUtils.readFileToString(file) } else { info("udf file: [" + path + "] doesn't exist, ignore it.") "" } } } @Deprecated class SqlCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "sql" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%sql") } } @Deprecated class PythonCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "python" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%python") } } @Deprecated class ScalaCodeGeneratorEngineHook extends CodeGeneratorEngineHook{ override val runType = "scala" override protected def acceptCodeType(line: String): Boolean = { line.startsWith("%scala") } }
Example 86
Source File: AbstractEngineCreator.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.enginemanager
// NOTE(review): the two bare `import` keywords below lost their targets during extraction
// (the code uses ServerSocket and IOUtils, which are not imported here) — restore from upstream.
import
import com.webank.wedatasphere.linkis.common.conf.DWCArgumentsParser
import com.webank.wedatasphere.linkis.common.utils.Utils
import com.webank.wedatasphere.linkis.enginemanager.conf.EngineManagerConfiguration
import com.webank.wedatasphere.linkis.enginemanager.exception.EngineManagerErrorException
import com.webank.wedatasphere.linkis.enginemanager.impl.UserTimeoutEngineResource
import com.webank.wedatasphere.linkis.enginemanager.process.{CommonProcessEngine, ProcessEngine, ProcessEngineBuilder}
import com.webank.wedatasphere.linkis.protocol.engine.{EngineCallback, RequestEngine}
import com.webank.wedatasphere.linkis.rpc.Sender
import com.webank.wedatasphere.linkis.server.{JMap, toScalaMap}
import
import scala.collection.mutable.ArrayBuffer

/**
 * Base engine creator: reserves a free port, builds the spring/dwc start-up
 * arguments and wraps the process builder in a [[CommonProcessEngine]].
 */
abstract class AbstractEngineCreator extends EngineCreator {
  // Ports handed out by getAvailablePort and not yet released through removePort.
  private val inInitPorts = ArrayBuffer[Int]()

  /** Picks a new port that is not already in `inInitPorts`, records it and returns it. */
  private def getAvailablePort: Int = synchronized {
    var port = AbstractEngineCreator.getNewPort
    while(inInitPorts.contains(port)) port = AbstractEngineCreator.getNewPort
    inInitPorts += port
    port
  }

  /**
   * Releases a port previously reserved by this creator.
   * NOTE(review): unlike getAvailablePort this is not `synchronized`, although both
   * mutate the same ArrayBuffer — confirm whether callers serialize access.
   */
  def removePort(port: Int): Unit = inInitPorts -= port

  /** Supplies the builder used to launch the engine process. */
  protected def createProcessEngineBuilder(): ProcessEngineBuilder

  // NOTE(review): the body below is truncated by extraction (unbalanced parentheses).
  // Presumably it copies the request properties whose key starts with "spring." into
  // springConf with that 7-character prefix removed — confirm against upstream.
  protected def getExtractSpringConfigs(requestEngine: RequestEngine): JMap[String, String] = {
    val springConf = new JMap[String, String]"spring.")).foreach(key => springConf.put(key.substring(7),
    springConf
  }

  /**
   * Wraps the builder in a CommonProcessEngine; the resource's timeout is passed
   * along when the resource is a UserTimeoutEngineResource.
   */
  protected def createEngine(processEngineBuilder:ProcessEngineBuilder,parser:DWCArgumentsParser):ProcessEngine={
    processEngineBuilder.getEngineResource match {
      case timeout: UserTimeoutEngineResource =>
        new CommonProcessEngine(processEngineBuilder, parser, timeout.getTimeout)
      case _ =>
        new CommonProcessEngine(processEngineBuilder, parser)
    }
  }

  /**
   * Creates an engine for the given ticket on a freshly reserved port.
   *
   * @throws EngineManagerErrorException (code 30000) when a start-up parameter contains a space
   */
  override def create(ticketId: String, engineRequest: EngineResource, request: RequestEngine): Engine = {
    val port = getAvailablePort
    val processEngineBuilder = createProcessEngineBuilder()
    processEngineBuilder.setPort(port)
    // NOTE(review): the statement below lost its prefix during extraction — restore from upstream.
    , request)
    val parser = new DWCArgumentsParser
    // NOTE(review): the two empty-string keys below look like stripped literals; as written the
    // second "" entry overrides the first — restore the real keys from upstream.
    var springConf = Map("" -> EngineManagerConfiguration.ENGINE_SPRING_APPLICATION_NAME.getValue, "server.port" -> port.toString,
      "" -> "engine", "logging.config" -> "classpath:log4j2-engine.xml",
      "eureka.client.serviceUrl.defaultZone" -> EngineManagerReceiver.getSpringConf("eureka.client.serviceUrl.defaultZone"))
    springConf = springConf ++: getExtractSpringConfigs(request).toMap
    parser.setSpringConf(springConf)
    var dwcConf = Map("ticketId" -> ticketId, "creator" -> request.creator, "user" -> request.user) ++:
      EngineCallback.callbackToMap(EngineCallback(Sender.getThisServiceInstance.getApplicationName, Sender.getThisServiceInstance.getInstance))
    // Spaces in keys or values are rejected.
    // NOTE(review): the receiver of this predicate was stripped (presumably an `exists` over dwcConf) — confirm.
    if({case (k, v) => k.contains(" ") || (v != null && v.contains(" "))})
      throw new EngineManagerErrorException(30000, "Startup parameters contain spaces!(启动参数中包含空格!)")
    // NOTE(review): the right-hand side below is also truncated — restore from upstream.
    dwcConf = dwcConf ++:
    parser.setDWCConf(dwcConf)
    val engine = createEngine(processEngineBuilder,parser)
    engine.setTicketId(ticketId)
    engine.setPort(port)
    engine match {
      case commonEngine: CommonProcessEngine =>
        commonEngine.setUser(request.user)
      case _ =>
    }
    engine
  }
}

object AbstractEngineCreator {
  /**
   * Opens a ServerSocket on port 0, returns the local port it was bound to and
   * closes the socket again. The port is free only at the moment of the call.
   */
  private[enginemanager] def getNewPort: Int = {
    val socket = new ServerSocket(0)
    Utils.tryFinally(socket.getLocalPort)(IOUtils.closeQuietly(socket))
  }
}
Example 87
Source File: ScalaDDLCreator.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.metadata.ddl
import com.webank.wedatasphere.linkis.common.utils.Logging
import com.webank.wedatasphere.linkis.metadata.conf.MdqConfiguration
// NOTE(review): the package path of the import below was stripped during extraction.
import{MdqTableBO, MdqTableFieldsInfoBO}
import com.webank.wedatasphere.linkis.metadata.exception.MdqIllegalParamException
import org.apache.commons.lang.StringUtils
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

/**
 * Builds a create-table statement, wrapped in the SPARK_SQL / MARKS constants,
 * from a table description.
 */
object ScalaDDLCreator extends DDLCreator with SQLConst with Logging{

  /**
   * Generates the DDL code for `tableInfo`.
   *
   * Non-partition fields become the column list; partition fields become the
   * PARTITIONED_BY clause.
   *
   * @param tableInfo table base info and field list
   * @param user      only used in the log message
   * @return the generated code string
   * @throws MdqIllegalParamException when a partition field has an empty name or type
   */
  // NOTE(review): the log call at the start of the body lost its prefix during extraction.
  override def createDDL(tableInfo:MdqTableBO, user:String): String = {"begin to generate ddl for user $user using ScalaDDLCreator")
    val dbName = tableInfo.getTableBaseInfo.getBase.getDatabase
    val tableName = tableInfo.getTableBaseInfo.getBase.getName
    val fields = tableInfo.getTableFieldsInfo
    val createTableCode = new StringBuilder
    createTableCode.append(SPARK_SQL).append(LEFT_PARENTHESES).append(MARKS).append(CREATE_TABLE)
    createTableCode.append(dbName).append(".").append(tableName)
    createTableCode.append(LEFT_PARENTHESES)
    val partitions = new ArrayBuffer[MdqTableFieldsInfoBO]()
    val fieldsArray = new ArrayBuffer[String]()
    // Split fields into partition columns and ordinary columns.
    fields foreach {
      field =>
        if (field.getPartitionField != null && field.getPartitionField == true) partitions += field else{
          val name = field.getName
          val _type = field.getType
          val desc = field.getComment
          if (StringUtils.isNotEmpty(desc)){
            // NOTE(review): desc is concatenated between SINGLE_MARKs without escaping;
            // a comment containing that mark would presumably break the statement — confirm.
            fieldsArray += (name + SPACE + _type + SPACE + COMMENT + SPACE + SINGLE_MARK + desc + SINGLE_MARK)
          }else{
            fieldsArray += (name + SPACE + _type)
          }
        }
    }
    createTableCode.append(fieldsArray.mkString(COMMA)).append(RIGHT_PARENTHESES).append(SPACE)
    if (partitions.nonEmpty){
      val partitionArr = new ArrayBuffer[String]()
      partitions foreach {
        p => val name = p.getName
          val _type = p.getType
          if (StringUtils.isEmpty(name) || StringUtils.isEmpty(_type)) throw MdqIllegalParamException("partition name or type is null")
          partitionArr += (name + SPACE + _type)
      }
      createTableCode.append(PARTITIONED_BY).append(LEFT_PARENTHESES).append(partitionArr.mkString(COMMA)).
        append(RIGHT_PARENTHESES).append(SPACE)
    }
    // If the table is flagged as partitioned but declares no partition field,
    // partition by the configured default partition name (ds by default) with type string.
    if(partitions.isEmpty && tableInfo.getTableBaseInfo.getBase.getPartitionTable){
      val partition = MdqConfiguration.DEFAULT_PARTITION_NAME.getValue
      val _type = "string"
      createTableCode.append(PARTITIONED_BY).append(LEFT_PARENTHESES).append(partition).append(SPACE).append(_type).
        append(RIGHT_PARENTHESES).append(SPACE)
    }
    createTableCode.append(STORED_AS).append(SPACE).append(MdqConfiguration.DEFAULT_STORED_TYPE.getValue).append(SPACE)
    createTableCode.append(MARKS)
    createTableCode.append(RIGHT_PARENTHESES)
    // NOTE(review): the log call after toString() lost its prefix during extraction.
    val finalCode = createTableCode.toString()"End to create ddl code, code is $finalCode")
    finalCode
  }

  /** Manual smoke test reading a hard-coded local JSON file; most of it is commented out. */
  def main(args: Array[String]): Unit = {
    val filePath = "E:\\data\\json\\data.json"
    // NOTE(review): the expression that reads the file into `json` was stripped during extraction.
    val json =
    println(json)
    // val obj = new Gson().fromJson(json, classOf[MdqTableVO])
    //val sql = createDDL(obj, "hadoop")
    //println(System.currentTimeMillis())
    //println(sql)
  }
}
Example 88
Source File: RMEventConsumer.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.resourcemanager.schedule
import java.util.concurrent.{ExecutorService, Future}
import com.webank.wedatasphere.linkis.common.utils.Utils
import com.webank.wedatasphere.linkis.resourcemanager.event.RMEvent
import com.webank.wedatasphere.linkis.resourcemanager.event.metric.{MetricRMEvent, MetricRMEventExecutor}
import com.webank.wedatasphere.linkis.resourcemanager.event.notify.{NotifyRMEvent, NotifyRMEventExecutor}
import com.webank.wedatasphere.linkis.scheduler.SchedulerContext
import com.webank.wedatasphere.linkis.scheduler.queue._
import scala.collection.mutable.ArrayBuffer

/**
 * Consumer that takes events from a ConsumeQueue and runs metric / notify
 * events on the executor returned by the executor manager.
 */
class RMEventConsumer(schedulerContext: SchedulerContext, executeService: ExecutorService)
  extends Consumer(schedulerContext, executeService) {
  private var queue: ConsumeQueue = _
  private var group: Group = _
  private var maxRunningJobsNum = 1000
  // Not populated yet: nothing in this class writes into runningJobs.
  // NOTE(review): the array is sized from maxRunningJobsNum at field initialisation, i.e. before
  // the auxiliary constructor reassigns maxRunningJobsNum — confirm this is intended.
  private val runningJobs = new Array[SchedulerEvent](maxRunningJobsNum)
  private val executorManager = schedulerContext.getOrCreateExecutorManager
  private var rmConsumerListener : RMConsumerListener = _
  // Handle of the submitted consumer task; stays null until start() is called.
  var future: Future[_] = _

  /** Creates a consumer bound to `group`, taking the running-job limit from the group's maximum capacity. */
  def this(schedulerContext: SchedulerContext, executeService: ExecutorService, group: Group) = {
    this(schedulerContext, executeService)
    // NOTE(review): the left-hand side of the assignment below was stripped during extraction.
    = group
    maxRunningJobsNum = group.getMaximumCapacity
  }

  /** Submits this consumer to the executor service and keeps the returned future. */
  def start():Unit = future = executeService.submit(this)

  def setRmConsumerListener(rmConsumerListener: RMConsumerListener): Unit ={
    this.rmConsumerListener = rmConsumerListener
  }

  override def setConsumeQueue(consumeQueue: ConsumeQueue) = {
    queue = consumeQueue
  }

  override def getConsumeQueue = queue

  override def getGroup = group

  override def setGroup(group: Group) = {
    // NOTE(review): the left-hand side of the assignment below was stripped during extraction.
    = group
  }

  override def getRunningEvents = getEvents(_.isRunning)

  /** Returns the non-null entries of runningJobs that satisfy `op`. */
  private def getEvents(op: SchedulerEvent => Boolean): Array[SchedulerEvent] = {
    val result = ArrayBuffer[SchedulerEvent]()
    runningJobs.filter(_ != null).filter(x => op(x)).foreach(result += _)
    result.toArray
  }

  /** Names the thread, then calls loop() until `terminate` is set, sleeping 10 ms between iterations. */
  override def run() = {
    Thread.currentThread().setName(s"${toString}Thread")
    info(s"$toString thread started!")
    while (!terminate) {
      Utils.tryAndError(loop())
      Utils.tryQuietly(Thread.sleep(10))
    }
    info(s"$toString thread stopped!")
  }

  /**
   * Takes events until one can be turned to scheduled, then executes it between
   * the listener's before/after callbacks.
   */
  def loop(): Unit = {
    var event = queue.take()
    // Skip events that cannot be moved to the scheduled state.
    while (event.turnToScheduled() != true) {
      event = queue.take()
    }
    if(rmConsumerListener != null){rmConsumerListener.beforeEventExecute(this,event.asInstanceOf[RMEvent])}
    Utils.tryAndError({
      val executor = executorManager.askExecutor(event)
      if (executor.isDefined) {
        // Only the two event types below are matched; any other type raises a MatchError
        // inside the surrounding tryAndError.
        event match {
          case x: MetricRMEvent =>{
            Utils.tryQuietly(executor.get.asInstanceOf[MetricRMEventExecutor].execute(new EventJob(x)))
          }
          case y: NotifyRMEvent =>{
            Utils.tryQuietly(executor.get.asInstanceOf[NotifyRMEventExecutor].execute(new EventJob(y)))
          }
        }
      }
    })
    if(rmConsumerListener != null){rmConsumerListener.afterEventExecute(this,event.asInstanceOf[RMEvent])}
  }

  /**
   * Cancels the running task (with interruption) and delegates to the parent shutdown.
   * NOTE(review): `future` is null if start() was never called — confirm callers always start first.
   */
  override def shutdown() = {
    future.cancel(true)
    super.shutdown()
  }
}
Example 89
Source File: StorageScriptFsReader.scala From Linkis with Apache License 2.0 | 5 votes |
// NOTE(review): the package/import tokens below and the enclosing type header were
// truncated by extraction; they are kept verbatim.
package
import
import{FsPath, MetaData, Record}
import
import
import
import scala.collection.mutable.ArrayBuffer

  /**
   * Tells whether `line` is a metadata line.
   *
   * A line qualifies when it fully matches `<prefix> <key> = <value>` (optional
   * whitespace around each part), or, failing that, when it contains exactly one
   * "=" separated pair whose left side consists of four space-separated words,
   * the first being `prefixConf`.
   *
   * @param line       the script line to inspect
   * @param prefix     text inserted verbatim into the regular expression
   * @param prefixConf expected first word for the fallback check
   */
  def isMetadata(line: String, prefix: String, prefixConf: String): Boolean = {
    val metadataPattern = ("\\s*" + prefix + "\\s*(.+)\\s*" + "=" + "\\s*(.+)\\s*").r

    // Fallback check for lines the regular expression does not cover.
    def looksLikeConfLine: Boolean = {
      val sides = line.split("=")
      if (sides.size != 2) false
      else {
        val words = sides(0).split(" ").filter(_ != "")
        words.size == 4 && words(0).equals(prefixConf)
      }
    }

    line match {
      case metadataPattern(_, _) => true
      case _ => looksLikeConfLine
    }
  }
}
Example 90
Source File: ResultSetWriter.scala From Linkis with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and import paths below were truncated by extraction;
// they are kept verbatim.
package
import{ResultSet, ResultSetWriter}
import{FsPath, MetaData, Record}
import scala.collection.mutable.ArrayBuffer

/** Factory and helper methods around result-set writers and readers. */
object ResultSetWriter {

  /** Creates a writer that stores `resultSet` at `storePath`, caching up to `maxCacheSize`. */
  def getResultSetWriter[K <: MetaData, V <: Record](resultSet: ResultSet[K,V], maxCacheSize: Long, storePath: FsPath):ResultSetWriter[K, V] =
    new StorageResultSetWriter[K, V](resultSet, maxCacheSize, storePath)

  /** Same as the three-argument variant, additionally setting the proxy user on the writer. */
  def getResultSetWriter[K <: MetaData, V <: Record](resultSet: ResultSet[K,V], maxCacheSize: Long, storePath: FsPath, proxyUser:String):ResultSetWriter[K, V] ={
    val writer = new StorageResultSetWriter[K, V](resultSet, maxCacheSize, storePath)
    writer.setProxyUser(proxyUser)
    writer
  }

  /** Reads up to `limit` records from the result described by `writer.toString`. */
  def getRecordByWriter(writer: ResultSetWriter[_ <:MetaData,_ <:Record],limit:Long): Array[Record] ={
    val res = writer.toString
    getRecordByRes(res,limit)
  }

  /**
   * Reads up to `limit` records from the result identified by `res`.
   *
   * @return the records read, in order; empty when `limit` is not positive
   */
  def getRecordByRes(res: String,limit:Long): Array[Record] ={
    val reader = ResultSetReader.getResultSetReader(res)
    try {
      val records = new ArrayBuffer[Record]()
      // Long counter: an Int counter would overflow before reaching a limit above Int.MaxValue.
      var count = 0L
      reader.getMetaData
      // Check the limit first so no record is read once the limit is reached.
      while (count < limit && reader.hasNext){
        records += reader.getRecord
        count = count + 1
      }
      records.toArray
    } finally {
      // The reader was previously left open.
      reader.close()
    }
  }

  /**
   * Returns the last record of the result identified by `res`.
   * The final `getRecord` call is made after `hasNext` has returned false, so the
   * outcome for an empty result is whatever the reader does in that state.
   */
  def getLastRecordByRes(res: String):Record = {
    val reader = ResultSetReader.getResultSetReader(res)
    try {
      reader.getMetaData
      while (reader.hasNext ){
        reader.getRecord
      }
      reader.getRecord
    } finally {
      reader.close()
    }
  }
}
Example 91
Source File: StorageResultSetReader.scala From Linkis with Apache License 2.0 | 5 votes |
// NOTE(review): the package/import paths and the class header (with the fields used below:
// inputStream, bytes, READ_CACHE, row, rowCount, metaData, deserializer, init) were lost
// during extraction.
package
import{ByteArrayInputStream, IOException, InputStream}
import{ResultSet, ResultSetReader}
import{MetaData, Record}
import com.webank.wedatasphere.linkis.common.utils.Logging
import
import
import
import scala.collection.mutable.ArrayBuffer

  /**
   * Reads one length-prefixed row from the input stream.
   *
   * @return the row bytes, or null when reading the length raises a StorageWarnException
   */
  def readLine(): Array[Byte] = {
    var rowLen = 0
    try rowLen = Dolphin.readInt(inputStream) catch {
      case t:StorageWarnException => info(s"Read finished(读取完毕)") ; return null
      case t: Throwable => throw t
    }
    val rowBuffer = ArrayBuffer[Byte]()
    var len = 0
    // Read the whole row (everything after the row-length prefix), at most READ_CACHE bytes per call.
    // NOTE(review): the loop ends only when rowLen reaches 0 or len turns negative; if readBytes
    // can return 0 repeatedly this would spin — confirm readBytes' contract.
    while (rowLen > 0 && len >= 0) {
      if (rowLen > READ_CACHE)
        len = StorageUtils.readBytes(inputStream,bytes, READ_CACHE)
      else
        len = StorageUtils.readBytes(inputStream,bytes, rowLen)
      if (len > 0) {
        rowLen -= len
        rowBuffer ++= bytes.slice(0, len)
      }
    }
    rowCount = rowCount + 1
    rowBuffer.toArray
  }

  /**
   * Returns the record loaded by the latest successful hasNext call.
   *
   * @throws IOException when metadata has not been read or no row is loaded
   */
  @scala.throws[IOException]
  override def getRecord: Record = {
    if (metaData == null) throw new IOException("Must read metadata first(必须先读取metadata)")
    if (row == null) throw new IOException("Can't get the value of the field, maybe the IO stream has been read or has been closed!(拿不到字段的值,也许IO流已读取完毕或已被关闭!)")
    row
  }

  /**
   * Initialises the reader on first use, then deserialises the next line as metadata.
   * NOTE(review): a line is read on every call, not only the first one — confirm
   * callers invoke this once per stream.
   */
  @scala.throws[IOException]
  override def getMetaData: MetaData = {
    if(metaData == null) init()
    metaData = deserializer.createMetaData(readLine())
    metaData
  }

  /**
   * Skips `recordNum` records by reading each length prefix and skipping that many bytes.
   *
   * @return `recordNum`, or -1 for a negative argument or when any read/skip throws
   */
  @scala.throws[IOException]
  override def skip(recordNum: Int): Int = {
    if(recordNum < 0 ) return -1
    if(metaData == null) getMetaData
    for(i <- recordNum until (0, -1)){
      // NOTE(review): the value returned by InputStream.skip (bytes actually skipped) is ignored.
      try inputStream.skip(Dolphin.readInt(inputStream)) catch { case t: Throwable => return -1}
    }
    recordNum
  }

  /** Number of lines read so far, as counted by readLine. */
  @scala.throws[IOException]
  override def getPosition: Long = rowCount

  /**
   * Reads and deserialises the next row into `row`.
   *
   * @return false when no line is left or the deserialiser yields null
   */
  @scala.throws[IOException]
  override def hasNext: Boolean = {
    if(metaData == null) getMetaData
    val line = readLine()
    if(line == null) return false
    row = deserializer.createRecord(line)
    if(row == null) return false
    true
  }

  /** Delegates to InputStream.available. */
  @scala.throws[IOException]
  override def available: Long = inputStream.available()

  /** Closes the underlying input stream. */
  override def close(): Unit = inputStream.close()
}
Example 92
Source File: TableResultDeserializer.scala From Linkis with Apache License 2.0 | 5 votes |
// NOTE(review): the package/import paths and the enclosing class header were lost during extraction.
package
import
import{Column, DataType, Dolphin}
import
import scala.collection.mutable.ArrayBuffer

  /**
   * Decodes one serialized row into a TableRecord.
   *
   * Layout as read by the code: the first Dolphin.INT_LEN characters hold the
   * length of the column-length section; that section lists each column's length
   * separated by Dolphin.COL_SPLIT; the column values follow back to back.
   *
   * @param bytes the serialized row
   */
  override def createRecord(bytes: Array[Byte]): TableRecord = {
    val colByteLen = Dolphin.getString(bytes, 0, Dolphin.INT_LEN).toInt
    val colString = Dolphin.getString(bytes, Dolphin.INT_LEN, colByteLen)
    // Drop one trailing separator so split does not see a dangling delimiter.
    // NOTE(review): exactly one character is removed — assumes COL_SPLIT is one character long; confirm.
    val colArray = if(colString.endsWith(Dolphin.COL_SPLIT)) colString.substring(0, colString.length -1).split(Dolphin.COL_SPLIT) else colString.split(Dolphin.COL_SPLIT)
    // Offset of the next column value inside bytes.
    var index = Dolphin.INT_LEN + colByteLen
    // NOTE(review): the collection this lambda is applied to was stripped during extraction
    // (presumably a map over the indices of colArray) — restore from upstream.
    val data = { i =>
      val len = colArray(i).toInt
      val res = Dolphin.getString(bytes, index, len)
      index += len
      // Columns beyond the known metadata stay raw strings; the rest are converted by data type.
      if(i >= metaData.columns.length) res
      else
        toValue(metaData.columns(i).dataType,res)
    }.toArray
    new TableRecord(data)
  }
}
Example 93
Source File: RetryHandler.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.utils

import com.webank.wedatasphere.linkis.common.exception.{DWCRetryException, FatalException}
import org.apache.commons.lang.{ClassUtils => CommonClassUtils}

import scala.collection.mutable.ArrayBuffer

/**
 * Mix-in that re-runs an operation when it fails with one of the registered
 * exception types, waiting between attempts with a 1.5x growing interval
 * capped at the configured maximum period.
 */
trait RetryHandler extends Logging {

  private var retryNum = 2
  private var period = 100L
  private var maxPeriod = 1000L
  private val retryExceptions = ArrayBuffer[Class[_ <: Throwable]]()

  def setRetryNum(retryNum: Int): Unit = this.retryNum = retryNum
  def getRetryNum: Int = retryNum
  def setRetryPeriod(retryPeriod: Long): Unit = this.period = retryPeriod
  def getRetryPeriod: Long = period
  def setRetryMaxPeriod(retryMaxPeriod: Long): Unit = this.maxPeriod = retryMaxPeriod
  def getRetryMaxPeriod: Long = maxPeriod

  /** Registers an exception type (subclasses included) that may be retried. */
  def addRetryException(t: Class[_ <: Throwable]): Unit = retryExceptions += t
  def getRetryExceptions = retryExceptions.toArray

  /** True when `t` is not a FatalException and is assignable to a registered type. */
  def exceptionCanRetry(t: Throwable): Boolean = !t.isInstanceOf[FatalException] &&
    retryExceptions.exists(c => CommonClassUtils.isAssignable(t.getClass, c))

  /**
   * Wait time in milliseconds before the given attempt:
   * `period * 1.5^(attempt - 1)`, capped at the maximum period.
   */
  def nextInterval(attempt: Int): Long = {
    val interval = (this.period.toDouble * Math.pow(1.5D, (attempt - 1).toDouble)).toLong
    if (interval > this.maxPeriod) this.maxPeriod else interval
  }

  /**
   * Runs `op`, retrying on registered exceptions.
   *
   * `op` is executed exactly once, without retry handling, when no retry
   * exception is registered or the retry number is at most 1.
   *
   * @param op        the operation to run
   * @param retryName label used in the retry log message
   * @return the value of the first successful run, which may be null
   * @throws Throwable the last failure once attempts are exhausted, or any
   *                   failure that is not retryable
   */
  def retry[T](op: => T, retryName: String): T = {
    if (retryExceptions.isEmpty || retryNum <= 1) return op
    var retry = 0
    // Explicit success flag: testing `result == null` would re-run forever an
    // operation that legitimately returns null.
    var succeeded = false
    var result = null.asInstanceOf[T]
    while (!succeeded && retry < retryNum) {
      result = Utils.tryCatch {
        val value = op
        succeeded = true
        value
      } { t =>
        retry += 1
        if (retry >= retryNum) throw t
        else if (exceptionCanRetry(t)) {
          val retryInterval = nextInterval(retry)
          info(retryName + s" failed with ${t.getClass.getName}, wait ${ByteTimeUtils.msDurationToString(retryInterval)} for next retry. Retried $retry++ ...")
          Utils.tryQuietly(Thread.sleep(retryInterval))
          null.asInstanceOf[T]
        } else throw t
      }
    }
    result
  }
}
Example 94
Source File: ShutdownUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.utils
import sun.misc.{Signal, SignalHandler}
import scala.collection.mutable.ArrayBuffer

/**
 * Registry of shutdown hooks that are run, ordered by `order`, when the
 * process receives TERM, HUP or INT; the process then exits with status 0.
 */
object ShutdownUtils {
  private val shutdownRunners = ArrayBuffer[ShutdownRunner]()

  /** Registers `runnable` with order Int.MaxValue. */
  def addShutdownHook(runnable: Runnable): Unit = addShutdownHook(Int.MaxValue, runnable)

  // NOTE(review): `+=` is an assignment operator and binds looser than the alphanumeric
  // infix `synchronized`, so this likely parses as `(shutdownRunners synchronized shutdownRunners) += ...`,
  // leaving the append outside the lock — confirm. The same applies to the two overloads below.
  def addShutdownHook(order: Int, runnable: Runnable): Unit =
    shutdownRunners synchronized shutdownRunners += new DefaultShutdownRunner(order, runnable)

  /** Registers a by-name block with order Int.MaxValue. */
  def addShutdownHook(hook: => Unit): Unit = addShutdownHook(Int.MaxValue, hook)

  def addShutdownHook(order: Int, hook: => Unit): Unit =
    shutdownRunners synchronized shutdownRunners += new FunctionShutdownRunner(order, hook)

  def addShutdownHook(shutdownRunner: ShutdownRunner): Unit =
    shutdownRunners synchronized shutdownRunners += shutdownRunner

  private val signals = Array("TERM", "HUP", "INT").map(new Signal(_))

  private val signalHandler = new SignalHandler {
    /**
     * Submits every hook to the default scheduler, waits up to 30 seconds
     * (in 3 second waits) for all of them to complete, then exits.
     */
    override def handle(signal: Signal): Unit = {
      // NOTE(review): a token between sortBy(...) and the case block was stripped during
      // extraction (presumably `.map`) — restore from upstream.
      val hooks = shutdownRunners.sortBy(_.order){
        case m: DefaultShutdownRunner =>
          Utils.defaultScheduler.execute(m)
          m
        case m =>
          // Wrap other runners so their completion can be tracked.
          val runnable = new DefaultShutdownRunner(m.order, m)
          Utils.defaultScheduler.execute(runnable)
          runnable
      }
      val startTime = System.currentTimeMillis
      ShutdownUtils synchronized {
        while(System.currentTimeMillis - startTime < 30000 && hooks.exists(!_.isCompleted))
          ShutdownUtils.wait(3000)
      }
      System.exit(0)
    }
  }
  // Install the handler for each signal when the object is initialised.
  signals.foreach(Signal.handle(_, signalHandler))
}

/** A shutdown hook with an ordering key; hooks are sorted by ascending `order`. */
trait ShutdownRunner extends Runnable {
  val order: Int
}

/** Runner that records completion and notifies the waiting signal handler. */
class DefaultShutdownRunner(override val order: Int,
                            runnable: Runnable) extends ShutdownRunner {
  // NOTE(review): written by a scheduler thread and read by the signal-handler thread
  // without @volatile — confirm visibility is guaranteed elsewhere.
  private var completed = false
  // NOTE(review): the first argument list of tryFinally (presumably the call that runs
  // `runnable`) was stripped during extraction, leaving the parentheses unbalanced.
  override def run(): Unit = Utils.tryFinally({
    completed = true
    ShutdownUtils synchronized ShutdownUtils.notify()
  }
  def isCompleted = completed
}

/** Runner that evaluates a by-name block. */
class FunctionShutdownRunner(override val order: Int, hook: => Unit) extends ShutdownRunner {
  override def run(): Unit = hook
}
Example 95
Source File: DWCArgumentsParser.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.common.conf

import org.apache.commons.lang.StringUtils

import scala.collection.{JavaConversions, mutable}
import scala.collection.mutable.ArrayBuffer

/** Parsing and formatting of `--dwc-conf key=value` / `--spring-conf key=value` arguments. */
object DWCArgumentsParser {
  protected val DWC_CONF = "--dwc-conf"
  protected val SPRING_CONF = "--spring-conf"
  private var dwcOptionMap = Map.empty[String, String]

  private[linkis] def setDWCOptionMap(dwcOptionMap: Map[String, String]) = this.dwcOptionMap = dwcOptionMap
  def getDWCOptionMap = dwcOptionMap

  /**
   * Parses a command line made of option / `key=value` pairs.
   *
   * @param args arguments such as `--dwc-conf a=b --spring-conf server.port=8080`
   * @return a parser holding the parsed options
   * @throws IllegalArgumentException when an option is unknown, has no following
   *                                  argument, or its argument is not `key=value`
   */
  def parse(args: Array[String]): DWCArgumentsParser = {
    val keyValueRegex = "([^=]+)=(.+)".r
    var i = 0
    val optionParser = new DWCArgumentsParser
    while (i < args.length) {
      args(i) match {
        case DWC_CONF | SPRING_CONF =>
          // An option given as the last argument has no value: report it as a
          // malformed command line instead of failing with an index error.
          if (i + 1 >= args.length)
            throw new IllegalArgumentException("illegal commond line, format: --conf key=value.")
          args(i + 1) match {
            case keyValueRegex(key, value) =>
              optionParser.setConf(args(i), key, value)
              i += 1
            case _ => throw new IllegalArgumentException("illegal commond line, format: --conf key=value.")
          }
        case _ => throw new IllegalArgumentException(s"illegal commond line, ${args(i)} cannot recognize.")
      }
      i += 1
    }
    optionParser.validate()
    optionParser
  }

  /**
   * Renders the parser's options as an argument array: all dwc options first,
   * then all spring options; entries with an empty key or value are skipped.
   */
  def formatToArray(optionParser: DWCArgumentsParser): Array[String] = {
    val options = ArrayBuffer[String]()
    def write(confMap: Map[String, String], optionType: String): Unit = confMap.foreach { case (key, value) =>
      if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
        options += optionType
        options += (key + "=" + value)
      }
    }
    write(optionParser.getDWCConfMap, DWC_CONF)
    write(optionParser.getSpringConfMap, SPRING_CONF)
    options.toArray
  }

  def formatToArray(springOptionMap: Map[String, String], dwcOptionMap: Map[String, String]): Array[String] =
    formatToArray(new DWCArgumentsParser().setSpringConf(springOptionMap).setDWCConf(dwcOptionMap))

  /** Space-joined form of [[formatToArray]]. */
  def format(optionParser: DWCArgumentsParser): String = formatToArray(optionParser).mkString(" ")

  def format(springOptionMap: Map[String, String], dwcOptionMap: Map[String, String]): String =
    formatToArray(springOptionMap, dwcOptionMap).mkString(" ")

  /** Renders spring options as `--key=value`, skipping entries with an empty key or value. */
  def formatSpringOptions(springOptionMap: Map[String, String]): Array[String] = {
    val options = ArrayBuffer[String]()
    springOptionMap.foreach { case (key, value) =>
      if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
        options += ("--" + key + "=" + value)
      }
    }
    options.toArray
  }
}

/** Mutable holder of dwc and spring options; setters return `this` for chaining. */
class DWCArgumentsParser {
  import DWCArgumentsParser._
  private val dwcOptionMap = new mutable.HashMap[String, String]()
  private val springOptionMap = new mutable.HashMap[String, String]()

  def getSpringConfMap = springOptionMap.toMap
  def getSpringConfs = JavaConversions.mapAsJavaMap(springOptionMap)
  def getDWCConfMap = dwcOptionMap.toMap

  /**
   * Stores one option under the map selected by `optionType`.
   * Any type other than the two known option names raises a MatchError.
   */
  def setConf(optionType: String, key: String, value: String) = {
    optionType match {
      case DWC_CONF =>
        dwcOptionMap += key -> value
      case SPRING_CONF =>
        springOptionMap += key -> value
    }
    this
  }

  /** Adds all entries to the spring options; a null map is ignored. */
  def setSpringConf(optionMap: Map[String, String]): DWCArgumentsParser = {
    if (optionMap != null) this.springOptionMap ++= optionMap
    this
  }

  /** Adds all entries to the dwc options; a null map is ignored. */
  def setDWCConf(optionMap: Map[String, String]): DWCArgumentsParser = {
    if (optionMap != null) this.dwcOptionMap ++= optionMap
    this
  }

  /** Validation hook; currently performs no checks. */
  def validate() = {}
}
Example 96
Source File: _03_TraitsAsStackableModifications.scala From LearningScala with Apache License 2.0 | 5 votes |
package _033_traits

import scala.collection.mutable.ArrayBuffer

class MyQueue extends BasicIntQueue with Doubling

  // NOTE(review): the header of the enclosing object was lost during extraction.
  /** Demonstrates how the order of stacked traits changes what a queue stores. */
  def main(args: Array[String]): Unit = {
    // Puts every value into the queue, in the given order.
    def fill(target: BasicIntQueue, values: Int*): Unit = values.foreach(v => target.put(v))
    // Prints `count` successive get() results under the given label.
    def drain(label: String, source: BasicIntQueue, count: Int): Unit =
      (1 to count).foreach(_ => println(s"$label.get(): ${source.get()}"))

    val queue = new BasicIntQueue
    fill(queue, -10, 20)
    drain("queue", queue, 2)
    println()

    val myQueue = new MyQueue
    fill(myQueue, -10, 20)
    drain("myQueue", myQueue, 2)
    println()

    // You could supply "BasicIntQueue with Doubling" directly to new instead of defining a named class.
    val queueWithDoubling = new BasicIntQueue with Doubling
    fill(queueWithDoubling, -10, 20)
    drain("queueWithDoubling", queueWithDoubling, 2)
    println()

    // ORDER MATTERS examples:
    // You can now pick and choose which traits you want for a particular queue.
    val q1 = new BasicIntQueue with Incrementing with Filtering
    fill(q1, -1, 0, 1)
    drain("q1", q1, 2)
    // println(s"q1.get(): ${q1.get()}") // will give an error
    println()

    val q2 = new BasicIntQueue with Filtering with Incrementing
    fill(q2, -1, 0, 1)
    drain("q2", q2, 3)
    println()
  }
}
Example 97
Source File: _10_MutableCollections.scala From LearningScala with Apache License 2.0 | 5 votes |
package _020_collections

/** Prints a short tour of the mutable collections: ListBuffer, ArrayBuffer, Set and Map. */
object _10_MutableCollections {

  def main(args: Array[String]): Unit = {
    def banner(title: String): Unit = println(s"===== $title =====")

    banner("List buffers")
    listBufferExample()
    println()

    banner("Array buffers")
    println(arrayBufferExample())
    println()

    banner("Mutable Sets")
    mutableSetExample()
    println()

    banner("Mutable Maps")
    mutableMapExample()
  }

  /** Shows insertion, removal, bulk updates and basic queries on a mutable Map. */
  private def mutableMapExample(): Unit = {
    import scala.collection.mutable

    val words = mutable.Map.empty[String, Int]
    println(words)
    words.update("hello", 1)
    words.update("there", 2)
    println(words)
    println(words.apply("hello"))

    println("======")

    val numerals = mutable.Map("i" -> 1, "ii" -> 2)
    println(numerals)
    numerals.put("vi", 6)
    println(numerals)
    numerals.remove("ii")
    println(numerals)
    numerals ++= List("iii" -> 3, "v" -> 5)
    println(numerals)
    numerals --= List("i", "ii")
    println(numerals)

    println("=====")

    println("nums.size: " + numerals.size)
    println("nums.contains(\"ii\"): " + numerals.contains("ii"))
    println("nums(\"iii\"): " + numerals("iii"))
    println("nums.keys ==> " + numerals.keys)
    println("nums.keySet ==> " + numerals.keySet)
    println("nums.values ==> " + numerals.values)
    println("nums.isEmpty: " + numerals.isEmpty)
  }

  /** Builds an ArrayBuffer by appending and prepending, and returns it as an immutable List. */
  def arrayBufferExample(): List[Int] = {
    import scala.collection.mutable.ArrayBuffer

    val buffer = ArrayBuffer[Int](10, 20)
    buffer ++= Seq(30, 40)
    5 +=: buffer
    buffer.toList // immutable snapshot
  }

  /** Shows append, prepend and conversion of a ListBuffer. */
  private def listBufferExample(): Unit = {
    import scala.collection.mutable.ListBuffer

    val buffer = new ListBuffer[Int]
    buffer ++= List(1, 2)
    println(buffer)
    buffer.prepend(3)
    println(buffer)
    val frozen = buffer.toList
    println(frozen)
  }

  /** Shows add, remove, bulk updates, intersection and clear on a mutable Set. */
  private def mutableSetExample(): Unit = {
    import scala.collection.mutable

    val nothing = mutable.Set.empty[Int]
    println(nothing)

    val values = mutable.Set(1, 2, 3)
    println(values)
    values.add(5)
    println(values)
    values.remove(3)
    println(values)
    values ++= List(5, 6)
    println(values)
    values --= List(1, 2)
    println(values)
    println(values.intersect(Set(1, 3, 5, 7))) // intersection of two sets
    values.clear()
    println(values)
  }
}
Example 98
Source File: TestableQueueInputDStream.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming
// NOTE(review): the package path of the import below was stripped during extraction.
import{ObjectInputStream, ObjectOutputStream}
import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.dstream.InputDStream
import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

/**
 * Input stream that serves RDDs from an in-memory queue.
 *
 * @param ssc        the streaming context
 * @param queue      RDDs to serve; accessed under `queue.synchronized`
 * @param oneAtATime when true, each batch takes a single RDD from the queue;
 *                   otherwise a batch takes everything queued
 * @param defaultRDD returned when the queue yields nothing; may be null
 */
class TestableQueueInputDStream[T: ClassTag](
  ssc: StreamingContext,
  val queue: Queue[RDD[T]],
  oneAtATime: Boolean,
  defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  // Nothing to start or stop: the data already sits in the queue.
  override def start() { }

  override def stop() { }

  // Serialization hooks that only log a warning.
  private def readObject(in: ObjectInputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  /**
   * Produces the RDD for one batch.
   *
   * @return the dequeued RDD (or the union of all queued RDDs), else `defaultRDD`
   *         when it is non-null, else an empty RDD; never None
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        // Also reached when oneAtATime is true and the queue is empty (copies nothing).
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        // NOTE(review): the first constructor argument was stripped during extraction
        // (presumably the SparkContext) — restore from upstream.
        Some(new UnionRDD(, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
package spatial.dse

import java.util.concurrent.LinkedBlockingQueue

import argon.State
import spatial.metadata.params._
import spatial.metadata.bounds._

import scala.collection.mutable.ArrayBuffer

/**
 * Worker that scans the point indices `start until start + size`, keeps those
 * for which every restriction holds, and puts the surviving indices on `queue`.
 *
 * For each index every domain is set to `(index / prods(d)) % dims(d)`, where
 * `d` is the position paired with the domain in `indexedSpace`.
 */
case class PruneWorker(
  start: Int,
  size: Int,
  prods: Seq[BigInt],
  dims: Seq[BigInt],
  indexedSpace: Seq[(Domain[_],Int)],
  restricts: Set[Restrict],
  queue: LinkedBlockingQueue[Seq[Int]]
)(implicit state: State) extends Runnable {

  /** True when all restrictions evaluate to true for the currently set domain values. */
  private def isLegalSpace(): Boolean = restricts.forall(restrict => restrict.evaluate())

  /** Applies the domain values encoded by `point`. */
  private def assignDomains(point: Int): Unit =
    indexedSpace.foreach { case (dom, pos) =>
      dom.set( ((point / prods(pos)) % dims(pos)).toInt )
    }

  def run(): Unit = {
    val end = start + size
    println(s"Searching from $start until $end")
    val legalPoints = (start until end).filter { point =>
      assignDomains(point)
      isLegalSpace()
    }
    queue.put(legalPoints)
  }
}
package argon

import scala.collection.mutable.{ArrayBuffer,HashSet}

import utils.Instrument

trait FlowRules {
  val IR: State
}

/**
 * Ordered collection of named flow rules that are applied to every
 * (symbol, op) pair passed to [[apply]].
 */
class Flows {
  private var rules = ArrayBuffer[(String,PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit])]()
  private[argon] var names = HashSet[String]()
  lazy val instrument = new Instrument("flows")

  /** Registers a rule that runs before all existing rules. */
  def prepend(name: String, func: PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit]): Unit = {
    rules.prepend((name,func))
    names += name
  }

  /** Registers a rule that runs after all existing rules. */
  def add(name: String, func: PartialFunction[(Sym[_],Op[_],SrcCtx,State),Unit]): Unit = {
    rules += ((name,func))
    names += name
  }

  /**
   * Removes the first rule registered under `name`.
   * Removing a name that was never registered is a no-op.
   */
  def remove(name: String): Unit = {
    val idx = rules.indexWhere(_._1 == name)
    // indexWhere yields -1 when nothing matches; removing at -1 would throw.
    if (idx >= 0) rules.remove(idx)
    names.remove(name)
  }

  /** Runs every rule defined at (lhs, rhs, ctx, state), in order, under instrumentation. */
  def apply[A](lhs: Sym[A], rhs: Op[A])(implicit ctx: SrcCtx, state: State): Unit = {
    val tuple = (lhs,rhs,ctx,state)
    rules.foreach{case (name,rule) =>
      if (rule.isDefinedAt(tuple)) { instrument(name){ rule.apply(tuple) } }
    }
  }

  /** Returns an independent snapshot of the current rules and names. */
  def save(): Flows = {
    val flows = new Flows
    flows.rules ++= rules
    flows.names ++= names
    flows
  }

  /**
   * Replaces the current rules and names with those of `flow`.
   * The snapshot's collections are copied, so later changes to this instance
   * do not leak back into `flow`.
   */
  def restore(flow: Flows): Unit = {
    val copy = flow.save()
    rules = copy.rules
    names = copy.names
  }
}
package argon
import utils.implicits.collections._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

trait RewriteRules {
  val IR: State
}

/**
 * Registry of rewrite rules, either global or keyed by the runtime class of the op.
 */
class Rewrites {
  type RewriteRule = PartialFunction[(Op[_],SrcCtx,State),Option[Sym[_]]]

  /** Runtime class of the op type `A`, used as the key into `rules`. */
  private def keyOf[A<:Op[_]:Manifest] = manifest[A].runtimeClass.asInstanceOf[Class[A]]

  // Roughly O(G), where G is the total number of global rewrite rules.
  // When possible, use per-class rules instead of globals.
  private var globals: ArrayBuffer[RewriteRule] = ArrayBuffer.empty

  // Roughly O(R), where R is the number of rules for a specific node class.
  private val rules: mutable.HashMap[Class[_], ArrayBuffer[RewriteRule]] = mutable.HashMap.empty

  // Names of all registered rules; a name can be registered only once.
  private[argon] val names: mutable.HashSet[String] = mutable.HashSet.empty

  /** Rules registered for exactly the runtime class of `op`; Nil when none. */
  def rule(op: Op[_]): Seq[RewriteRule] = rules.getOrElse(op.getClass, Nil)

  /** Registers a global rule unless `name` is already registered. */
  def addGlobal(name: String, rule: RewriteRule): Unit = if (!names.contains(name)) {
    names += name
    globals += rule
  }

  /** Registers a rule for op class `O` unless `name` is already registered. */
  def add[O<:Op[_]:Manifest](name: String, rule: RewriteRule): Unit = if (!names.contains(name)) {
    names += name
    val key = keyOf[O]
    val pfs = rules.getOrElseAdd(key, () => ArrayBuffer.empty[RewriteRule])
    pfs += rule
  }

  /**
   * Applies one rule and keeps its result only when the guard on the result holds.
   * NOTE(review): the left operand of `<:<` in the guard was stripped during extraction
   * (presumably the result symbol's type) — restore from upstream.
   * NOTE(review): `rule.apply` is called without an isDefinedAt check — confirm rules are
   * total or that a MatchError is acceptable here.
   */
  private def applyRule[A:Type](op: Op[A], ctx: SrcCtx, state: State, rule: RewriteRule): Option[A] = {
    rule.apply((op,ctx,state)) match {
      case Some(s) if <:< Type[A] => Some(s.asInstanceOf[A])
      case Some(s) => None
      case _ => None
    }
  }

  /**
   * Tries, in order: the op's own `rewrite`, the rules for the op's class, then
   * the global rules; the first result found is returned and logged when
   * `state.config.enLog` is set.
   */
  def apply[A:Type](op: Op[A])(implicit ctx: SrcCtx, state: State): Option[A] = {
    Option(op.rewrite)
          .orElse{ rule(op).mapFind{rule => applyRule[A](op,ctx,state, rule) } }
          .orElse{ globals.mapFind{rule => applyRule[A](op,ctx,state, rule) } }.map { op2 =>
      if (state.config.enLog) {
        dbgs(s"Rewrite $op => $op2")
      }
      op2
    }
  }
}
Example 102
Source File: BitTest.scala From spatial with MIT License | 5 votes |
package spatial.tests.compiler
import spatial.dsl._
import scala.collection.mutable.ArrayBuffer

/**
 * Builds 64 layers of 200 randomly chosen bit operations on top of 32 random
 * bits, inside a 32-iteration Foreach. All backends are disabled for this test.
 */
@spatial class BitTest extends SpatialTest {
  override def backends = DISABLED

  // Returns a random number in [min,max). Note the parameter order: max first, then min.
  def rand(max: gen.Int, min: gen.Int): gen.Int = scala.util.Random.nextInt(max-min)+min

  /**
   * Applies the operation selected by `op` to x and y: 0-2 and, 3-5 or,
   * 6-8 not-equal, 9-11 equal, 12 not-x, 13 not-y. Other values are not matched.
   */
  def opp(x: Bit, y: Bit, op: gen.Int): Bit = op match {
    case 0 | 1 | 2 => x & y
    case 3 | 4 | 5 => x | y
    case 6 | 7 | 8 => x !== y
    case 9 | 10 | 11 => x === y
    case 12 => !x
    case 13 => !y
  }

  def main(args: Array[String]): Void = {
    Foreach(0 until 32){i =>
      val bits: List[Bit] = List.fill(32){ random[Bit] }
      // layers(k) holds the bits produced by step k; layer 0 is the random input.
      var layers: ArrayBuffer[List[Bit]] = ArrayBuffer(bits)
      // This inner `i` shadows the Foreach index above.
      (0 until 64).meta.foreach{i =>
        val layer = List.fill(200){
          // Both operands come from the most recent layer (index i); picking a random
          // earlier layer is left commented out.
          val l1 = i //rand(layers.length,0)
          val l2 = i //rand(layers.length,0)
          val p1 = rand(layers(l1).length, 0)
          val p2 = rand(layers(l2).length, 0)
          val op = rand(14,0)
          val x = layers(l1).apply(p1)
          val y = layers(l2).apply(p2)
          opp(x,y,op)
        }
        layers += layer
        println(r"[$i] 1: ${layer(1)}, 3: ${layer(3)}, 5: ${layer(5)}")
      }
    }
  }
}
Source File: TemplateRunner.scala From spatial with MIT License | 5 votes |
package fringe.test

import java.io.File

import scala.collection.mutable.ArrayBuffer
import scala.util.Properties.envOrElse

/** Command-line driver that runs named template tests and reports a summary. */
object TemplateRunner {

  /** Deletes `file`, descending into directories first.
    *
    * @throws Exception if an existing file or directory cannot be deleted
    */
  def deleteRecursively(file: File): Unit = {
    // listFiles returns null on an I/O error, hence the Option wrapper
    if (file.isDirectory) Option(file.listFiles).foreach(_.foreach(deleteRecursively))
    if (file.exists && !file.delete) throw new Exception(s"Unable to delete ${file.getAbsolutePath}")
  }

  /** Runs the selected templates and prints a pass/fail summary.
    *
    * Exits the JVM with status 1 when at least one template fails.
    *
    * @param templateMap test name -> test function taking the backend name
    * @param args        empty (run everything), "all", an exact name ending in digits,
    *                    or a name prefix that is expanded to `prefix[0-9]+`
    */
  def apply(templateMap: Map[String, String => Boolean], args: Array[String]): Unit = {
    // Choose the default backend based on what is available.
    // Deliberately catches Throwable: class loading can fail with Errors
    // (e.g. linkage problems), and all of them mean "backend unavailable".
    lazy val firrtlTerpBackendAvailable: Boolean = {
      try {
        val cls = Class.forName("chisel3.iotesters.FirrtlTerpBackend")
        cls != null
      } catch {
        case _: Throwable => false
      }
    }
    lazy val defaultBackend = if (firrtlTerpBackendAvailable) "firrtl" else ""

    val backendName = envOrElse("TESTER_BACKENDS", defaultBackend).split(" ").head
    val tempDir = s"""${envOrElse("NEW_TEMPLATES_HOME", "tmp")}/test_run_dir/"""
    val specificRegex = "(.*[0-9]+)".r

    def allTests: Seq[String] = templateMap.keys.toSeq.sorted

    val problemsToRun: Array[String] =
      if (args.isEmpty) {
        allTests.toArray // Run all by default
      } else {
        args.toSeq.flatMap {
          case "all" => allTests // Run all
          case specificRegex(c) => Seq(c) // Run specific test
          case arg =>
            // Figure out tests that match this template and run all
            val tempRegex = s"(${arg}[0-9]+)".r
            allTests.filter(tempRegex.pattern.matcher(_).matches)
        }.toArray
      }

    val passedTests = new ArrayBuffer[String]
    val errors = new ArrayBuffer[String]

    for (testName <- problemsToRun) {
      // Wipe tempdir for consecutive tests of same module
      deleteRecursively(new File(tempDir))
      templateMap.get(testName) match {
        case Some(test) =>
          println(s"Starting template $testName")
          try {
            if (test(backendName)) {
              passedTests += s"$testName"
            } else {
              errors += s"Template $testName: test error occurred"
            }
          } catch {
            case exception: Exception =>
              exception.printStackTrace()
              errors += s"Template $testName: exception ${exception.getMessage}"
            case t: Throwable =>
              errors += s"Template $testName: throwable ${t.getMessage}"
          }
        case _ =>
          errors += s"Bad template name: $testName"
      }
    }

    if (passedTests.nonEmpty) {
      println(s"""Templates passing: ${passedTests.length} (${passedTests.mkString(", ")})""")
    }
    if (errors.nonEmpty) {
      println("=" * 80)
      println(s"Errors: ${errors.length}: in the following templates")
      println(errors.mkString("\n"))
      println("=" * 80)
      System.exit(1)
    }
  }
}
Source File: AvroSchemaMerge.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro

import com.sksamuel.exts.StringOption
import org.apache.avro.Schema

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

object AvroSchemaMerge {

  /** Merges several record schemas into one record schema.
    *
    * Fields are taken from the first schema, then any field names not yet seen
    * are added from each following schema in order. The merged doc is the
    * non-null docs joined with "; " (or null when there are none).
    *
    * NOTE(review): the expressions `schemas.map(_.getDoc).filter(_`,
    * `_.name() == field.name()` and the first `Schema.Field` argument were missing
    * in this copy of the file and have been reconstructed - confirm against upstream.
    *
    * @param name      name of the resulting record
    * @param namespace namespace of the resulting record
    * @param schemas   schemas to merge; all must be of type RECORD
    * @throws IllegalArgumentException if any schema is not a record
    */
  def apply(name: String, namespace: String, schemas: List[Schema]): Schema = {
    require(schemas.forall(_.getType == Schema.Type.RECORD), "Can only merge records")

    // documentations can just be a concat
    val doc = schemas.map(_.getDoc).filter(_ != null).mkString("; ")

    // simple impl to start: take all the fields from the first schema, and then add in the missing ones
    // from second 2 and so on
    val fields = new ArrayBuffer[Schema.Field]()
    schemas.foreach { schema =>
      schema.getFields.asScala
        .filterNot { field => fields.exists(_.name() == field.name()) }
        .foreach { field =>
          // avro is funny about sharing fields, so need to copy it
          val copy = new Schema.Field(field.name(), field.schema(), StringOption(field.doc).orNull, field.defaultVal)
          fields.append(copy)
        }
    }

    val schema = Schema.createRecord(name, if (doc.isEmpty()) null else doc, namespace, false)
    schema.setFields(fields.result().asJava)
    schema
  }
}
Source File: JdbcPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.jdbc

import java.sql.{Connection, PreparedStatement}
import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.io.Using
import com.sksamuel.exts.metrics.Timed
import io.eels.Row
import io.eels.component.jdbc.dialect.JdbcDialect
import io.eels.datastream.{Publisher, Subscriber, Subscription}

import scala.collection.mutable.ArrayBuffer

/** Publishes the rows of a JDBC query to a subscriber in batches of `fetchSize`.
  *
  * NOTE(review): in this copy of the file the `rs.next` loop condition, the
  * `Using` import and both `subscriber.next(...)` calls were missing, so batches
  * were cleared without being emitted and `completed()` only fired when a partial
  * batch remained. They have been reconstructed - confirm against upstream.
  */
class JdbcPublisher(connFn: () => Connection,
                    query: String,
                    bindFn: (PreparedStatement) => Unit,
                    fetchSize: Int,
                    dialect: JdbcDialect
                   ) extends Publisher[Seq[Row]] with Timed with JdbcPrimitives with Using {

  /** Runs the query and pushes row batches to `subscriber` until the result set is
    * exhausted or the subscription is cancelled. Any failure is reported through
    * `subscriber.error`.
    */
  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(connFn()) { conn =>
        logger.debug(s"Preparing query $query")
        using(conn.prepareStatement(query)) { stmt =>
          stmt.setFetchSize(fetchSize)
          bindFn(stmt)

          logger.debug(s"Executing query $query")
          using(stmt.executeQuery()) { rs =>
            val schema = schemaFor(dialect, rs)

            val running = new AtomicBoolean(true)
            subscriber.subscribed(Subscription.fromRunning(running))

            val buffer = new ArrayBuffer[Row](fetchSize)
            while (rs.next && running.get) {
              val values = schema.fieldNames().map { name =>
                val raw = rs.getObject(name)
                dialect.sanitize(raw)
              }
              buffer append Row(schema, values)
              if (buffer.size == fetchSize) {
                // emit an immutable snapshot before the buffer is reused
                subscriber.next(buffer.toVector)
                buffer.clear()
              }
            }

            // flush the final partial batch, then always signal completion
            if (buffer.nonEmpty)
              subscriber.next(buffer.toVector)
            subscriber.completed()
          }
        }
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
}
Source File: HbasePublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hbase

import java.util
import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.io.Using
import com.sksamuel.exts.metrics.Timed
import io.eels.Row
import io.eels.datastream.{Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Result, Scan}

import scala.collection.mutable.ArrayBuffer

/** Publishes the rows of an HBase scan to a subscriber in batches of `bufferSize`,
  * stopping after `maxRows` rows.
  *
  * NOTE(review): this copy of the file had lost the `Using` import, every `next()`
  * call (`rowIter.next()`, `iter.next()`, `resultIter.next()`, `subscriber.next(...)`),
  * `schema.fields.map` and each `field.name` reference. They have been
  * reconstructed from context - confirm against upstream.
  */
class HbasePublisher(connection: Connection,
                     schema: StructType,
                     namespace: String,
                     tableName: String,
                     bufferSize: Int,
                     maxRows: Long,
                     scanner: Scan,
                     implicit val serializer: HbaseSerializer) extends Publisher[Seq[Row]] with Timed with Using {

  private val table = connection.getTable(TableName.valueOf(namespace, tableName))

  /** Scans the table and pushes row batches to `subscriber` until the scan is
    * exhausted, `maxRows` is reached, or the subscription is cancelled.
    * Any failure is reported through `subscriber.error`.
    */
  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(new CloseableIterator) { rowIter =>
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        val buffer = new ArrayBuffer[Row](bufferSize)
        while (rowIter.hasNext && running.get()) {
          buffer append rowIter.next()
          if (buffer.size == bufferSize) {
            // emit an immutable snapshot before the buffer is reused
            subscriber.next(buffer.toVector)
            buffer.clear()
          }
        }
        // flush the final partial batch, then always signal completion
        if (buffer.nonEmpty)
          subscriber.next(buffer.toVector)
        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }

  /** Row iterator over the table scan that stops after `maxRows` rows and
    * closes the underlying scanner on `close()`.
    */
  class CloseableIterator extends Iterator[Row] with AutoCloseable {
    private val resultScanner = table.getScanner(scanner)
    private val resultScannerIter = resultScanner.iterator()
    private var rowCount = 0
    private var iter: Iterator[Row] = Iterator.empty

    override def hasNext: Boolean = rowCount < maxRows && iter.hasNext || {
      // current iterator is drained: wrap the scanner again if rows may remain
      if (rowCount < maxRows && resultScannerIter.hasNext) {
        iter = HBaseResultsIterator(schema, resultScannerIter)
        iter.hasNext
      } else false
    }

    override def next(): Row = {
      rowCount += 1
      iter.next()
    }

    override def close(): Unit = {
      resultScanner.close()
    }
  }

  /** Converts HBase results into rows: key fields are decoded from the row key,
    * other fields from the cell at (column family, field name); a missing cell
    * becomes null.
    */
  case class HBaseResultsIterator(schema: StructType, resultIter: util.Iterator[Result])
                                 (implicit serializer: HbaseSerializer) extends Iterator[Row] {
    override def hasNext: Boolean = resultIter.hasNext

    override def next(): Row = {
      val resultRow = resultIter.next()
      val values = schema.fields.map { field =>
        if (!field.key) {
          val value = resultRow.getValue(
            field.columnFamily.getOrElse(sys.error(s"No Column Family defined for field '${field.name}'")).getBytes,
            field.name.getBytes)
          if (value != null) serializer.fromBytes(value, field.name, field.dataType) else null
        } else serializer.fromBytes(resultRow.getRow, field.name, field.dataType)
      }
      Row(schema, values)
    }
  }
}
Source File: OrcWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc

import java.util.concurrent.atomic.AtomicInteger
import java.util.function.IntUnaryOperator

import com.sksamuel.exts.Logging
import com.typesafe.config.ConfigFactory
import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector
import org.apache.orc.{OrcConf, OrcFile, TypeDescription}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

// performs the actual write out of orc data, to be used by an orc sink
//
// NOTE(review): in this copy of the file the initialiser of `serializers`, the
// arguments of the COMPRESSION_STRATEGY / COMPRESS settings, the ENCODING_STRATEGY
// setting and the `logger.info(s` prefix in close() were missing. They have been
// reconstructed - confirm against upstream before relying on them.
class OrcWriter(path: Path,
                structType: StructType,
                options: OrcWriteOptions)(implicit conf: Configuration) extends Logging {

  private val schema: TypeDescription = OrcSchemaFns.toOrcSchema(structType)
  logger.trace(s"Creating orc writer for schema $schema")

  // configured batch size, clamped to the range [1, 1024]
  private val batchSize = {
    val size = ConfigFactory.load().getInt("eel.orc.sink.batchSize")
    Math.max(Math.min(1024, size), 1)
  }
  logger.debug(s"Orc writer will use batchsize=$batchSize")

  private val buffer = new ArrayBuffer[Row](batchSize)
  private val serializers = schema.getChildren.asScala.map(OrcSerializer.forType).toArray
  private val batch = schema.createRowBatch(batchSize)

  OrcConf.COMPRESSION_STRATEGY.setString(conf, options.compressionStrategy.name)
  OrcConf.COMPRESS.setString(conf, options.compressionKind.name)
  options.encodingStrategy.map(_.name).foreach(OrcConf.ENCODING_STRATEGY.setString(conf, _))
  options.compressionBufferSize.foreach(OrcConf.BUFFER_SIZE.setLong(conf, _))

  private val woptions = OrcFile.writerOptions(conf).setSchema(schema)
  options.rowIndexStride.foreach { size =>
    woptions.rowIndexStride(size)
    logger.debug(s"Using stride size = $size")
  }

  if (options.bloomFilterColumns.nonEmpty) {
    woptions.bloomFilterColumns(options.bloomFilterColumns.mkString(","))
    // braces are required: "$options.bloomFilterColumns" would interpolate only `options`
    logger.debug(s"Using bloomFilterColumns = ${options.bloomFilterColumns}")
  }

  // created on first use
  private lazy val writer = OrcFile.createWriter(path, woptions)
  private val counter = new AtomicInteger(0)

  /** Buffers `row`, flushing once `batchSize` rows have accumulated. */
  def write(row: Row): Unit = {
    buffer.append(row)
    if (buffer.size == batchSize)
      flush()
  }

  /** Number of rows flushed to the writer so far. */
  def records: Int = counter.get()

  /** Serializes the buffered rows into the vectorized batch and hands it to the writer. */
  def flush(): Unit = {

    def writecol[T <: ColumnVector](rowIndex: Int, colIndex: Int, row: Row): Unit = {
      val value = row.values(colIndex)
      val vector = batch.cols(colIndex).asInstanceOf[T]
      val serializer = serializers(colIndex).asInstanceOf[OrcSerializer[T]]
      serializer.writeToVector(rowIndex, vector, value)
    }

    // don't use foreach here, using old school for loops for perf
    for (rowIndex <- buffer.indices) {
      val row = buffer(rowIndex)
      for (colIndex <- batch.cols.indices) {
        writecol(rowIndex, colIndex, row)
      }
    }

    batch.size = buffer.size
    writer.addRowBatch(batch)
    counter.addAndGet(batch.size)
    buffer.clear()
    batch.reset()
  }

  /** Flushes any remaining rows, closes the writer and returns the number of rows
    * the writer reports.
    */
  def close(): Long = {
    if (buffer.nonEmpty)
      flush()
    writer.close()
    val count = writer.getNumberOfRows
    logger.info(s"Orc writer wrote $count rows")
    count
  }
}
Source File: SKRSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer

import java.util.concurrent.atomic.AtomicInteger

import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.scalatest.concurrent.Eventually
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}

import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

case class Foo(a: Int, b: String)

/** Base spec that starts a test Kafka for the whole suite and gives every test
  * a fresh streaming context, Spark session and randomly named topic.
  */
trait SKRSpec
  extends AnyWordSpec
  with Matchers
  with BeforeAndAfterEach
  with BeforeAndAfterAll
  with Eventually {

  val sparkConf = new SparkConf()
    .setMaster("local[1]")
    .setAppName(getClass.getSimpleName)

  var ktu: KafkaTestUtils = _

  override def beforeAll(): Unit = {
    ktu = new KafkaTestUtils
    ktu.setup()
  }

  override def afterAll(): Unit = {
    SKRSpec.callbackTriggerCount.set(0)
    if (ktu != null) {
      ktu.tearDown()
      ktu = null
    }
  }

  var topic: String = _
  var ssc: StreamingContext = _
  var spark: SparkSession = _

  override def afterEach(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
    if (spark != null) {
      spark.stop()
      spark = null
    }
  }

  override def beforeEach(): Unit = {
    ssc = new StreamingContext(sparkConf, Seconds(1))
    spark = SparkSession.builder
      .config(sparkConf)
      .getOrCreate()
    topic = s"topic-${Random.nextInt()}"
    ktu.createTopics(topic)
  }

  /** Registers a direct stream on `topic` and returns the buffer that receives
    * every consumed value. The buffer fills only while `ssc` is running.
    */
  def collect(ssc: StreamingContext, topic: String): ArrayBuffer[String] = {
    val kafkaParams = Map(
      "bootstrap.servers" -> ktu.brokerAddress,
      "auto.offset.reset" -> "earliest",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      // NOTE(review): this key was an empty string in this copy of the file;
      // "group.id" is the consumer setting that takes a group name - confirm.
      "group.id" -> "test-collect"
    )
    val results = new ArrayBuffer[String]
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topic), kafkaParams)
    ).map(_.value())
      .foreachRDD { rdd =>
        results ++= rdd.collect()
        ()
      }
    results
  }

  val producerConfig = Map(
    // NOTE(review): the address was an empty string in this copy of the file;
    // restored to the local broker address - confirm against KafkaTestUtils.
    "bootstrap.servers" -> "127.0.0.1:9092",
    "key.serializer" -> classOf[StringSerializer].getName,
    "value.serializer" -> classOf[StringSerializer].getName
  )
}

object SKRSpec {
  val callbackTriggerCount = new AtomicInteger()
}
Source File: SidechainBlockInfo.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.chain

import com.horizen.block.SidechainBlock
import com.horizen.utils.{WithdrawalEpochInfo, WithdrawalEpochInfoSerializer}
import com.horizen.vrf.{VrfOutput, VrfOutputSerializer}
import scorex.core.NodeViewModifier
import scorex.core.block.Block.Timestamp
import scorex.core.consensus.ModifierSemanticValidity
import scorex.core.serialization.{BytesSerializable, ScorexSerializer}
import scorex.util.serialization.{Reader, Writer}
import scorex.util.{ModifierId, bytesToId, idToBytes}

import scala.collection.mutable.ArrayBuffer

/** Per-block chain metadata, linked to its parent through `parentId`. */
case class SidechainBlockInfo(height: Int,
                              score: Long,
                              parentId: ModifierId,
                              timestamp: Timestamp,
                              semanticValidity: ModifierSemanticValidity,
                              mainchainHeaderHashes: Seq[MainchainHeaderHash],
                              mainchainReferenceDataHeaderHashes: Seq[MainchainHeaderHash],
                              withdrawalEpochInfo: WithdrawalEpochInfo,
                              vrfOutputOpt: Option[VrfOutput],
                              lastBlockInPreviousConsensusEpoch: ModifierId)
  extends BytesSerializable with LinkedElement[ModifierId] {

  override def getParentId: ModifierId = parentId

  override type M = SidechainBlockInfo

  override lazy val serializer: ScorexSerializer[SidechainBlockInfo] = SidechainBlockInfoSerializer

  override def bytes: Array[Byte] = SidechainBlockInfoSerializer.toBytes(this)
}

// NOTE(review): the receivers of both `.map(...)` calls below and the `id.data`
// arguments in the serializer were missing in this copy of the file. They have
// been reconstructed - confirm the member names against SidechainBlock and
// MainchainHeaderHash.
object SidechainBlockInfo {
  /** Hashes of the mainchain headers contained in `sidechainBlock`. */
  def mainchainHeaderHashesFromBlock(sidechainBlock: SidechainBlock): Seq[MainchainHeaderHash] = {
    sidechainBlock.mainchainHeaders.map(header => byteArrayToMainchainHeaderHash(header.hash))
  }

  /** Header hashes of the mainchain reference data contained in `sidechainBlock`. */
  def mainchainReferenceDataHeaderHashesFromBlock(sidechainBlock: SidechainBlock): Seq[MainchainHeaderHash] = {
    sidechainBlock.mainchainBlockReferencesData.map(data => byteArrayToMainchainHeaderHash(data.headerHash))
  }
}

/** Binary codec for [[SidechainBlockInfo]]; `parse` reads fields in exactly the
  * order `serialize` writes them.
  */
object SidechainBlockInfoSerializer extends ScorexSerializer[SidechainBlockInfo] {
  override def serialize(obj: SidechainBlockInfo, w: Writer): Unit = {
    w.putInt(obj.height)
    w.putLong(obj.score)
    w.putBytes(idToBytes(obj.parentId))
    w.putLong(obj.timestamp)
    w.put(obj.semanticValidity.code)
    // each hash list is written as a count followed by fixed-size entries
    w.putInt(obj.mainchainHeaderHashes.size)
    obj.mainchainHeaderHashes.foreach(id => w.putBytes(id.data))
    w.putInt(obj.mainchainReferenceDataHeaderHashes.size)
    obj.mainchainReferenceDataHeaderHashes.foreach(id => w.putBytes(id.data))
    WithdrawalEpochInfoSerializer.serialize(obj.withdrawalEpochInfo, w)
    w.putOption(obj.vrfOutputOpt){case (writer: Writer, vrfOutput: VrfOutput) =>
      VrfOutputSerializer.getSerializer.serialize(vrfOutput, writer)
    }
    w.putBytes(idToBytes(obj.lastBlockInPreviousConsensusEpoch))
  }

  /** Reads a count followed by that many hashes of `mainchainHeaderHashSize` bytes. */
  private def readMainchainHeadersHashes(r: Reader): Seq[MainchainHeaderHash] = {
    val references: ArrayBuffer[MainchainHeaderHash] = ArrayBuffer()
    val length = r.getInt()

    (0 until length).foreach(_ => {
      val bytes = r.getBytes(mainchainHeaderHashSize)
      references.append(byteArrayToMainchainHeaderHash(bytes))
    })

    references
  }

  override def parse(r: Reader): SidechainBlockInfo = {
    val height = r.getInt()
    val score = r.getLong()
    val parentId = bytesToId(r.getBytes(NodeViewModifier.ModifierIdSize))
    val timestamp = r.getLong()
    val semanticValidityCode = r.getByte()
    val mainchainHeaderHashes = readMainchainHeadersHashes(r)
    val mainchainReferenceDataHeaderHashes = readMainchainHeadersHashes(r)
    val withdrawalEpochInfo = WithdrawalEpochInfoSerializer.parse(r)
    val vrfOutputOpt = r.getOption(VrfOutputSerializer.getSerializer.parse(r))
    val lastBlockInPreviousConsensusEpoch = bytesToId(r.getBytes(NodeViewModifier.ModifierIdSize))

    SidechainBlockInfo(height, score, parentId, timestamp, ModifierSemanticValidity.restoreFromCode(semanticValidityCode),
      mainchainHeaderHashes, mainchainReferenceDataHeaderHashes, withdrawalEpochInfo, vrfOutputOpt, lastBlockInPreviousConsensusEpoch)
  }
}
Source File: IODBStoreAdapter.scala From Sidechains-SDK with MIT License | 5 votes |
// NOTE(review): the package name was missing in this copy of the file; restored
// as com.horizen.storage - confirm against the project layout.
package com.horizen.storage

import java.util.{ArrayList => JArrayList, List => JList}
import java.util.Optional

import com.horizen.utils.Pair

import scala.collection.JavaConverters._
import io.iohk.iodb.Store
import com.horizen.utils.ByteArrayWrapper

import scala.collection.mutable.ArrayBuffer

/** Adapts an IODB `Store` to the Java-facing `Storage` interface, converting
  * Scala options and collections to their `java.util` counterparts.
  */
class IODBStoreAdapter (store : Store)
  extends Storage {

  /** Value stored under `key`, or an empty Optional. */
  override def get(key: ByteArrayWrapper): Optional[ByteArrayWrapper] =
    store.get(key).fold(Optional.empty[ByteArrayWrapper]())(v => Optional.of(new ByteArrayWrapper(v)))

  /** Value stored under `key`, or `defaultValue` when the key is absent. */
  override def getOrElse(key: ByteArrayWrapper, defaultValue: ByteArrayWrapper): ByteArrayWrapper =
    store.get(key).fold(defaultValue)(v => new ByteArrayWrapper(v))

  /** Looks up several keys at once; each result pairs the key with an Optional value. */
  override def get(keys: JList[ByteArrayWrapper]): JList[Pair[ByteArrayWrapper, Optional[ByteArrayWrapper]]] = {
    val values = new JArrayList[Pair[ByteArrayWrapper,Optional[ByteArrayWrapper]]]()

    for (v <- store.get(keys.asScala)) {
      val value = v._2.fold(Optional.empty[ByteArrayWrapper]())(x => Optional.of(new ByteArrayWrapper(x)))
      values.add(new Pair[ByteArrayWrapper,Optional[ByteArrayWrapper]](new ByteArrayWrapper(v._1), value))
    }

    values
  }

  /** All key/value pairs currently in the store. */
  override def getAll: JList[Pair[ByteArrayWrapper, ByteArrayWrapper]] = {
    val values = new JArrayList[Pair[ByteArrayWrapper,ByteArrayWrapper]]()

    for (i <- store.getAll())
      values.add(new Pair[ByteArrayWrapper,ByteArrayWrapper](new ByteArrayWrapper(i._1),
        new ByteArrayWrapper(i._2)))

    values
  }

  /** Identifier of the latest version, or an empty Optional if there is none. */
  override def lastVersionID(): Optional[ByteArrayWrapper] =
    store.lastVersionID.fold(Optional.empty[ByteArrayWrapper]())(v => Optional.of(new ByteArrayWrapper(v)))

  /** Applies the removals and updates as a new `version` of the store. */
  override def update(version: ByteArrayWrapper,
                      toUpdate: JList[Pair[ByteArrayWrapper, ByteArrayWrapper]],
                      toRemove: JList[ByteArrayWrapper]): Unit = {

    val listToUpdate = new ArrayBuffer[Tuple2[ByteArrayWrapper,ByteArrayWrapper]]()

    for (r <- toUpdate.asScala) {
      listToUpdate.append(new Tuple2[ByteArrayWrapper, ByteArrayWrapper](r.getKey, r.getValue))
    }

    store.update(version, toRemove.asScala, listToUpdate)
  }

  override def rollback(version : ByteArrayWrapper): Unit = {
    store.rollback(version)
  }

  /** Versions the store can be rolled back to. */
  override def rollbackVersions(): JList[ByteArrayWrapper] = {
    val value = new JArrayList[ByteArrayWrapper]()

    for (v <- store.rollbackVersions())
      value.add(new ByteArrayWrapper(v))

    value
  }

  /** True when no version has been written yet. */
  override def isEmpty(): Boolean = !lastVersionID().isPresent

  override def close(): Unit = {
    store.close()
  }
}
Source File: StoreOpsTest.scala From fs2-blobstore with Apache License 2.0 | 5 votes |
package blobstore

import java.nio.charset.Charset
import java.nio.file.Files
import java.util.concurrent.Executors

import cats.effect.{Blocker, IO}
import cats.effect.laws.util.TestInstances
import cats.implicits._
import fs2.Pipe
import org.scalatest.Assertion
import org.scalatest.flatspec.AnyFlatSpec
import implicits._
import org.scalatest.matchers.must.Matchers

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.ExecutionContext

// NOTE(review): the `IO.contextShift` argument and the pipe that writes the temp
// file were missing in this copy of the file. They have been restored as
// `ExecutionContext.global` and `fs2.io.file.writeAll(p, blocker)` - confirm
// against upstream.
class StoreOpsTest extends AnyFlatSpec with Matchers with TestInstances {

  implicit val cs = IO.contextShift(ExecutionContext.global)
  val blocker = Blocker.liftExecutionContext(ExecutionContext.fromExecutor(Executors.newCachedThreadPool))

  behavior of "PutOps"
  it should "buffer contents and compute size before calling Store.put" in {
    val bytes: Array[Byte] = "AAAAAAAAAA".getBytes(Charset.forName("utf-8"))
    // the store's check asserts that the path carries the computed size
    val store = DummyStore(_.size must be(Some(bytes.length)))

    fs2.Stream.emits(bytes).covary[IO].through(store.bufferedPut(Path("path/to/file.txt"), blocker)).compile.drain.unsafeRunSync()
    store.buf.toArray must be(bytes)
  }

  it should "upload a file from a nio Path" in {
    val bytes = "hello".getBytes(Charset.forName("utf-8"))
    val store = DummyStore(_.size must be(Some(bytes.length)))

    // write the bytes to a temp file (deleted on release), then upload that file
    fs2.Stream.bracket(IO(Files.createTempFile("test-file", ".bin"))) { p =>
      IO(p.toFile.delete).void
    }.flatMap { p =>
      fs2.Stream.emits(bytes).covary[IO].through(fs2.io.file.writeAll(p, blocker)).drain ++
        fs2.Stream.eval(store.put(p, Path("path/to/file.txt"), blocker))
    }.compile.drain.unsafeRunSync()
    store.buf.toArray must be(bytes)
  }

}

/** In-memory store for tests: `put` runs `check` on the target path and appends
  * the written bytes to `buf`. All other operations are unimplemented.
  */
final case class DummyStore(check: Path => Assertion) extends Store[IO] {
  val buf = new ArrayBuffer[Byte]()
  override def put(path: Path): Pipe[IO, Byte, Unit] = {
    check(path)
    in => {
      buf.appendAll(in.compile.toVector.unsafeRunSync())
      fs2.Stream.emit(())
    }
  }
  override def list(path: Path): fs2.Stream[IO, Path] = ???
  override def get(path: Path, chunkSize: Int): fs2.Stream[IO, Byte] = ???
  override def move(src: Path, dst: Path): IO[Unit] = ???
  override def copy(src: Path, dst: Path): IO[Unit] = ???
  override def remove(path: Path): IO[Unit] = ???
}
Source File: MetadataTransformUtils.scala From automl with Apache License 2.0 | 5 votes |
// NOTE(review): this copy of the file is incomplete. The package name is missing
// after `package`, and so are the enclosing object header and the members this
// method uses (DERIVATION, createDerivation, cartesianWithArray). The final `}`
// closes that missing object. Restore them from upstream before compiling.
package
import org.apache.spark.sql.types.{MetadataBuilder, StructField}
import scala.collection.mutable.ArrayBuffer

  /** Builds the metadata for a column produced by crossing several input columns.
    *
    * Starts from the first field's DERIVATION string array (or
    * `createDerivation(numFeatures)` when it has none) and folds every following
    * field in with `cartesianWithArray`, using the same fallback for fields
    * without a DERIVATION entry. The result is stored under DERIVATION on top of
    * the last field's metadata.
    *
    * @param fields      input columns; at least two are required
    * @param numFeatures argument passed to `createDerivation` for fields without DERIVATION
    * @throws IllegalArgumentException if fewer than two fields are given
    */
  def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = {
    if (fields.length < 2) {
      throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2")
    }

    var res = Array[String]()
    if (fields.head.metadata.contains(DERIVATION)) {
      res = fields.head.metadata.getStringArray(DERIVATION)
    } else {
      res = createDerivation(numFeatures)
    }

    for (i <- 1 until fields.length) {
      if (fields(i).metadata.contains(DERIVATION)) {
        res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION))
      } else {
        res = cartesianWithArray(res, createDerivation(numFeatures))
      }
    }

    // all other metadata keys are inherited from the last input field
    val metadata = fields.last.metadata
    new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res)
  }
}
Source File: Message.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): the package name, the java.net.InetSocketAddress and UTF_8 imports
// and the `header.id` argument in `create` were missing in this copy of the file.
// They have been reconstructed - confirm against upstream.
package org.apache.spark.network.nio

import java.net.InetSocketAddress
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import com.google.common.base.Charsets.UTF_8

import org.apache.spark.util.Utils

/** Base class of messages exchanged over the NIO transport, carrying timing,
  * security-negotiation and error flags alongside the payload.
  */
private[nio] abstract class Message(val typ: Long, val id: Int) {
  var senderAddress: InetSocketAddress = null
  var started = false
  var startTime = -1L
  var finishTime = -1L
  var isSecurityNeg = false
  var hasError = false

  def size: Int

  def getChunkForSending(maxChunkSize: Int): Option[MessageChunk]

  def getChunkForReceiving(chunkSize: Int): Option[MessageChunk]

  def timeTaken(): String = (finishTime - startTime).toString + " ms"

  override def toString: String = {
    this.getClass.getSimpleName + "(id = " + id + ", size = " + size + ")"
  }
}

/** Factory methods for [[BufferMessage]] instances plus message id allocation. */
private[nio] object Message {
  val BUFFER_MESSAGE = 1111111111L

  var lastId = 1

  /** Returns the next message id; 0 is skipped when the counter wraps around. */
  def getNewId(): Int = synchronized {
    lastId += 1
    if (lastId == 0) {
      lastId += 1
    }
    lastId
  }

  /** Creates a message over `dataBuffers`; a null sequence yields an empty message.
    *
    * @throws Exception if any individual buffer is null
    */
  def createBufferMessage(dataBuffers: Seq[ByteBuffer], ackId: Int): BufferMessage = {
    if (dataBuffers == null) {
      new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer], ackId)
    } else if (dataBuffers.exists(_ == null)) {
      throw new Exception("Attempting to create buffer message with null buffer")
    } else {
      new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer] ++= dataBuffers, ackId)
    }
  }

  def createBufferMessage(dataBuffers: Seq[ByteBuffer]): BufferMessage =
    createBufferMessage(dataBuffers, 0)

  /** Creates a single-buffer message; a null buffer is replaced by an empty one. */
  def createBufferMessage(dataBuffer: ByteBuffer, ackId: Int): BufferMessage = {
    if (dataBuffer == null) {
      // a buffer must exist before it can be read or written; allocate an empty one
      createBufferMessage(Array(ByteBuffer.allocate(0)), ackId)
    } else {
      createBufferMessage(Array(dataBuffer), ackId)
    }
  }

  def createBufferMessage(dataBuffer: ByteBuffer): BufferMessage =
    createBufferMessage(dataBuffer, 0)

  /** Creates a message with no payload that only acknowledges `ackId`. */
  def createBufferMessage(ackId: Int): BufferMessage = {
    createBufferMessage(new Array[ByteBuffer](0), ackId)
  }

  /** Creates a message flagged as an error whose payload is the UTF-8 encoded
    * string form of `exception`.
    */
  def createErrorMessage(exception: Exception, ackId: Int): BufferMessage = {
    val exceptionString = Utils.exceptionString(exception)
    val serializedExceptionString = ByteBuffer.wrap(exceptionString.getBytes(UTF_8))
    val errorMessage = createBufferMessage(serializedExceptionString, ackId)
    errorMessage.hasError = true
    errorMessage
  }

  /** Creates an empty receiving-side message sized from a chunk header.
    * Only BUFFER_MESSAGE headers are handled; any other type fails the match.
    */
  def create(header: MessageChunkHeader): Message = {
    val newMessage: Message = header.typ match {
      case BUFFER_MESSAGE => new BufferMessage(header.id,
        // pre-allocate one buffer large enough for the whole message
        ArrayBuffer(ByteBuffer.allocate(header.totalSize)), header.other)
    }
    newMessage.hasError = header.hasError
    newMessage.senderAddress = header.address
    newMessage
  }
}
Source File: ApplicationInfo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.util.Date

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.deploy.ApplicationDescription
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.util.Utils

/** Master-side bookkeeping for one submitted application.
  *
  * All mutable state is `@transient` and is re-created by `init()` both at
  * construction and after Java deserialization.
  *
  * NOTE(review): `init()` is called but not defined in this copy of the file, so
  * the excerpt appears truncated - restore it from upstream.
  */
private[spark] class ApplicationInfo(
    val startTime: Long,
    val id: String,
    val desc: ApplicationDescription,
    val submitDate: Date,
    val driver: RpcEndpointRef,
    defaultCores: Int)
  extends Serializable {

  // enumeration-typed state
  @transient var state: ApplicationState.Value = _
  @transient var executors: mutable.HashMap[Int, ExecutorDesc] = _
  @transient var removedExecutors: ArrayBuffer[ExecutorDesc] = _
  @transient var coresGranted: Int = _
  @transient var endTime: Long = _
  @transient var appSource: ApplicationSource = _

  // A cap on the number of executors this application can have at any given time.
  // By default, this is infinite. Only after the first allocation request is issued by the
  // application will this be set to a finite value. This is used for dynamic allocation.
  @transient private[master] var executorLimit: Int = _

  @transient private var nextExecutorId: Int = _

  init()

  // Java serialization hook: restore the non-transient fields, then rebuild the
  // transient state.
  // NOTE(review): the parameter type was missing in this copy of the file (it
  // read `in: Unit = ...`); java.io.ObjectInputStream is the type that provides
  // defaultReadObject().
  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private[deploy] def getExecutorLimit: Int = executorLimit

  /** Run time in milliseconds: up to `endTime` if one is set, otherwise up to now. */
  def duration: Long = {
    if (endTime != -1) {
      endTime - startTime
    } else {
      System.currentTimeMillis() - startTime
    }
  }
}
Source File: Schedulable.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

/** Interface for a schedulable entity: a node in a tree that has a parent
  * [[Pool]] and a queue of child schedulables.
  */
private[spark] trait Schedulable {
  // the pool this entity belongs to
  var parent: Pool
  // child queues
  def schedulableQueue: ConcurrentLinkedQueue[Schedulable]
  def schedulingMode: SchedulingMode
  def weight: Int
  def minShare: Int
  def runningTasks: Int
  def priority: Int
  def stageId: Int
  def name: String

  // maintenance of the child queue
  def addSchedulable(schedulable: Schedulable): Unit
  def removeSchedulable(schedulable: Schedulable): Unit
  def getSchedulableByName(name: String): Schedulable
  // notification that an executor on `host` was lost
  def executorLost(executorId: String, host: String): Unit
  def checkSpeculatableTasks(): Boolean
  // task set managers in the order they should be scheduled
  def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager]
}
Source File: ByteArrayChunkOutputStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): this copy of the file is incomplete. The package name and one
// import are missing, and so are the class header and the declarations of
// `chunkSize`, `chunks` and `lastChunkIndex` that the methods below use. The
// final `}` closes that missing class. Restore them from upstream before compiling.
package
import
import scala.collection.mutable.ArrayBuffer

  // Write offset inside the last chunk. Starting at chunkSize marks the (not yet
  // existing) last chunk as full, so the first write allocates a chunk.
  private var position = chunkSize

  /** Appends a single byte, allocating a new chunk first if the last one is full. */
  override def write(b: Int): Unit = {
    allocateNewChunkIfNeeded()
    // index the chunk list first, then the offset inside that chunk
    chunks(lastChunkIndex)(position) = b.toByte
    position += 1
  }

  /** Appends `len` bytes of `bytes` starting at `off`, spilling into new chunks as needed. */
  override def write(bytes: Array[Byte], off: Int, len: Int): Unit = {
    var written = 0
    while (written < len) {
      allocateNewChunkIfNeeded()
      // copy as much as fits in the current chunk, capped by what is left to write
      val thisBatch = math.min(chunkSize - position, len - written)
      System.arraycopy(bytes, written + off, chunks(lastChunkIndex), position, thisBatch)
      written += thisBatch
      position += thisBatch
    }
  }

  /** Starts a fresh chunk when the current one is full. */
  @inline
  private def allocateNewChunkIfNeeded(): Unit = {
    if (position == chunkSize) {
      chunks += new Array[Byte](chunkSize)
      lastChunkIndex += 1
      position = 0
    }
  }

  /** Returns the written data as an array of chunks; the last chunk is trimmed to
    * the bytes actually written. Returns an empty array if nothing was written.
    */
  def toArrays: Array[Array[Byte]] = {
    if (lastChunkIndex == -1) {
      new Array[Array[Byte]](0)
    } else {
      // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk.
      // An alternative would have been returning an array of ByteBuffers, with the last buffer
      // bounded to only the last chunk's position. However, given our use case in Spark (to put
      // the chunks in block manager), only limiting the view bound of the buffer would still
      // require the block manager to store the whole chunk.
      val ret = new Array[Array[Byte]](chunks.size)
      for (i <- 0 until chunks.size - 1) {
        ret(i) = chunks(i)
      }
      if (position == chunkSize) {
        // the last chunk is exactly full: share it without copying
        ret(lastChunkIndex) = chunks(lastChunkIndex)
      } else {
        ret(lastChunkIndex) = new Array[Byte](position)
        System.arraycopy(chunks(lastChunkIndex), 0, ret(lastChunkIndex), 0, position)
      }
      ret
    }
  }
}
Source File: MapPartitionsWithPreparationRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): this copy of the file is incomplete. The class header and the
// members this method uses (preparedArguments, preparePartition, executePartition)
// are missing, and the final `}` closes that missing class. Restore them from
// upstream before compiling.
package org.apache.spark.rdd

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Partition, Partitioner, TaskContext}

  /** Computes one partition.
    *
    * Takes the first entry of `preparedArguments` if one is queued, otherwise
    * calls `preparePartition()` now, and passes that value together with the
    * parent's iterator to `executePartition`.
    */
  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val prepared =
      if (preparedArguments.isEmpty) {
        preparePartition()
      } else {
        // consume the queued value so that it is used only once
        preparedArguments.remove(0)
      }
    val parentIterator = firstParent[T].iterator(partition, context)
    executePartition(context, partition.index, prepared, parentIterator)
  }
}
Source File: UnionRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

// NOTE(review): the `java.io.` prefix of this import and the size expression of
// the partition array in getPartitions were missing in this copy of the file.
// They have been reconstructed - confirm against upstream.
import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

/** Partition of a [[UnionRDD]], wrapping one partition of one parent RDD.
  *
  * @param idx                     index of this partition in the union
  * @param rdd                     the parent RDD this partition belongs to
  * @param parentRddIndex          index of the parent RDD within the union
  * @param parentRddPartitionIndex index of the wrapped partition within the parent RDD
  */
private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

/** RDD whose partitions are the partitions of all `rdds`, concatenated in order. */
@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    // one slot per parent partition, filled in parent order
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    // `pos` is the offset of each parent's first partition within the union
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    // drop the references to the parent RDDs
    rdds = null
  }
}
Source File: hbaseCommands.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hbase._
import org.apache.spark.sql.hbase.util.DataTypeUtils
import org.apache.spark.sql.types._

import scala.collection.mutable.ArrayBuffer

/** Drops the non-key column `columnName` from `namespace`.`tableName` through the HBase catalog. */
@DeveloperApi
case class AlterDropColCommand(namespace: String, tableName: String, columnName: String)
  extends RunnableCommand {

  def run(sparkSession: SparkSession): Seq[Row] = {
    // Bind the catalog once, mirroring AlterAddColCommand.
    val hbaseCatalog = sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog]
    hbaseCatalog.alterTableDropNonKey(namespace, tableName, columnName)
    hbaseCatalog.stopAdmin()
    Seq.empty[Row]
  }
}

/** Adds a non-key column, described by name, type, family and qualifier, through the HBase catalog. */
@DeveloperApi
case class AlterAddColCommand(
    namespace: String,
    tableName: String,
    colName: String,
    colType: String,
    colFamily: String,
    colQualifier: String)
  extends RunnableCommand {

  def run(sparkSession: SparkSession): Seq[Row] = {
    val hbaseCatalog = sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog]
    hbaseCatalog.alterTableAddNonKey(
      namespace,
      tableName,
      NonKeyColumn(colName, DataTypeUtils.getDataType(colType), colFamily, colQualifier))
    hbaseCatalog.stopAdmin()
    Seq.empty[Row]
  }
}

/**
 * Inserts one row, given as string literals in schema order, into the HBase relation `tid`.
 *
 * @param tid target table
 * @param valueSeq one string per column; each is converted using the data type of the schema
 *                 field at the same position
 */
@DeveloperApi
case class InsertValueIntoTableCommand(tid: TableIdentifier, valueSeq: Seq[String])
  extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    // NOTE(review): `relation` is null when the lookup yields nothing, and the conversion below
    // would then fail with a NullPointerException — confirm whether callers rely on that.
    val relation: HBaseRelation = sparkSession.sessionState.catalog.externalCatalog
      .asInstanceOf[HBaseCatalog]
      .getHBaseRelation(tid.database.getOrElse(null), tid.table).getOrElse(null)

    // Pair every literal with its column position to pick the matching schema field.
    val bytes = valueSeq.zipWithIndex.map(v =>
      DataTypeUtils.string2TypeData(v._1, relation.schema(v._2).dataType))

    val rows = sparkSession.sparkContext.makeRDD(Seq(Row.fromSeq(bytes)))
    val inputValuesDF = sparkSession.createDataFrame(rows, relation.schema)
    relation.insert(inputValuesDF, overwrite = false)
    Seq.empty[Row]
  }

  override def output: Seq[Attribute] = Seq.empty
}
Source File: MeetupReceiver.scala From meetup-stream with Apache License 2.0 | 5 votes |
package receiver

import java.io.{BufferedReader, InputStream, InputStreamReader, PipedInputStream, PipedOutputStream}

import scala.collection.mutable.ArrayBuffer

import com.ning.http.client._
import com.ning.http.client.AsyncHttpClientConfig
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

/**
 * Receiver that issues a GET to `url`, pipes the response body through an in-process pipe and
 * stores every line read from that pipe.
 *
 * @param url endpoint to read from
 */
class MeetupReceiver(url: String)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  @transient var client: AsyncHttpClient = _
  @transient var inputPipe: PipedInputStream = _
  @transient var outputPipe: PipedOutputStream = _

  /** Opens the HTTP connection and starts the thread that drains the pipe into `store`. */
  def onStart(): Unit = {
    val cf = new AsyncHttpClientConfig.Builder()
    cf.setRequestTimeout(Integer.MAX_VALUE)
    cf.setReadTimeout(Integer.MAX_VALUE)
    cf.setPooledConnectionIdleTimeout(Integer.MAX_VALUE)
    client = new AsyncHttpClient(cf.build())

    inputPipe = new PipedInputStream(1024 * 1024)
    outputPipe = new PipedOutputStream(inputPipe)

    // This thread reads from the pipe (it runs a DataConsumer); the HTTP handler below writes.
    val consumerThread = new Thread(new DataConsumer(inputPipe))
    consumerThread.start()

    client.prepareGet(url).execute(new AsyncHandler[Unit] {

      def onBodyPartReceived(bodyPart: HttpResponseBodyPart) = {
        bodyPart.writeTo(outputPipe)
        AsyncHandler.STATE.CONTINUE
      }

      def onStatusReceived(status: HttpResponseStatus) = {
        AsyncHandler.STATE.CONTINUE
      }

      def onHeadersReceived(headers: HttpResponseHeaders) = {
        AsyncHandler.STATE.CONTINUE
      }

      def onCompleted = {
        println("completed")
      }

      def onThrowable(t: Throwable) = {
        t.printStackTrace()
      }
    })
  }

  /** Closes whichever of client, output pipe and input pipe were created, in that order. */
  def onStop(): Unit = {
    Option(client).foreach(_.close())
    Option(outputPipe).foreach { pipe =>
      pipe.flush()
      pipe.close()
    }
    Option(inputPipe).foreach(_.close())
  }

  /** Reads `inputStream` line by line and stores each line until the stream ends. */
  class DataConsumer(inputStream: InputStream) extends Runnable {
    override def run(): Unit = {
      val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))
      var input = bufferedReader.readLine()
      while (input != null) {
        store(input)
        input = bufferedReader.readLine()
      }
    }
  }
}
Source File: HashBasedDeduplicator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package import odkl.analysis.spark.util.Logging import org.apache.spark.annotation.DeveloperApi import import import import import{BLAS, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuffer def setSimilarityTreshold(value: Double): this.type = set(similarityThreshold, value) setDefault(new ParamPair[String](inputColHash,"hash"), new ParamPair[Double](similarityThreshold,0.9)) def this() = this(Identifiable.randomUID("hashBasedDeduplication")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sqlContext.createDataFrame( dataset.toDF .repartition(dataset.col($(inputColHash))) .sortWithinPartitions($(inputColHash)) .rdd .mapPartitions((f: Iterator[Row]) => { if (f.hasNext) { var curHash: Long = -1L val vectorsBuffer = new ArrayBuffer[Vector](0) // unique vectors buffer for this bucket for (it <- f) yield { val newHash = it.getAs[Long]($(inputColHash)) if (newHash == curHash) { val currentVector = it.getAs[Vector]($(inputColVector)) val isUnique = vectorsBuffer.forall(storedVector => { //are this vector is "different" with other in buffer? (, currentVector) / (norm(storedVector, 2) * norm(currentVector, 2))) < $(similarityThreshold) //is unsimilar? }) if (isUnique) { vectorsBuffer.append(currentVector) it } else { Row.empty //dummy Row } } else { vectorsBuffer.clear() vectorsBuffer.append(it.getAs[Vector]($(inputColVector))) curHash = newHash it } } } else { new Array[Row](0).toIterator //empty partition? } }).filter(!_.equals(Row.empty)), //filter dummy transformSchema(dataset.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
Source File: NonSampleCompactor.scala From deequ with Apache License 2.0 | 5 votes |
package import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.util.Random val output = (offset until len by 2).map(sortedBuffer(_)).toArray val tail = findOdd(items) items = items % 2 var newBuffer = ArrayBuffer[T]() if (tail.isDefined) { newBuffer = newBuffer :+ tail.get } buffer = newBuffer numOfCompress = numOfCompress + 1 output } }
Source File: ScaleAndConvert.scala From SparkNet with MIT License | 5 votes |
package preprocessing

import java.awt.image.DataBufferByte
import java.io.ByteArrayInputStream
import javax.imageio.ImageIO

import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConversions._

import net.coobird.thumbnailator._
import org.apache.spark.rdd.RDD

import libs._

object ScaleAndConvert {

  /**
   * Converts an image to a planar byte array of length `3 * height * width`: all red values
   * first, then all green, then all blue, each plane in row-major order.
   *
   * @param image source image
   * @return the three colour planes concatenated
   */
  def BufferedImageToByteArray(image: java.awt.image.BufferedImage): Array[Byte] = {
    val height = image.getHeight()
    val width = image.getWidth()
    val pixels = image.getRGB(0, 0, width, height, null, 0, width)
    val planeSize = height * width
    val result = new Array[Byte](3 * planeSize)
    var row = 0
    while (row < height) {
      var col = 0
      while (col < width) {
        val offset = row * width + col
        val rgb = pixels(offset)
        result(offset) = ((rgb >> 16) & 0xFF).toByte
        result(planeSize + offset) = ((rgb >> 8) & 0xFF).toByte
        result(2 * planeSize + offset) = (rgb & 0xFF).toByte
        col += 1
      }
      row += 1
    }
    result
  }

  /**
   * Decodes a compressed image, forces it to `width` x `height` and returns its planar bytes.
   *
   * @param compressedImage encoded image bytes
   * @param height target height in pixels
   * @param width target width in pixels
   * @return the planar bytes, or None when decoding or resizing raises one of the exceptions
   *         handled below
   */
  def decompressImageAndResize(
      compressedImage: Array[Byte],
      height: Int,
      width: Int): Option[Array[Byte]] = {
    // this method takes a JPEG, decompresses it, and resizes it
    try {
      val im = ImageIO.read(new ByteArrayInputStream(compressedImage))
      val resizedImage = Thumbnails.of(im).forceSize(width, height).asBufferedImage()
      Some(BufferedImageToByteArray(resizedImage))
    } catch {
      // If images can't be processed properly, just ignore them
      case _: java.lang.IllegalArgumentException |
           _: javax.imageio.IIOException |
           _: java.lang.NullPointerException => None
    }
  }
}
Source File: ClassRDDPartitioner.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector.rdd.partitioner import scala.collection.JavaConversions.iterableAsScalaIterable import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.Partition import com.metreta.spark.orientdb.connector.api.OrientDBConnector import com.orientechnologies.orient.core.metadata.schema.OClass import com.orientechnologies.orient.core.metadata.schema.OSchema import import com.metreta.spark.orientdb.connector.SystemTables import scala.collection.JavaConversions.iterableAsScalaIterable def getPartitions(): Array[Partition] = { val db = connector.databaseDocumentTx() var partitions = new ArrayBuffer[OrientPartition] val schema: OSchema = connector.getSchema(db) var klass: OClass = schema.getClass(mClass) val storage: OStorage = connector.getStorage(db) klass.getClusterIds.zipWithIndex foreach { case (clusterId, index) => partitions = partitions.+=(OrientPartition( index, null, // <- Host Address ????? PartitionName(klass.getName, storage.getClusterById(clusterId).getName))) } partitions.toArray } }
Source File: SparkContextFunctionsSpec.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfterAll

import com.metreta.spark.orientdb.connector.utils.BaseOrientDbFlatSpec
import com.orientechnologies.orient.core.id.ORID

/** Checks that vertex ids derived from record-id strings are unique and not negative. */
class SparkContextFunctionsSpec extends BaseOrientDbFlatSpec {

  var oridList: ArrayBuffer[String] = new ArrayBuffer
  var MaxCluster = 1000
  var MaxRecord = 1000

  override def beforeAll(): Unit = {
    initSparkConf(defaultSparkConf)
    createOridList()
  }

  override def afterAll(): Unit = {
    sparkContext.stop()
  }

  "A VertexId created from RID" should "be unique" in {
    val vertexIdList = oridList map { rid => sparkContext.getVertexIdFromString(rid) }
    val duplicatedValues = vertexIdList.groupBy(identity).collect {
      case (x, ys) if ys.lengthCompare(1) > 0 => x
    }
    duplicatedValues shouldBe empty
  }

  it should "be a positive number" in {
    // Only ids below zero are rejected; zero passes this check.
    val negativeValues = oridList filter { rid => sparkContext.getVertexIdFromString(rid) < 0 }
    negativeValues shouldBe empty
  }

  /** Fills `oridList` with one record id per (cluster, record) pair, both bounds inclusive. */
  def createOridList(): Unit = {
    for (clusterId <- 0 to MaxCluster; recordId <- 0 to MaxRecord) {
      oridList += s"${ORID.PREFIX}$clusterId${ORID.SEPARATOR}$recordId"
    }
  }
}
Source File: SpearmanCorrelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 val cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Source File: ApplicationMasterArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import scala.collection.mutable.ArrayBuffer

/**
 * Parses the command line handed to the YARN ApplicationMaster.
 *
 * Every recognised option takes exactly one value. Anything else prints the usage text and
 * exits with code 1. Giving both a primary Python file and a primary R file exits with code -1.
 *
 * @param args raw command-line arguments
 */
class ApplicationMasterArguments(val args: Array[String]) {
  var userJar: String = null
  var userClass: String = null
  var primaryPyFile: String = null
  var primaryRFile: String = null
  var userArgs: Seq[String] = Nil
  var propertiesFile: String = null

  parseArgs(args.toList)

  private def parseArgs(inputArgs: List[String]): Unit = {
    val userArgsBuffer = new ArrayBuffer[String]()

    // Consumes one "--flag value" pair per step until the list is exhausted.
    @scala.annotation.tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil =>

      case "--jar" :: value :: tail =>
        userJar = value
        consume(tail)

      case "--class" :: value :: tail =>
        userClass = value
        consume(tail)

      case "--primary-py-file" :: value :: tail =>
        primaryPyFile = value
        consume(tail)

      case "--primary-r-file" :: value :: tail =>
        primaryRFile = value
        consume(tail)

      case "--arg" :: value :: tail =>
        userArgsBuffer += value
        consume(tail)

      case "--properties-file" :: value :: tail =>
        propertiesFile = value
        consume(tail)

      case _ =>
        // Unknown flag, or a known flag without its value.
        printUsageAndExit(1, remaining)
    }

    consume(inputArgs)

    if (primaryPyFile != null && primaryRFile != null) {
      // scalastyle:off println
      System.err.println("Cannot have primary-py-file and primary-r-file at the same time")
      // scalastyle:on println
      System.exit(-1)
    }

    userArgs = userArgsBuffer.toList
  }

  /**
   * Prints the usage text to stderr, preceded by the offending parameter when one is given,
   * then terminates the JVM.
   *
   * @param exitCode process exit code
   * @param unknownParam the unparsed remainder to report, or null for none
   */
  def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit = {
    // scalastyle:off println
    if (unknownParam != null) {
      System.err.println("Unknown/unsupported param " + unknownParam)
    }
    System.err.println("""
      |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options]
      |Options:
      |  --jar JAR_PATH       Path to your application's JAR file
      |  --class CLASS_NAME   Name of your application's main class
      |  --primary-py-file    A main Python file
      |  --primary-r-file     A main R file
      |  --arg ARG            Argument to be passed to your application's main class.
      |                       Multiple invocations are possible, each will be passed in order.
      |  --properties-file FILE Path to a custom Spark properties file.
      """.stripMargin)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

object ApplicationMasterArguments {
  val DEFAULT_NUMBER_EXECUTORS = 2
}
Source File: ClientArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn

import scala.collection.mutable.ArrayBuffer

// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !

/**
 * Parses the command line handed to the YARN client.
 *
 * Every recognised option takes exactly one value.
 *
 * @param args raw command-line arguments
 * @throws IllegalArgumentException on an unknown option, an option without its value, or when
 *                                  both a primary Python file and a primary R file are given
 */
private[spark] class ClientArguments(args: Array[String]) {

  var userJar: String = null
  var userClass: String = null
  var primaryPyFile: String = null
  var primaryRFile: String = null
  var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()

  parseArgs(args.toList)

  private def parseArgs(inputArgs: List[String]): Unit = {
    // Consumes one "--flag value" pair per step until the list is exhausted.
    @scala.annotation.tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil =>

      case "--jar" :: value :: tail =>
        userJar = value
        consume(tail)

      case "--class" :: value :: tail =>
        userClass = value
        consume(tail)

      case "--primary-py-file" :: value :: tail =>
        primaryPyFile = value
        consume(tail)

      case "--primary-r-file" :: value :: tail =>
        primaryRFile = value
        consume(tail)

      case "--arg" :: value :: tail =>
        userArgs += value
        consume(tail)

      case _ =>
        throw new IllegalArgumentException(getUsageMessage(remaining))
    }

    consume(inputArgs)

    if (primaryPyFile != null && primaryRFile != null) {
      throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
        " at the same time")
    }
  }

  /** Builds the usage text, prefixed with the offending remainder when one is given. */
  private def getUsageMessage(unknownParam: List[String] = null): String = {
    val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
    message +
      s"""
      |Usage: org.apache.spark.deploy.yarn.Client [options]
      |Options:
      |  --jar JAR_PATH           Path to your application's JAR file (required in yarn-cluster
      |                           mode)
      |  --class CLASS_NAME       Name of your application's main class (required)
      |  --primary-py-file        A main Python file
      |  --primary-r-file         A main R file
      |  --arg ARG                Argument to be passed to your application's main class.
      |                           Multiple invocations are possible, each will be passed in order.
      """.stripMargin
  }
}
Source File: KPLBasedKinesisTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

/** Test utilities whose aggregating producer is a [[KPLDataGenerator]]. */
private[kinesis] class KPLBasedKinesisTestUtils(streamShardCount: Int = 2)
  extends KinesisTestUtils(streamShardCount) {

  override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = {
    if (!aggregate) {
      new SimpleDataGenerator(kinesisClient)
    } else {
      new KPLDataGenerator(regionName)
    }
  }
}

/**
 * Data generator that sends records through the Kinesis Producer Library.
 *
 * @param regionName region passed to the producer configuration
 */
private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator {

  private lazy val producer: KPLProducer = {
    val conf = new KinesisProducerConfiguration()
      .setRecordMaxBufferedTime(1000)
      .setMaxConnections(1)
      .setRegion(regionName)
      .setMetricsLevel("none")
    new KPLProducer(conf)
  }

  /**
   * Sends every number as its decimal string (UTF-8), using that string as the partition key,
   * and flushes the producer before returning.
   *
   * @param streamName target stream
   * @param data numbers to send
   * @return for each shard id reported by a successful send, the (number, sequence number) pairs
   */
  override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = {
    // NOTE(review): this map is mutated from the producer's callbacks and is not synchronized —
    // confirm that all callbacks have completed by the time flushSync() returns.
    val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()
    data.foreach { num =>
      val str = num.toString
      // Named `payload` so it no longer shadows the `data` parameter.
      val payload = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8))
      val future = producer.addUserRecord(streamName, str, payload)
      val kinesisCallBack = new FutureCallback[UserRecordResult]() {
        override def onFailure(t: Throwable): Unit = {} // do nothing

        override def onSuccess(result: UserRecordResult): Unit = {
          val shardId = result.getShardId
          val seqNumber = result.getSequenceNumber()
          val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId,
            new ArrayBuffer[(Int, String)]())
          sentSeqNumbers += ((num, seqNumber))
        }
      }
      Futures.addCallback(future, kinesisCallBack)
    }
    producer.flushSync()
    shardIdToSeqNumbers.toMap
  }
}
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
 * Rule that replaces an exchange by a `ReusedExchangeExec` pointing at an earlier exchange in
 * the same plan when `sameResult` reports the two as equivalent. The plan is returned unchanged
 * when `conf.exchangeReuseEnabled` is false.
 */
case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      plan
    } else {
      // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
      val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
      plan.transformUp {
        case exchange: Exchange =>
          // the exchanges that have same results usually also have same schemas (same column
          // names).
          val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
          sameSchema.find(candidate => exchange.sameResult(candidate)) match {
            case Some(earlier) =>
              // Keep the output of this exchange, the following plans require that to resolve
              // attributes.
              ReusedExchangeExec(exchange.output, earlier)
            case None =>
              // First exchange with this result: remember it for later lookups.
              sameSchema += exchange
              exchange
          }
      }
    }
  }
}
Source File: SQLAppStatusStore.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.ui

import java.lang.{Long => JLong}
import java.util.Date

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import com.fasterxml.jackson.annotation.JsonIgnore
import com.fasterxml.jackson.databind.annotation.JsonDeserialize

import org.apache.spark.JobExecutionStatus
import org.apache.spark.status.KVUtils.KVIndexParam
import org.apache.spark.util.kvstore.{KVIndex, KVStore}

/**
 * Holder for either a plain graph node or a cluster wrapper; exactly one of the two must be
 * non-null.
 *
 * @param node the plain node, or null when `cluster` is set
 * @param cluster the cluster wrapper, or null when `node` is set
 */
class SparkPlanGraphNodeWrapper(
    val node: SparkPlanGraphNode,
    val cluster: SparkPlanGraphClusterWrapper) {

  /** Returns `node` when it is set, otherwise the cluster converted to a graph node. */
  def toSparkPlanGraphNode(): SparkPlanGraphNode = {
    // XOR: fails when both are null or both are set.
    assert(node == null ^ cluster == null, "Exactly one of node or cluster must be set.")
    if (node != null) node else cluster.toSparkPlanGraphCluster()
  }
}

/** A SQL plan metric: display name, accumulator id and metric type. */
case class SQLPlanMetric(
    name: String,
    accumulatorId: Long,
    metricType: String)
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import import def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. 
val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Source File: BatchEvalPythonExecSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.api.python.{PythonEvalType, PythonFunction}
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, In}
import org.apache.spark.sql.execution.{FilterExec, InputAdapter, SparkPlanTest, WholeStageCodegenExec}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.BooleanType

/**
 * Plan-shape tests for filters combined with a Python UDF. Each test collects the physical
 * nodes matching the expected pattern and checks how many were found; the UDF is never run.
 */
class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext {
  import testImplicits.newProductEncoder
  import testImplicits.localSeqToDatasetHolder

  override def beforeAll(): Unit = {
    super.beforeAll()
    spark.udf.registerPython("dummyPythonUDF", new MyDummyPythonUDF)
  }

  override def afterAll(): Unit = {
    spark.sessionState.functionRegistry.dropFunction(FunctionIdentifier("dummyPythonUDF"))
    super.afterAll()
  }

  test("Python UDF: push down deterministic FilterExec predicates") {
    val df = Seq(("Hello", 4)).toDF("a", "b")
      .where("dummyPythonUDF(b) and dummyPythonUDF(a) and a in (3, 4)")
    val qualifiedPlanNodes = df.queryExecution.executedPlan.collect {
      case f @ FilterExec(
          And(_: AttributeReference, _: AttributeReference),
          InputAdapter(_: BatchEvalPythonExec)) => f
      case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(FilterExec(_: In, _))) => b
    }
    assert(qualifiedPlanNodes.size == 2)
  }

  test("Nested Python UDF: push down deterministic FilterExec predicates") {
    val df = Seq(("Hello", 4)).toDF("a", "b")
      .where("dummyPythonUDF(a, dummyPythonUDF(a, b)) and a in (3, 4)")
    val qualifiedPlanNodes = df.queryExecution.executedPlan.collect {
      case f @ FilterExec(_: AttributeReference, InputAdapter(_: BatchEvalPythonExec)) => f
      case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(FilterExec(_: In, _))) => b
    }
    assert(qualifiedPlanNodes.size == 2)
  }

  test("Python UDF: no push down on non-deterministic") {
    val df = Seq(("Hello", 4)).toDF("a", "b")
      .where("b > 4 and dummyPythonUDF(a) and rand() > 0.3")
    val qualifiedPlanNodes = df.queryExecution.executedPlan.collect {
      case f @ FilterExec(
          And(_: AttributeReference, _: GreaterThan),
          InputAdapter(_: BatchEvalPythonExec)) => f
      case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(_: FilterExec)) => b
    }
    assert(qualifiedPlanNodes.size == 2)
  }

  test("Python UDF: push down on deterministic predicates after the first non-deterministic") {
    val df = Seq(("Hello", 4)).toDF("a", "b")
      .where("dummyPythonUDF(a) and rand() > 0.3 and b > 4")
    val qualifiedPlanNodes = df.queryExecution.executedPlan.collect {
      case f @ FilterExec(
          And(_: AttributeReference, _: GreaterThan),
          InputAdapter(_: BatchEvalPythonExec)) => f
      case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(_: FilterExec)) => b
    }
    assert(qualifiedPlanNodes.size == 2)
  }

  test("Python UDF refers to the attributes from more than one child") {
    val df = Seq(("Hello", 4)).toDF("a", "b")
    val df2 = Seq(("Hello", 4)).toDF("c", "d")
    val joinDF = df.crossJoin(df2).where("dummyPythonUDF(a, c) == dummyPythonUDF(d, c)")
    val qualifiedPlanNodes = joinDF.queryExecution.executedPlan.collect {
      case b: BatchEvalPythonExec => b
    }
    assert(qualifiedPlanNodes.size == 1)
  }
}

// This Python UDF is dummy and just for testing. Unable to execute.
class DummyUDF extends PythonFunction(
  command = Array[Byte](),
  envVars = Map("" -> "").asJava,
  pythonIncludes = ArrayBuffer("").asJava,
  pythonExec = "",
  pythonVer = "",
  broadcastVars = null,
  accumulator = null)

/** Boolean-typed, deterministic batched SQL UDF backed by [[DummyUDF]]. */
class MyDummyPythonUDF extends UserDefinedPythonFunction(
  name = "dummyUDF",
  func = new DummyUDF,
  dataType = BooleanType,
  pythonEvalType = PythonEvalType.SQL_BATCHED_UDF,
  udfDeterministic = true)
Source File: UnionDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

/**
 * DStream whose RDD for a batch is the union of the RDDs its parents produce for that batch.
 *
 * @param parents streams to union; must be non-empty and share one context and one slide
 *                duration
 * @throws IllegalArgumentException when any of those conditions does not hold
 */
private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T]({
    // Checked here, not in the class body: the super-constructor argument is evaluated first,
    // so `parents.head` on an empty array would throw before a body-level require could run.
    require(parents.length > 0, "List of DStreams to union is empty")
    parents.head.ssc
  }) {

  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  /**
   * Unions the parents' RDDs for `validTime`.
   *
   * @throws SparkException when any parent yields no RDD for `validTime`
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.nonEmpty) {
      Some(ssc.sc.union(rdds))
    } else {
      None
    }
  }
}
Source File: QueueInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

/**
 * Input stream fed from an in-memory queue of RDDs.
 *
 * @param ssc the streaming context
 * @param queue source queue; access is synchronized on the queue itself
 * @param oneAtATime when true each batch takes at most one RDD, otherwise the whole queue
 * @param defaultRDD RDD returned for a batch when the queue yielded nothing; may be null
 */
private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start(): Unit = {}

  override def stop(): Unit = {}

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  /**
   * Takes RDDs off the queue for this batch.
   *
   * @return the single dequeued RDD or the union of all drained RDDs; otherwise `defaultRDD`
   *         when it is non-null; otherwise an empty RDD
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        // Also reached when oneAtATime is set and the queue is empty; draining is then a no-op.
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(ssc.sparkContext, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
Source File: LocalSparkCluster.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Source File: Schedulable.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

/**
 * Abstract interface of a schedulable entity. Every member is abstract here; the actual
 * semantics are supplied by implementations that are not visible in this file.
 */
private[spark] trait Schedulable {
  var parent: Pool
  // child queues
  def schedulableQueue: ConcurrentLinkedQueue[Schedulable]
  def schedulingMode: SchedulingMode
  def weight: Int
  def minShare: Int
  def runningTasks: Int
  def priority: Int
  def stageId: Int
  def name: String

  def addSchedulable(schedulable: Schedulable): Unit
  def removeSchedulable(schedulable: Schedulable): Unit
  def getSchedulableByName(name: String): Schedulable
  def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit
  def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean
  def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager]
}
Source File: ChunkedByteBufferOutputStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and the imports marked "restored" were empty keywords in the
// collapsed text; they are a best reading (the code overrides `write`/`close` of an output
// stream and calls `StorageUtils.dispose`) — confirm against the original file.
package org.apache.spark.util.io // restored

import java.io.OutputStream // restored
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.storage.StorageUtils // restored

// NOTE(review): the class declaration and the members `chunkSize`, `allocator`, `chunks`,
// `lastChunkIndex` and `toChunkedByteBufferWasCalled` used below are not part of this excerpt;
// the final brace closes that class.

  // Write offset inside the last chunk. It starts at chunkSize so that the first write sees a
  // "full" chunk and allocates one (see allocateNewChunkIfNeeded).
  private[this] var position = chunkSize
  // Total bytes written. Kept as a Long: `size` is declared Long, and an Int counter would wrap
  // around once more than Int.MaxValue bytes have been written.
  private[this] var _size = 0L
  private[this] var closed: Boolean = false

  /** Number of bytes written so far. */
  def size: Long = _size

  /** Closes the stream; calls after the first one do nothing. */
  override def close(): Unit = {
    if (!closed) {
      super.close()
      closed = true
    }
  }

  /** Writes one byte (the low 8 bits of `b`), allocating a new chunk when the last is full. */
  override def write(b: Int): Unit = {
    require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream")
    allocateNewChunkIfNeeded()
    chunks(lastChunkIndex).put(b.toByte)
    position += 1
    _size += 1
  }

  /** Writes `len` bytes of `bytes` starting at `off`, spilling over into new chunks as needed. */
  override def write(bytes: Array[Byte], off: Int, len: Int): Unit = {
    require(!closed, "cannot write to a closed ChunkedByteBufferOutputStream")
    var written = 0
    while (written < len) {
      allocateNewChunkIfNeeded()
      // Copy as much as fits into what is left of the current chunk.
      val thisBatch = math.min(chunkSize - position, len - written)
      chunks(lastChunkIndex).put(bytes, written + off, thisBatch)
      written += thisBatch
      position += thisBatch
    }
    _size += len
  }

  @inline
  private def allocateNewChunkIfNeeded(): Unit = {
    if (position == chunkSize) {
      chunks += allocator(chunkSize)
      lastChunkIndex += 1
      position = 0
    }
  }

  /**
   * Hands the written data over as a ChunkedByteBuffer. Requires `close()` to have been called
   * and may be called only once.
   */
  def toChunkedByteBuffer: ChunkedByteBuffer = {
    require(closed, "cannot call toChunkedByteBuffer() unless close() has been called")
    require(!toChunkedByteBufferWasCalled, "toChunkedByteBuffer() can only be called once")
    toChunkedByteBufferWasCalled = true
    if (lastChunkIndex == -1) {
      new ChunkedByteBuffer(Array.empty[ByteBuffer])
    } else {
      // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk.
      // An alternative would have been returning an array of ByteBuffers, with the last buffer
      // bounded to only the last chunk's position. However, given our use case in Spark (to put
      // the chunks in block manager), only limiting the view bound of the buffer would still
      // require the block manager to store the whole chunk.
      val ret = new Array[ByteBuffer](chunks.size)
      for (i <- 0 until chunks.size - 1) {
        ret(i) = chunks(i)
        ret(i).flip()
      }
      if (position == chunkSize) {
        ret(lastChunkIndex) = chunks(lastChunkIndex)
        ret(lastChunkIndex).flip()
      } else {
        // The last chunk is only partly filled: copy it into a right-sized buffer and release
        // the original.
        ret(lastChunkIndex) = allocator(position)
        chunks(lastChunkIndex).flip()
        ret(lastChunkIndex).put(chunks(lastChunkIndex))
        ret(lastChunkIndex).flip()
        StorageUtils.dispose(chunks(lastChunkIndex))
      }
      new ChunkedByteBuffer(ret)
    }
  }
}
Source File: UnionRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

/**
 * Partition of a UnionRDD that wraps one partition of one parent RDD.
 *
 * @param idx                     index of this partition within the union
 * @param rdd                     parent RDD the wrapped partition belongs to
 * @param parentRddIndex          index of the parent RDD within the union's parents
 * @param parentRddPartitionIndex index of the wrapped partition within the parent RDD
 */
private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

/**
 * RDD made of the partitions of all `rdds`, taken in order.
 *
 * @param sc   Spark context the union belongs to
 * @param rdds parent RDDs; set to null by `clearDependencies`
 */
@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    // With many parents, count their partitions through a parallel collection.
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    // The array size expression was lost in the collapsed text; it is the total partition
    // count over `parRDDs`, which is otherwise unused.
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      // Each parent maps onto a contiguous range of this RDD's partitions starting at `pos`.
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    rdds = null
  }
}
Source File: ValueJsonConversionTest.scala From ingraph with Eclipse Public License 1.0 | 5 votes |
package ingraph.compiler.sql.driver

import ingraph.compiler.sql.driver.ValueJsonConversion._
import ingraph.compiler.sql.driver.ValueJsonConversionTest._
import org.neo4j.driver.internal.value._
import org.neo4j.driver.internal.{InternalNode, InternalPath, InternalRelationship}
import org.neo4j.driver.v1.Value
import org.scalactic.source
import org.scalactic.source.Position
import org.scalatest.FunSuite

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

/**
 * Registers one round-trip test per entry of `testParameters`: the value is converted to JSON
 * with `gson`, read back, and compared with the original.
 */
class ValueJsonConversionTest extends FunSuite {

  testParameters.foreach { entry =>
    val (original, caseName, position) = entry
    test(caseName) {
      println(original)
      val encoded = gson.toJson(original, classOf[Value])
      println(encoded)
      val decoded = gson.fromJson(encoded, classOf[Value])
      assert(original == decoded)
    }(position)
  }
}

/** Holds the values exercised by the suite and the helper that registers them. */
object ValueJsonConversionTest {
  val testValues: ArrayBuffer[Value] = ArrayBuffer.empty
  val testParameters: ArrayBuffer[(Value, String, Position)] = ArrayBuffer.empty

  /**
   * Records `value` as a test case.
   *
   * @param value    value to round-trip
   * @param testName test name; when null the simple class name of `value` is used
   * @param pos      source position attached to the generated test
   */
  def addTest(value: Value, testName: String = null)(implicit pos: source.Position): Unit = {
    testValues += value
    val label = if (testName == null) value.getClass.getSimpleName else testName
    testParameters += ((value, label, pos))
  }

  private val stringValue = new StringValue("John")
  private val integerValue = new IntegerValue(101)
  private val propertiesMap = Map[String, Value]("name" -> stringValue).asJava

  addTest(new MapValue(propertiesMap))
  addTest(new BytesValue(Array[Byte](0, 42, 127, -128)))
  addTest(new ListValue(stringValue, integerValue))
  addTest(new NodeValue(
    new InternalNode(5, List("Label1", "Label2").asJavaCollection, propertiesMap)))
  addTest(new RelationshipValue(
    new InternalRelationship(42, 10, 20, "Edge_Type_1", propertiesMap)))
  addTest(new PathValue(new InternalPath(
    new InternalNode(0),
    new InternalRelationship(101, 0, 1, "TYPE_A"),
    new InternalNode(1)
  )))
  addTest(BooleanValue.FALSE)
  addTest(BooleanValue.TRUE)
  addTest(NullValue.NULL)
  addTest(stringValue)
  addTest(integerValue)
  addTest(new FloatValue(3.14))
}
Source File: TokenStreamUtils.scala From odinson with Apache License 2.0 | 5 votes |
package ai.lum.odinson.lucene.analysis

import scala.collection.mutable.ArrayBuffer

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.TokenStream
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
// The next two imports were empty `import` keywords in the collapsed text; they are the
// Lucene homes of the two otherwise unresolved names used below.
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.highlight.TokenSources

/** Helpers for turning a stored document field or a TokenStream into an array of terms. */
object TokenStreamUtils {

  /**
   * Tokenizes one field of one document.
   *
   * @param docID         document to read
   * @param fieldName     field whose value is tokenized
   * @param indexSearcher searcher used to load the document and its term vectors
   * @param analyzer      analyzer handed to `TokenSources.getTokenStream`
   * @return the terms produced for the field, in stream order
   */
  def getTokens(
    docID: Int,
    fieldName: String,
    indexSearcher: IndexSearcher,
    analyzer: Analyzer
  ): Array[String] = {
    val doc = indexSearcher.doc(docID)
    val tvs = indexSearcher.getIndexReader().getTermVectors(docID)
    val text = doc.getField(fieldName).stringValue
    val ts = TokenSources.getTokenStream(fieldName, tvs, text, analyzer, -1)
    getTokens(ts)
  }

  /**
   * Drains `ts` and returns its terms. The stream is reset first and is always closed, even
   * when reading fails part-way.
   */
  def getTokens(ts: TokenStream): Array[String] = {
    // Fetch the attribute once, before consuming; its content changes per token.
    val termAttribute = ts.addAttribute(classOf[CharTermAttribute])
    val terms = new ArrayBuffer[String]
    try {
      ts.reset()
      while (ts.incrementToken()) {
        terms += termAttribute.toString
      }
      ts.end()
    } finally {
      ts.close()
    }
    terms.toArray
  }

}
Source File: Driver.scala From OnlineLDA_Spark with Apache License 2.0 | 5 votes |
package com.github.yuhao.yang

import java.util.Calendar

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable.ArrayBuffer

/** Command-line entry point that runs online LDA over a directory of text files. */
object Driver extends Serializable {

  /**
   * @param args `args(0)` input directory prefix (concatenated directly with "texts",
   *             "stop.txt" and "wordsEn.txt"), `args(1)` number of topics, `args(2)` integer
   *             passed as the last argument of `OnlineLDA_Spark.runOnlineMode`
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 3, "expected 3 arguments: <inputDir> <K> <runOnlineMode parameter>")

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    val inputDir = args(0)
    val filePaths = extractPaths(inputDir + "texts", true)
    val stopWordsPath = inputDir + "stop.txt"
    val vocabPath = inputDir + "wordsEn.txt"

    println("begin: " + Calendar.getInstance().getTime)
    println("path size: " + filePaths.size)
    assert(filePaths.size > 0)

    val conf = new SparkConf().setAppName("online LDA Spark")
    val sc = new SparkContext(conf)
    val vocab = Docs2Vec.extractVocab(sc, Seq(vocabPath), stopWordsPath)
    // NOTE(review): the right-hand side was lost in the collapsed text. `vocabArray` is
    // looked up by term index below, so the inverted vocabulary is the best reading — confirm.
    val vocabArray = vocab.map(_.swap)

    val K = args(1).toInt

    // val lda = OnlineLDA_Spark.runBatchMode(sc, filePaths, vocab, K, 50)
    val lda = OnlineLDA_Spark.runOnlineMode(sc, filePaths, vocab, K, args(2).toInt)

    println("_lambda:")
    for (row <- 0 until lda._lambda.rows) {
      val v = lda._lambda(row, ::).t
      val topk = v.argtopk(10)
      val pairs = topk.map(k => (vocabArray(k), v(k)))
      val sorted = pairs.sortBy(_._2).reverse
      println(sorted.map(x => (x._1)).mkString(","),
        sorted.map(x => ("%2.2f".format(x._2))).mkString(","))
    }

    println("end: " + Calendar.getInstance().getTime())
  }

  /**
   * Lists the absolute paths of the files under `path`.
   *
   * @param path      directory to list
   * @param recursive when true, sub-directories are descended into; otherwise they are skipped
   * @return the file paths in listing order; empty when `path` cannot be listed
   */
  def extractPaths(path: String, recursive: Boolean = true): Array[String] = {
    val docsets = ArrayBuffer[String]()
    // listFiles() yields null when `path` cannot be listed as a directory.
    Option(new java.io.File(path).listFiles()).foreach { entries =>
      for (f <- entries) {
        if (f.isDirectory) {
          if (recursive) docsets ++= extractPaths(f.getAbsolutePath, true)
        } else {
          docsets += f.getAbsolutePath
        }
      }
    }
    docsets.toArray
  }

}
Source File: QuerySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark

import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
import org.scalatest.FunSuite

import scala.collection.mutable.ArrayBuffer

/** Base class for suites that run BigDatalog programs and compare query results. */
abstract class QuerySuite extends FunSuite with Logging {

  /**
   * One program/query pair with its input relations and expected output.
   *
   * @param answers     expected result strings, or null when only the size is known
   * @param answersSize expected number of results
   */
  case class TestCase(program: String, query: String, data: Map[String, Seq[String]],
      answers: Seq[String], answersSize: Int) {
    def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) =
      this(program, query, data, null, answersSize)

    def this(program: String, query: String, data: Map[String, Seq[String]],
        answers: Seq[String]) =
      this(program, query, data, answers, answers.size)
  }

  def runTest(testCase: TestCase): Unit = runTests(Seq(testCase))

  /**
   * Runs every test case on a fresh local SparkContext. The context is stopped even when an
   * assertion fails, so a failing case does not leave a live context behind.
   */
  def runTests(testCases: Seq[TestCase]): Unit = {
    val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf()
      .set("spark.eventLog.enabled", "true")
      //.set("spark.eventLog.dir", "../logs")
      .set("spark.ui.enabled", "false")
      .set("spark.sql.shuffle.partitions", "5")
      .setAll(Map.empty[String, String])
    )

    try {
      val bigDatalogCtx = new BigDatalogContext(sparkCtx)
      var count: Int = 1
      for (testCase <- testCases) {
        bigDatalogCtx.loadProgram(testCase.program)

        for ((relationName, data) <- testCase.data) {
          val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName)
          if (relationInfo == null)
            throw new SparkException("You are attempting to load an unknown relation.")

          bigDatalogCtx.registerAndLoadTable(relationName, data,
            bigDatalogCtx.conf.numShufflePartitions)
        }

        val query = testCase.query
        val answers = testCase.answers
        logInfo("========== START BigDatalog Query " + count + " START ==========")
        val program = bigDatalogCtx.query(query)

        val results = program.execute().collect()

        // for some test cases we will only know the size of the answer set, not the actual answers
        if (answers == null) {
          assert(results.size == testCase.answersSize)
        } else {
          val resultStrings = results.map(_.toString)
          if (results.size != answers.size) {
            displayDifferences(resultStrings, answers)
            // yes this will fail
            assert(results.size == answers.size)
          } else {
            for (result <- results)
              assert(answers.contains(result.toString()))
          }

          for (answer <- answers)
            assert(resultStrings.contains(answer.toString()))
        }
        logInfo("========== END BigDatalog Query " + count + " END ==========\n")
        count += 1
        bigDatalogCtx.reset()
      }
    } finally {
      sparkCtx.stop()
    }
  }

  /** Logs which results have no matching answer and which answers have no matching result. */
  private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = {
    val missingAnswers = new ArrayBuffer[String]
    val missingResults = new ArrayBuffer[String]

    for (result <- results)
      if (!answers.contains(result))
        missingAnswers += result

    for (answer <- answers)
      if (!results.contains(answer))
        missingResults += answer

    if (missingAnswers.nonEmpty)
      logInfo("Results not in Answers: " + missingAnswers.mkString(", "))
    if (missingResults.nonEmpty)
      logInfo("Answers not in Results: " + missingResults.mkString(", "))
  }
}
Source File: SpearmanCorrelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

// NOTE(review): the declaration that owns this method is not part of this excerpt; the final
// brace closes that declaration.

  /**
   * Replaces every value of `X` by its rank within its column (tied values share their average
   * rank) and hands the ranked rows to `PearsonCorrelation.computeCorrelationMatrix`.
   *
   * @param X rows of the input matrix
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      // Emits the buffered row ids with the average rank of their run, then empties the buffer.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Source File: TestOutputStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

/**
 * Output stream that collects every RDD of `parent` and appends the collected elements to
 * `output` as one `Seq` per RDD.
 *
 * @param parent stream whose RDDs are collected
 * @param output buffer receiving the collected batches; a fresh buffer by default
 */
class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Source File: FlumeStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

// NOTE(review): the imports marked "restored" were empty `import` keywords in the collapsed
// text. The Netty ones match names used by CompressionChannelFactory below; Charsets and
// JavaUtils are not referenced by the visible code — confirm against the original file.
import com.google.common.base.Charsets // restored
import org.jboss.netty.channel.ChannelPipeline // restored
import org.jboss.netty.channel.socket.SocketChannel // restored
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory // restored
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.network.util.JavaUtils // restored
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

/** Suite exercising the Flume input stream with and without compression. */
class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
  val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite")
  var ssc: StreamingContext = null

  // NOTE(review): `testFlumeStream` is not defined in this excerpt.
  test("flume input stream") {
    testFlumeStream(testCompression = false)
  }

  test("flume input compressed stream") {
    testFlumeStream(testCompression = true)
  }

  /**
   * Channel factory that puts a zlib encoder and a zlib decoder at the front of each new
   * channel's pipeline.
   *
   * @param compressionLevel level handed to the ZlibEncoder
   */
  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
}
Source File: JDBCRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}

// NOTE(review): the declaration that owns `columnPartition` and the definition of
// JDBCPartitioningInfo are not part of this excerpt; the brace after the method closes that
// declaration.

  /**
   * Builds one WHERE clause per partition by splitting the range described by `partitioning`
   * into `numPartitions` strides on `column`.
   *
   * @param partitioning partitioning description, or null for a single unfiltered partition
   * @return the partitions; the first has no lower bound and the last has no upper bound
   */
  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null || partitioning.numPartitions == 1) {
      Array[Partition](JDBCPartition(null, 0))
    } else {
      val numPartitions = partitioning.numPartitions
      val column = partitioning.column
      // Overflow and silliness can happen if you subtract then divide.
      // Here we get a little roundoff, but that's (hopefully) OK.
      val stride: Long = (partitioning.upperBound / numPartitions
                        - partitioning.lowerBound / numPartitions)
      var currentValue: Long = partitioning.lowerBound
      val ans = new ArrayBuffer[Partition]()
      for (i <- 0 until numPartitions) {
        val lowerBound = if (i != 0) s"$column >= $currentValue" else null
        currentValue += stride
        val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
        val whereClause =
          if (upperBound == null) {
            lowerBound
          } else if (lowerBound == null) {
            upperBound
          } else {
            s"$lowerBound AND $upperBound"
          }
        ans += JDBCPartition(whereClause, i)
      }
      ans.toArray
    }
  }
}

/**
 * Relation backed by a JDBC table.
 *
 * @param url        JDBC url, passed to `JDBCRDD.resolveTable` and `JDBCRDD.scanTable`
 * @param table      table name
 * @param parts      partitions handed to the scan
 * @param properties connection properties; empty by default
 */
private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  /** Writes `data` to the table, replacing its content when `overwrite` is true. */
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
}
Source File: KPLBasedKinesisTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

// The package prefixes of the next two imports were lost in the collapsed text.
import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

/** Kinesis test utilities that can generate data through the Kinesis Producer Library. */
private[kinesis] class KPLBasedKinesisTestUtils extends KinesisTestUtils {
  /** Returns the KPL-based generator when `aggregate` is true, the simple one otherwise. */
  override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = {
    if (!aggregate) {
      new SimpleDataGenerator(kinesisClient)
    } else {
      new KPLDataGenerator(regionName)
    }
  }
}

/**
 * Data generator that sends records through a lazily created KPL producer.
 *
 * @param regionName region the producer is configured with
 */
private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator {

  private lazy val producer: KPLProducer = {
    val conf = new KinesisProducerConfiguration()
      .setRecordMaxBufferedTime(1000)
      .setMaxConnections(1)
      .setRegion(regionName)
      .setMetricsLevel("none")

    new KPLProducer(conf)
  }

  /**
   * Sends each number as its decimal string (also used as the partition key) and waits for the
   * producer to flush.
   *
   * @return for every shard id reported by a successful send, the (number, sequence number)
   *         pairs recorded for it; failed sends are ignored
   */
  override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = {
    val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()
    data.foreach { num =>
      val str = num.toString
      // Explicit charset so the payload does not depend on the platform default.
      val data = ByteBuffer.wrap(str.getBytes(StandardCharsets.UTF_8))
      val future = producer.addUserRecord(streamName, str, data)
      val kinesisCallBack = new FutureCallback[UserRecordResult]() {
        override def onFailure(t: Throwable): Unit = {} // do nothing

        override def onSuccess(result: UserRecordResult): Unit = {
          val shardId = result.getShardId
          val seqNumber = result.getSequenceNumber()
          val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId,
            new ArrayBuffer[(Int, String)]())
          sentSeqNumbers += ((num, seqNumber))
        }
      }
      Futures.addCallback(future, kinesisCallBack)
    }
    producer.flushSync()
    shardIdToSeqNumbers.toMap
  }
}
Source File: UnionDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.UnionRDD

/**
 * Stream whose batches are the union of the batches of all `parents`.
 *
 * @param parents streams to unify; must be non-empty and share one context and one slide
 *                duration
 */
private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T](parents.head.ssc) {

  // NOTE(review): `parents.head` in the superclass call above is evaluated before these
  // checks, so an empty array fails there before the first message can be produced.
  require(parents.length > 0, "List of DStreams to union is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  /**
   * Unions the RDDs of all parents for `validTime`.
   *
   * @throws SparkException when a parent yields no RDD for `validTime`
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.size > 0) {
      Some(new UnionRDD(ssc.sparkContext, rdds))
    } else {
      None
    }
  }
}
Source File: QueueInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{Time, StreamingContext}

/**
 * Input stream whose batches are RDDs taken from an in-memory queue.
 *
 * @param ssc        streaming context this stream is attached to
 * @param queue      queue the RDDs are taken from
 * @param oneAtATime when true a batch consumes one queued RDD, otherwise all queued RDDs
 * @param defaultRDD RDD returned when nothing was dequeued; may be null
 */
private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start(): Unit = { }

  override def stop(): Unit = { }

  /** Deserialization hook: always fails, because this stream cannot be checkpointed. */
  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  /** Serialization hook: writes nothing and only logs a warning. */
  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  /**
   * Builds the RDD for one batch.
   *
   * @return the dequeued RDD (or the union of all dequeued RDDs), else `defaultRDD` when it is
   *         non-null, else an empty RDD; the result is always a `Some`
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    // `queue` is a public mutable collection, so drain it while holding its monitor; code that
    // fills the queue from another thread can synchronize on the same object.
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        // The SparkContext argument was lost in the collapsed text; the same context is
        // already reachable through `ssc.sparkContext`, as used for the empty RDD below.
        Some(new UnionRDD(ssc.sparkContext, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

}
Source File: LocalSparkCluster.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.rpc.RpcEnv
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.deploy.master.Master
import org.apache.spark.util.Utils

// NOTE(review): this excerpt begins inside a method body. The enclosing class declaration, its
// fields (numWorkers, coresPerWorker, memoryPerWorker, localHostname, masterRpcEnvs,
// workerRpcEnvs) and the start of the method that defines `masters` and `_conf` are not part
// of the visible text, so the closing braces below have no visible opening counterpart.

    // Start one worker RPC environment per requested worker and keep it for shutdown.
    for (workerNum <- 1 to numWorkers) {
      val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker,
        memoryPerWorker, masters, null, Some(workerNum), _conf)
      workerRpcEnvs += workerEnv
    }

    masters
  }

  /** Shuts down every recorded worker and master RPC environment, then forgets them. */
  def stop(): Unit = {
    logInfo("Shutting down local Spark cluster.")
    // Stop the workers before the master so they don't get upset that it disconnected
    workerRpcEnvs.foreach(_.shutdown())
    masterRpcEnvs.foreach(_.shutdown())
    workerRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.clear()
    workerRpcEnvs.clear()
  }
}
Source File: Schedulable.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

/**
 * Abstract interface of a schedulable entity. Every member is abstract here; the actual
 * semantics are supplied by implementations that are not visible in this file.
 */
private[spark] trait Schedulable {
  var parent: Pool
  // child queues
  def schedulableQueue: ConcurrentLinkedQueue[Schedulable]
  def schedulingMode: SchedulingMode
  def weight: Int
  def minShare: Int
  def runningTasks: Int
  def priority: Int
  def stageId: Int
  def name: String

  def addSchedulable(schedulable: Schedulable): Unit
  def removeSchedulable(schedulable: Schedulable): Unit
  def getSchedulableByName(name: String): Schedulable
  def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit
  def checkSpeculatableTasks(): Boolean
  def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager]
}
Source File: ByteArrayChunkOutputStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and the import marked "restored" were empty keywords in the
// collapsed text; they are a best reading (the code overrides the `write` methods of an output
// stream) — confirm against the original file.
package org.apache.spark.util.io // restored

import java.io.OutputStream // restored

import scala.collection.mutable.ArrayBuffer

// NOTE(review): the class declaration and the members `chunkSize`, `chunks` and
// `lastChunkIndex` used below are not part of this excerpt; the final brace closes that class.

  // Write offset inside the last chunk. It starts at chunkSize so that the first write sees a
  // "full" chunk and allocates one (see allocateNewChunkIfNeeded).
  private var position = chunkSize

  /** Writes one byte (the low 8 bits of `b`), allocating a new chunk when the last is full. */
  override def write(b: Int): Unit = {
    allocateNewChunkIfNeeded()
    chunks(lastChunkIndex)(position) = b.toByte
    position += 1
  }

  /** Writes `len` bytes of `bytes` starting at `off`, spilling over into new chunks as needed. */
  override def write(bytes: Array[Byte], off: Int, len: Int): Unit = {
    var written = 0
    while (written < len) {
      allocateNewChunkIfNeeded()
      // Copy as much as fits into what is left of the current chunk.
      val thisBatch = math.min(chunkSize - position, len - written)
      System.arraycopy(bytes, written + off, chunks(lastChunkIndex), position, thisBatch)
      written += thisBatch
      position += thisBatch
    }
  }

  @inline
  private def allocateNewChunkIfNeeded(): Unit = {
    if (position == chunkSize) {
      chunks += new Array[Byte](chunkSize)
      lastChunkIndex += 1
      position = 0
    }
  }

  /**
   * Returns the written data as an array of chunks; the last chunk is trimmed to the bytes
   * actually written. Returns an empty array when nothing was written.
   */
  def toArrays: Array[Array[Byte]] = {
    if (lastChunkIndex == -1) {
      new Array[Array[Byte]](0)
    } else {
      // Copy the first n-1 chunks to the output, and then create an array that fits the last chunk.
      // An alternative would have been returning an array of ByteBuffers, with the last buffer
      // bounded to only the last chunk's position. However, given our use case in Spark (to put
      // the chunks in block manager), only limiting the view bound of the buffer would still
      // require the block manager to store the whole chunk.
      val ret = new Array[Array[Byte]](chunks.size)
      for (i <- 0 until chunks.size - 1) {
        ret(i) = chunks(i)
      }
      if (position == chunkSize) {
        ret(lastChunkIndex) = chunks(lastChunkIndex)
      } else {
        ret(lastChunkIndex) = new Array[Byte](position)
        System.arraycopy(chunks(lastChunkIndex), 0, ret(lastChunkIndex), 0, position)
      }
      ret
    }
  }
}
Source File: UnionRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

/**
 * Partition of a UnionRDD that wraps one partition of one parent RDD.
 *
 * @param idx                     index of this partition within the union
 * @param rdd                     parent RDD the wrapped partition belongs to
 * @param parentRddIndex          index of the parent RDD within the union's parents
 * @param parentRddPartitionIndex index of the wrapped partition within the parent RDD
 */
private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

/**
 * RDD made of the partitions of all `rdds`, taken in order.
 *
 * @param sc   Spark context the union belongs to
 * @param rdds parent RDDs; set to null by `clearDependencies`
 */
@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    // The array size expression was lost in the collapsed text; the loop below fills exactly
    // one slot per parent partition, so the size is the total partition count.
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      // Each parent maps onto a contiguous range of this RDD's partitions starting at `pos`.
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    rdds = null
  }
}
Source File: TaskContextImpl.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark

import scala.collection.mutable.{ArrayBuffer, HashMap}

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.memory.TaskMemoryManager
import org.apache.spark.metrics.MetricsSystem
import org.apache.spark.metrics.source.Source
import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException}

/**
 * TaskContext implementation that keeps the completion listeners, the interrupted/completed
 * flags and the accumulators registered for one task attempt.
 *
 * @param internalAccumulators accumulators registered when the instance is constructed
 */
private[spark] class TaskContextImpl(
    val stageId: Int,
    val partitionId: Int,
    override val taskAttemptId: Long,
    override val attemptNumber: Int,
    override val taskMemoryManager: TaskMemoryManager,
    @transient private val metricsSystem: MetricsSystem,
    internalAccumulators: Seq[Accumulator[Long]],
    val runningLocally: Boolean = false,
    val taskMetrics: TaskMetrics = TaskMetrics.empty)
  extends TaskContext
  with Logging {

  // For backwards-compatibility; this method is now deprecated as of 1.3.0.
  override def attemptId(): Long = taskAttemptId

  // List of callback functions to execute when the task completes.
  @transient private val onCompleteCallbacks = new ArrayBuffer[TaskCompletionListener]

  // Whether the corresponding task has been killed.
  @volatile private var interrupted: Boolean = false

  // Whether the task has completed.
  // NOTE(review): nothing in this excerpt ever sets this flag.
  @volatile private var completed: Boolean = false

  override def addTaskCompletionListener(listener: TaskCompletionListener): this.type = {
    onCompleteCallbacks += listener
    this
  }

  override def addTaskCompletionListener(f: TaskContext => Unit): this.type = {
    onCompleteCallbacks += new TaskCompletionListener {
      override def onTaskCompletion(context: TaskContext): Unit = f(context)
    }
    this
  }

  @deprecated("use addTaskCompletionListener", "1.1.0")
  override def addOnCompleteCallback(f: () => Unit) {
    onCompleteCallbacks += new TaskCompletionListener {
      override def onTaskCompletion(context: TaskContext): Unit = f()
    }
  }

  private[spark] def markInterrupted(): Unit = {
    interrupted = true
  }

  override def isCompleted(): Boolean = completed

  override def isRunningLocally(): Boolean = runningLocally

  override def isInterrupted(): Boolean = interrupted

  override def getMetricsSources(sourceName: String): Seq[Source] =
    metricsSystem.getSourcesByName(sourceName)

  // Accumulators registered so far, keyed by accumulator id. Declared before
  // internalMetricsToAccumulators, whose initializer registers into this map.
  @transient private val accumulators = new HashMap[Long, Accumulable[_, _]]

  private[spark] override def registerAccumulator(a: Accumulable[_, _]): Unit = synchronized {
    accumulators(a.id) = a
  }

  private[spark] override def collectInternalAccumulators(): Map[Long, Any] = synchronized {
    accumulators.filter(_._2.isInternal).mapValues(_.localValue).toMap
  }

  private[spark] override def collectAccumulators(): Map[Long, Any] = synchronized {
    accumulators.mapValues(_.localValue).toMap
  }

  // NOTE(review): in the collapsed text "//private[spark]" runs straight into "override val".
  // It is read here as a commented-out modifier on a line of its own, which is the only
  // reading that keeps the braces of this file balanced — confirm against the original file.
  //private[spark]
  override val internalMetricsToAccumulators: Map[String, Accumulator[Long]] = {
    // Explicitly register internal accumulators here because these are
    // not captured in the task closure and are already deserialized
    internalAccumulators.foreach(registerAccumulator)
    // `name.get` throws when an internal accumulator has no name.
    internalAccumulators.map { a => (a.name.get, a) }.toMap
  }
}
Source File: LinkerdApi.scala From asura with MIT License | 5 votes |
package import import import asura.common.exceptions.ErrorMessages.ErrorMessage import asura.common.model.{ApiRes, ApiResError} import asura.core.http.HttpEngine import asura.core.{CoreConfig, ErrorMessages} import asura.namerd.DtabEntry import asura.namerd.api.v1.NamerdV1Api import import javax.inject.{Inject, Singleton} import import play.api.Configuration import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} @Singleton class LinkerdApi @Inject()( implicit val exec: ExecutionContext, val controllerComponents: SecurityComponents, config: Configuration, ) extends BaseApi { val srcPrefix = "/svc/" val dstPrefix = "/$/inet/" def getProxyServers() = Action { implicit req => if (CoreConfig.linkerdConfig.enabled) { OkApiRes(ApiRes(data = CoreConfig.linkerdConfig.servers)) } else { OkApiRes(ApiResError(getI18nMessage( } } def getHttp(group: String, project: String, server: String) = Action.async { implicit req => if (CoreConfig.linkerdConfig.enabled) { val proxyServer = CoreConfig.linkerdConfig.servers.find(_.tag.equals(server)).get NamerdV1Api.getNamespaceDtabs(proxyServer.namerd, proxyServer.httpNs)(HttpEngine.http).map(dtabs => { val items = ArrayBuffer[DtabItem]() dtabs.foreach(entry => { val pStrs = entry.prefix.split("/") val dStrs = entry.dst.split("/") if (pStrs.length == 5 && dStrs.length == 5) { items += DtabItem( group = pStrs(2), project = pStrs(3), namespace = pStrs(4), host = dStrs(3), port = dStrs(4), owned = group == pStrs(2) && project == pStrs(3) ) } }) toActionResultFromAny(items) }) } else { Future.successful(OkApiRes(ApiResError(getI18nMessage( } } def putHttp(group: String, project: String, server: String) = Action(parse.byteString).async { implicit req => if (CoreConfig.linkerdConfig.enabled) { val proxyServer = CoreConfig.linkerdConfig.servers.find(_.tag.equals(server)).get val dtabs = req.bodyAs(classOf[Dtabs]) if (null != dtabs && null != dtabs.dtabs && dtabs.dtabs.nonEmpty) { var error: ErrorMessage = 
null val entries = ArrayBuffer[DtabEntry]() for (i <- 0 until dtabs.dtabs.length if null == error) { val item = dtabs.dtabs(i) error = item.isValid() entries += DtabEntry( s"${srcPrefix}${}/${item.project}/${item.namespace}", s"${dstPrefix}${}/${item.port}" ) } if (null == error) { NamerdV1Api.updateNamespaceDtabs(proxyServer.namerd, proxyServer.httpNs, entries)(HttpEngine.http).toOkResult } else { error.toFutureFail } } else { Future.successful(OkApiRes(ApiRes())) } } else { Future.successful(OkApiRes(ApiResError(getI18nMessage( } } }
Source File: InterfaceMethodParamsActor.scala From asura with MIT License | 5 votes |
package import{ActorRef, Props, Status} import akka.pattern.pipe import akka.util.ByteString import import asura.common.util.LogUtils import asura.dubbo.DubboConfig import import asura.dubbo.model.InterfaceMethodParams import asura.dubbo.model.InterfaceMethodParams.MethodSignature import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} class InterfaceMethodParamsActor(invoker: ActorRef, msg: GetInterfaceMethodParams) extends BaseActor { implicit val ec: ExecutionContext = context.dispatcher private val telnet: ActorRef = context.actorOf(TelnetClientActor.props(msg.address, if (msg.port > 0) msg.port else DubboConfig.DEFAULT_PORT, self)) override def receive: Receive = { case telnetData: ByteString => val utf8String = telnetData.utf8String if (utf8String.contains(TelnetClientActor.MSG_CONNECT_TO)) { log.debug(utf8String) if (utf8String.contains(TelnetClientActor.MSG_SUCCESS)) { telnet ! ByteString(s"ls -l ${msg.ref}\r\n") } else if (utf8String.contains(TelnetClientActor.MSG_FAIL)) { Future.failed(new RuntimeException(s"Remote connection to ${msg.address}:${msg.port} failed")) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE context stop self } else { Future.failed(new RuntimeException(s"Unknown response ${utf8String}")) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE context stop self } } else if (utf8String.contains("(") && utf8String.contains(")")) { getInterfaceMethodParams(msg.ref, utf8String) pipeTo invoker telnet ! TelnetClientActor.CMD_CLOSE } else { Future.failed(new RuntimeException(s"Unknown response: ${utf8String}")) pipeTo invoker telnet ! 
TelnetClientActor.CMD_CLOSE context stop self } case Status.Failure(t) => val stackTrace = LogUtils.stackTraceToString(t) log.warning(stackTrace) context stop self } def getInterfaceMethodParams(ref: String, content: String): Future[InterfaceMethodParams] = { Future.successful { val methods = ArrayBuffer[MethodSignature]() content.split("\r\n") .filter(!_.startsWith(DubboConfig.DEFAULT_PROMPT)) .map(signature => { val splits = signature.split(" ") if (splits.length == 2) { val ret = splits(0) val secondPart = splits(1) val idx = secondPart.indexOf("(") val method = secondPart.substring(0, idx) val params = secondPart.substring(idx + 1, secondPart.length - 1).split(",") methods += (MethodSignature(ret, method, params)) } }) InterfaceMethodParams(ref, methods) } } override def postStop(): Unit = log.debug(s"${self.path} stopped") } object InterfaceMethodParamsActor { def props(invoker: ActorRef, msg: GetInterfaceMethodParams) = { Props(new InterfaceMethodParamsActor(invoker, msg)) } }
Source File: JobReportDataItemSaveActor.scala From asura with MIT License | 5 votes |
package import{Props, Status} import import asura.common.util.LogUtils import import import import import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ class JobReportDataItemSaveActor(dayIndexSuffix: String) extends BaseActor { val messages = ArrayBuffer[SaveReportDataHttpItemMessage]() override def receive: Receive = { case m: SaveReportDataHttpItemMessage => messages += m if (messages.length >= 10) { insert() } context.system.scheduler.scheduleOnce(2 seconds) { self ! Flush }(context.system.dispatcher) case Flush => insert() case Status.Failure(t) => log.warning(LogUtils.stackTraceToString(t)) } override def preStart(): Unit = { } override def postStop(): Unit = { insert() log.debug(s"${self.path} is stopped") } private def insert(): Unit = { if (messages.length > 0) { log.debug(s"${messages.length} items is saving...") JobReportDataItemService.index(messages, dayIndexSuffix) messages.clear() } } } object JobReportDataItemSaveActor { def props(dayIndexSuffix: String) = Props(new JobReportDataItemSaveActor(dayIndexSuffix)) case class SaveReportDataHttpItemMessage(id: String, dataItem: JobReportDataItem) }
Source File: JobStatusActor.scala From asura with MIT License | 5 votes |
package import import{ActorRef, Props} import import asura.common.model.Pagination import asura.core.model.QueryJob import import import asura.core.job.eventbus.JobStatusBus.JobStatusNotificationMessage import asura.core.job.{JobListItem, JobStates} import asura.core.redis.RedisJobState import asura.core.util.JacksonSupport import com.typesafe.scalalogging.Logger import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class JobStatusActor() extends BaseActor { var query: QueryJob = null val watchIds = mutable.HashSet[String]() override def receive: Receive = { case SenderMessage(sender) => context.become(query(sender)) } def query(outSender: ActorRef): Receive = { case query: QueryJob => this.query = query JobService.queryJob(query).map(esResponse => if (esResponse.isSuccess) { val items = ArrayBuffer[JobListItem]() val jobsTable = mutable.HashMap[String, JobListItem]() val hits = esResponse.result.hits watchIds.clear() hits.hits.foreach(hit => { val jobId = watchIds.add(jobId) jobsTable += (jobId -> { val item = JacksonSupport.parse(hit.sourceAsString, classOf[JobListItem]) item.state = JobStates.UNKNOWN items += item item._id = jobId item }) }) if (watchIds.nonEmpty) { RedisJobState.getJobState(watchIds.toSet).onComplete { case util.Success(statesMap) => statesMap.forEach((jobKey, state) => jobsTable(jobKey).state = state) outSender ! ListActorEvent(Map("total" ->, "list" -> items)) case util.Failure(_) => outSender ! ListActorEvent(Map("total" ->, "list" -> items)) }(context.system.dispatcher) } else { outSender ! ListActorEvent(Map("total" -> 0, "list" -> Nil)) } } else { outSender ! ErrorActorEvent(esResponse.error.reason) })(context.system.dispatcher) case JobStatusNotificationMessage(_, operator, scheduler, group, name, data) => if (watchIds.contains(name)) { outSender ! ItemActorEvent(JobStatusOperationMessage(operator, scheduler, group, name, data)) } case eventMessage: ActorEvent => outSender ! 
eventMessage case Failure(t) => outSender ! ErrorActorEvent(t.getMessage) } override def postStop(): Unit = { import JobStatusActor.logger logger.debug(s"JobStatus for ${query} stopped") } } object JobStatusActor { val logger = Logger(classOf[JobStatusActor]) def props() = Props(new JobStatusActor()) case class JobQueryMessage(scheduler: String = null, group: String = null, text: String = null) extends Pagination }
Source File: HeaderUtils.scala From asura with MIT License | 5 votes |
package asura.core.http

import akka.http.scaladsl.model.HttpHeader.ParsingResult.{Error, Ok}
import akka.http.scaladsl.model.headers.{Cookie, RawHeader}
import akka.http.scaladsl.model.{ErrorInfo, HttpHeader}
import asura.common.util.StringUtils
// NOTE(review): package of these two model classes is presumed to be asura.core.es.model — confirm.
import asura.core.es.model.{Environment, HttpCaseRequest}
import asura.core.runtime.RuntimeContext
import asura.core.{CoreConfig, ErrorMessages}
import com.typesafe.scalalogging.Logger

import scala.collection.immutable
import scala.collection.mutable.ArrayBuffer

/** Builds the outgoing HTTP headers for a case request. */
object HeaderUtils {

  val logger = Logger("HeaderUtils")

  /**
   * Collects the headers to send for `cs`.
   *
   * In order: the request's own enabled headers, a single `Cookie` header built from the
   * request's enabled cookies, the enabled headers of the environment in use, and, when the
   * environment enables the proxy, a raw routing header whose value is
   * `/<group>/<project>/<namespace>`.
   *
   * Header and cookie values are rendered through `context.renderSingleMacroAsString`.
   * Headers that fail to parse are logged and skipped.
   *
   * @param cs      the case request (its `request` may be null)
   * @param context runtime context used for macro rendering and environment lookup
   * @return the headers as an immutable sequence
   */
  def toHeaders(cs: HttpCaseRequest, context: RuntimeContext): immutable.Seq[HttpHeader] = {
    val headers = ArrayBuffer[HttpHeader]()
    val request = cs.request
    val env = if (null != context.options) context.options.getUsedEnv() else null
    if (null != request) {
      val headerSeq = request.header
      if (null != headerSeq) {
        for (h <- headerSeq if (h.enabled && StringUtils.isNotEmpty(h.key))) {
          appendParsed(headers, h.key, context.renderSingleMacroAsString(h.value))
        }
      }
      val cookieSeq = request.cookie
      if (null != cookieSeq) {
        val cookies = ArrayBuffer[(String, String)]()
        for (c <- cookieSeq if (c.enabled && StringUtils.isNotEmpty(c.key))) {
          cookies += ((c.key, context.renderSingleMacroAsString(c.value)))
        }
        if (cookies.nonEmpty) headers += Cookie(cookies: _*)
      }
    }
    if (null != env && null != env.headers && env.headers.nonEmpty) {
      for (h <- env.headers if (h.enabled && StringUtils.isNotEmpty(h.key))) {
        appendParsed(headers, h.key, context.renderSingleMacroAsString(h.value))
      }
    }
    if (null != env && env.enableProxy) {
      val headerIdentifier = validateProxyVariables(env)
      val dst = s"/${cs.group}/${cs.project}/${env.namespace}"
      headers += RawHeader(headerIdentifier, dst)
    }
    headers.toList
  }

  /** Parses one header and appends it to `headers`; parse warnings and failures are only logged. */
  private def appendParsed(headers: ArrayBuffer[HttpHeader], key: String, value: String): Unit = {
    HttpHeader.parse(key, value) match {
      case Ok(header: HttpHeader, errors: List[ErrorInfo]) =>
        if (errors.nonEmpty) logger.warn(errors.mkString(","))
        headers += header
      case Error(error: ErrorInfo) =>
        logger.warn(error.detail)
    }
  }

  /**
   * Checks that `env` can be routed through the proxy and returns the header name to use.
   *
   * Throws the corresponding `ErrorMessages` exception when the proxy is disabled, the
   * namespace or server tag is empty, or no configured server with a non-empty
   * `headerIdentifier` matches `env.server`.
   */
  def validateProxyVariables(env: Environment): String = {
    if (!CoreConfig.linkerdConfig.enabled) {
      throw ErrorMessages.error_ProxyDisabled.toException
    }
    if (StringUtils.isEmpty(env.namespace)) {
      throw ErrorMessages.error_EmptyNamespace.toException
    }
    if (StringUtils.isEmpty(env.server)) {
      throw ErrorMessages.error_EmptyProxyServer.toException
    }
    // Reject both "no such server" and "server without an identifier"; the former used to
    // surface as a NoSuchElementException and the latter returned an empty header name.
    CoreConfig.linkerdConfig.servers.find(_.tag.equals(env.server)) match {
      case Some(server) if StringUtils.isNotEmpty(server.headerIdentifier) =>
        server.headerIdentifier
      case _ =>
        throw ErrorMessages.error_InvalidProxyConfig.toException
    }
  }

  /** True when `header` is a `content-type` header whose value contains the JSON content type. */
  def isApplicationJson(header: HttpHeader): Boolean = {
    if (header.lowercaseName().equals("content-type")) {
      header.value().contains(HttpContentTypes.JSON)
    } else {
      false
    }
  }
}
package asura.core.assertion

import asura.core.concurrent.ExecutionContextManager.cachedExecutor
import asura.core.assertion.engine.{AssertResult, AssertionContext, FailAssertResult, Statistic}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.Future

/** Logical OR over a sequence of sub-assertions: succeeds when at least one of them succeeds. */
object Or extends Assertion {

  override val name: String = Assertions.OR

  override def assert(actual: Any, expect: Any): Future[AssertResult] = {
    apply(actual, expect)
  }

  /**
   * Evaluates every sub-assertion in `except` against `actual`.
   *
   * @param actual the context the sub-assertions are evaluated against
   * @param except the expected value; must be a `Seq` of assertion maps
   * @return a result that is successful if any sub-assertion's statistic is successful, with
   *         every sub-result attached; a `FailAssertResult` when `except` is not a `Seq`
   */
  def apply(actual: Any, except: Any): Future[AssertResult] = {
    val result = AssertResult(
      isSuccessful = false,
      msg = AssertResult.MSG_FAILED
    )
    except match {
      case assertions: Seq[_] =>
        if (assertions.nonEmpty) {
          // Evaluate each sub-assertion with its own statistic so outcomes can be told apart.
          val assertionResults = assertions.map(assertion => {
            val subStatis = Statistic()
            val assertionMap = assertion.asInstanceOf[Map[String, Any]]
            val contextMap = actual.asInstanceOf[Object]
            AssertionContext.eval(assertionMap, contextMap, subStatis).map((subStatis, _))
          })
          Future.sequence(assertionResults).map(subStatisResults => {
            val subResults = ArrayBuffer[java.util.Map[String, Any]]()
            result.subResult = subResults
            subStatisResults.foreach { case (subStatis, subResult) =>
              subResults += subResult
              result.pass(subStatis.passed)
              if (subStatis.isSuccessful) {
                result.isSuccessful = true
                result.msg = AssertResult.MSG_PASSED
              }
            }
            result
          })
        } else {
          // NOTE(review): an empty assertion list yields a null result; callers must handle it.
          Future.successful(null)
        }
      case _ =>
        Future.successful(FailAssertResult(1, AssertResult.msgIncomparableTargetType(except)))
    }
  }
}
Example 163
Example 164
Source File: TriggerEventsSaveActor.scala From asura with MIT License | 5 votes |
Example 165
Source File: ActivitySaveActor.scala From asura with MIT License | 5 votes |
Example 166
Source File: HttpResponse.scala From asura with MIT License | 5 votes |
Example 167
Source File: RecommendService.scala From asura with MIT License | 5 votes |
Example 168
Source File: HomeService.scala From asura with MIT License | 5 votes |
Example 169
Source File: TriggerEventLogService.scala From asura with MIT License | 5 votes |
Example 170
Source File: IndexService.scala From asura with MIT License | 5 votes |
Example 171
Source File: ScalapropsRunner.scala From scalaprops with MIT License | 5 votes |
Example 172
Source File: SolrTableFactory.scala From solr-sql with BSD 3-Clause "New" or "Revised" License | 5 votes |
Example 173
Source File: TSQR.scala From SparkAndMPIFactorizations with MIT License | 5 votes |
Example 174
Source File: ParallelizedWithLocalityRDD.scala From cloud-integration with Apache License 2.0 | 5 votes |
Example 175
Source File: DelTransfer.scala From bdg-sequila with Apache License 2.0 | 5 votes |
Example 176
Source File: MDTagParser.scala From bdg-sequila with Apache License 2.0 | 5 votes |
Example 177
Source File: NCListBuilder.scala From bdg-sequila with Apache License 2.0 | 5 votes |
Example 178
Source File: NCListTree.scala From bdg-sequila with Apache License 2.0 | 5 votes |
Example 179
Source File: CoverageUpdate.scala From bdg-sequila with Apache License 2.0 | 5 votes |
Example 180
Source File: BufferBenchmark.scala From sigmastate-interpreter with MIT License | 5 votes |
Example 181
Source File: HBaseLocalClient.scala From gimel with Apache License 2.0 | 5 votes |
Example 182
Source File: FriendEntity.scala From lagom-scala-chirper with Apache License 2.0 | 5 votes |
Example 183
Source File: TestContext.scala From swave with Mozilla Public License 2.0 | 5 votes |
Example 184
Source File: ercesiMIPSRunner.scala From ercesiMIPS with GNU General Public License v3.0 | 5 votes |
Example 185
Source File: GzetPersons.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
Example 186
Source File: IncrementalSeq.scala From inox with Apache License 2.0 | 5 votes |
Example 187
Source File: MatchCollector.scala From piglet with Apache License 2.0 | 5 votes |
Example 188
Source File: NFAStructure.scala From piglet with Apache License 2.0 | 5 votes |
Example 189
Source File: FlinkStreamingCEPTest.scala From piglet with Apache License 2.0 | 5 votes |
Example 190
Source File: Cross.scala From piglet with Apache License 2.0 | 5 votes |
Example 191
package dbis.piglet.op

import dbis.piglet.schema._

import scala.collection.mutable.ArrayBuffer

/**
 * ZIP operator: combines the tuples of its inputs, or pairs a single input with an index.
 *
 * @param out       the output pipe
 * @param in        the input pipes
 * @param withIndex whether an `index` field is appended to the single input's fields
 */
case class Zip(
    private val out: Pipe,
    private val in: List[Pipe],
    withIndex: Boolean
  ) extends PigOperator(List(out), in) {

  require((in.size > 1 && !withIndex) || (in.size == 1 && withIndex),
    "zip with index works only with one input. Otherwise we must have at least two inputs")

  override def lineageString: String = {
    s"""ZIP%$withIndex""" + super.lineageString
  }

  /**
   * The result schema is a bag of tuples holding the fields of all inputs in input order,
   * each with the producing pipe's name prepended to its lineage, followed by a long-typed
   * `index` field when `withIndex` is set.
   *
   * Throws `UnsupportedOperationException` when an input has no schema.
   */
  override def constructSchema: Option[Schema] = {
    val newFields = inputs.flatMap(p => p.producer.schema match {
      case Some(s) => s.fields.map { f => Field(f.name, f.fType, p.name :: f.lineage) }
      case None =>
        throw new UnsupportedOperationException(s"Cannot zip with unknown Schema! (input pipe $p)")
    })

    val allFields = if (withIndex) newFields :+ Field("index", Types.LongType) else newFields
    schema = Some(Schema(BagType(TupleType(allFields.toArray))))
    schema
  }

  override def toString =
    s"""ZIP
       |  out = ${outPipeNames.mkString(",")}
       |  in = ${inPipeNames.mkString(",")}
       |  withIndex = $withIndex""".stripMargin
}
Source File: SpatialJoin.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.op

import dbis.piglet.expr.SpatialJoinPredicate
import dbis.piglet.op.IndexMethod.IndexMethod
import dbis.piglet.op.PartitionMethod.PartitionMethod
import dbis.piglet.schema._

import scala.collection.mutable.ArrayBuffer

/**
 * SPATIALJOIN operator.
 *
 * @param out        the output pipe
 * @param in         the input pipes
 * @param predicate  the spatial join predicate
 * @param index      optional index method with its parameters
 * @param leftParti  optional partition method (with parameters) for the left input
 * @param rightParti optional partition method (with parameters) for the right input
 */
case class SpatialJoin(
    private val out: Pipe,
    private val in: List[Pipe],
    predicate: SpatialJoinPredicate,
    index: Option[(IndexMethod, List[String])],
    leftParti: Option[(PartitionMethod, List[String])],
    rightParti: Option[(PartitionMethod, List[String])]
  ) extends PigOperator(List(out), in) {

  override def lineageString: String = {
    s"""SPATIALJOIN%${predicate.toString()}%$index%""" + super.lineageString
  }

  /**
   * The result schema is a bag of tuples holding the fields of all inputs in input order,
   * each with the producing pipe's name prepended to its lineage. For an indexed input the
   * payload tuple's fields are used. An input without schema contributes one unnamed
   * bytearray field.
   */
  override def constructSchema: Option[Schema] = {
    val newFields = ArrayBuffer[Field]()
    inputs.foreach(p => p.producer.schema match {
      case Some(s) =>
        val sourceFields =
          if (s.isIndexed) {
            s.element.valueType.asInstanceOf[IndexType] // a bag of Indexes
              .valueType.fields // An Index contains tuples with two fields: indexed column and payload
              .last.fType.asInstanceOf[TupleType] // payload is again a tuple
              .fields // fields in each tuple
          } else {
            s.fields
          }
        newFields ++= sourceFields.map { f => Field(f.name, f.fType, p.name :: f.lineage) }
      case None =>
        newFields += Field("", Types.ByteArrayType)
    })
    schema = Some(Schema(BagType(TupleType(newFields.toArray))))
    schema
  }

  // NOTE(review): the inSchema expression is presumed to list each input's producer schema — confirm.
  override def toString =
    s"""SPATIALJOIN
       |  out = $outPipeName
       |  in = ${inPipeNames.mkString(",")}
       |  inSchema = {${inputs.map(_.producer.schema).mkString(",")}}
       |  outSchema = $schema
       |  predicate = $predicate
       |  index = $index""".stripMargin
}
Example 193
package dbis.piglet.op

import dbis.piglet.schema._

import scala.collection.mutable.ArrayBuffer

/**
 * UNION operator: concatenates its inputs.
 *
 * @param out the output pipe
 * @param in  the input pipes
 */
case class Union(private val out: Pipe, private val in: List[Pipe]) extends PigOperator(List(out), in) {

  override def lineageString: String = {
    s"""UNION%""" + super.lineageString
  }

  /**
   * Derives the output schema from the input schemas:
   *  - if any input schema is unknown, the result is `None`;
   *  - if all inputs have the same number of fields, the result takes the first input's
   *    field names and, per position, the escalated type of all inputs;
   *  - otherwise (field counts differ) the result is `None`.
   */
  override def constructSchema: Option[Schema] = {
    val bagType = (p: Pipe) => p.producer.schema.get.element

    // Merge two bag types position by position, keeping the left field name.
    val generalizedBagType = (b1: BagType, b2: BagType) => {
      require(b1.valueType.fields.length == b2.valueType.fields.length)
      val merged = b1.valueType.fields.zip(b2.valueType.fields).map { case (f1, f2) =>
        Field(f1.name, Types.escalateTypes(f1.fType, f2.fType))
      }
      BagType(TupleType(merged.toArray))
    }

    schema =
      if (inputs.exists(p => p.producer.schema.isEmpty)) {
        // case 1: one of the input schemas isn't known -> output schema = None
        None
      } else {
        val s1 = inputs.head.producer.schema.get
        if (inputs.tail.forall(p => s1.fields.length == p.producer.schema.get.fields.length)) {
          // case 2: all input schemas have the same number of fields
          val typeList = inputs.map(p => bagType(p))
          Some(Schema(typeList.reduceLeft(generalizedBagType)))
        } else {
          // case 3: the number of fields differ
          None
        }
      }
    schema
  }

  override def toString =
    s"""UNION
       |  out = $outPipeName
       |  in = { ${inPipeNames.mkString(",")} }
       |  inSchema = $inputSchema
       |  outSchema = $schema""".stripMargin
}
Example 194
package dbis.piglet.codegen.flink.emitter import dbis.piglet.codegen.{ CodeEmitter, CodeGenContext, CodeGenException } import dbis.piglet.expr.Ref import dbis.piglet.op.Join import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Set import dbis.piglet.codegen.scala_lang.ScalaEmitter import scala.collection.mutable.ListBuffer import dbis.piglet.codegen.flink.FlinkHelper class JoinEmitter extends dbis.piglet.codegen.scala_lang.JoinEmitter { override def template: String = """ val <out> = <rel1><rels, rel1_keys, rel2_keys:{ r,k1, k2 | .join(<r>).where(<k1>).equalTo(<k2>)}>.map{ | t => | val <pairs> = t | <class>(<fields>) | }""".stripMargin override def code(ctx: CodeGenContext, op: Join): String = { if (!op.schema.isDefined) throw CodeGenException("schema required in JOIN") val res = val keys = { case (i, k) => { x => s"_${FlinkHelper.getOrderIndex(i.producer.schema, x)}" } } var keysGroup: ListBuffer[(List[String], List[String])] = new ListBuffer for (i <- 0 until keys.length - 1) { val v = (keys(i), keys(i + 1)) keysGroup += v } val keysGroup1 = { case (i, k) => if (k > 0) (FlinkHelper.printQuote( { x => s"_$k.$x" }), FlinkHelper.printQuote(i._2)) else (FlinkHelper.printQuote(i._1), FlinkHelper.printQuote(i._2)) } val keys1 = => x._1) val keys2 = => x._2) val className = op.schema match { case Some(s) => ScalaEmitter.schemaClassName(s.className) case None => ScalaEmitter.schemaClassName(op.outPipeName) } var pairs = "(v1,v2)" for (i <- 3 to op.inputs.length) { pairs = s"($pairs,v$i)" } val fieldList = ArrayBuffer[String]() for (i <- 1 to op.inputs.length) { op.inputs(i - 1).producer.schema match { case Some(s) => fieldList ++= { case (f, k) => s"v$i._$k" } case None => fieldList += s"v$i._0" } } render( Map("out" -> op.outPipeName, "rel1" ->, "class" -> className, "rels" ->, "pairs" -> pairs, "rel1_keys" -> keys1, "rel2_keys" -> keys2, "fields" -> fieldList.mkString(", "))) } } object JoinEmitter { lazy val instance = new JoinEmitter }
Example 195
package dbis.piglet.codegen.flink

import dbis.piglet.codegen.CodeGenException
import dbis.piglet.expr.NamedField
import dbis.piglet.expr.PositionalField
import dbis.piglet.schema.Schema
import dbis.piglet.expr.Ref
import dbis.piglet.op.PigOperator
import scala.collection.mutable.ArrayBuffer

/** Helpers shared by the Flink code emitters. */
object FlinkHelper {

  /**
   * Resolves `ref` to a field position within `schema`.
   *
   * @return the schema's index for a named field, the position itself for a positional
   *         field, and 0 for any other kind of reference
   * Throws `CodeGenException` when no schema is available.
   */
  def getOrderIndex(schema: Option[Schema], ref: Ref): Int = schema match {
    case Some(s) => ref match {
      case nf: NamedField => s.indexOfField(nf)
      case PositionalField(pos) => pos
      case _ => 0
    }
    case None =>
      throw new CodeGenException(
        s"the Flink OrderBy/Join operator needs a schema, thus, invalid field $ref")
  }

  /**
   * Builds the tuple pattern and the flattened field list for a join result.
   *
   * With exactly two inputs the pattern is `(v,w)` and the fields are `v._i` / `w._j`.
   * Otherwise the pattern nests left-deep (`((v1,v2),v3)` …) and the fields are `vN._k`;
   * an input without schema contributes the single field `vN._0`.
   *
   * @return `(pairs, fields)`, with fields joined by `", "`
   */
  def emitJoinFieldList(node: PigOperator): (String, String) = {
    val rels = node.inputs
    if (rels.length == 2) {
      val vsize = rels.head.inputSchema.get.fields.length
      val fields = node.schema.get.fields.zipWithIndex
        .map { case (_, i) => if (i < vsize) s"v._$i" else s"w._${i - vsize}" }
        .mkString(", ")
      ("(v,w)", fields)
    } else {
      val pairs = (3 to rels.length).foldLeft("(v1,v2)")((acc, i) => s"($acc,v$i)")
      val fieldList = ArrayBuffer[String]()
      for (i <- 1 to rels.length) {
        rels(i - 1).producer.schema match {
          case Some(s) => fieldList ++= s.fields.zipWithIndex.map { case (_, k) => s"v$i._$k" }
          case None => fieldList += s"v$i._0"
        }
      }
      (pairs, fieldList.mkString(", "))
    }
  }

  /** Wraps every value in double quotes and joins them with commas; `""` for an empty list. */
  def printQuote(values: List[String]): String = values.mkString("\"", "\",\"", "\"")
}
Example 196
package org.dl4scala.examples.nlp.tsne

import java.io.File

import org.datavec.api.util.ClassPathResource
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.deeplearning4j.models.sequencevectors.sequence.SequenceElement
import org.deeplearning4j.models.word2vec.wordstore.VocabCache
import org.nd4j.linalg.api.buffer.DataBuffer
import org.nd4j.linalg.api.buffer.util.DataTypeUtil
import org.nd4j.linalg.primitives
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

/** Loads word vectors from `words.txt`, runs Barnes-Hut t-SNE on them and writes the coordinates to a CSV file. */
object TSNEStandardExample {
  private val log = LoggerFactory.getLogger(TSNEStandardExample.getClass)

  def main(args: Array[String]): Unit = {
    // STEP 1: Initialization
    val iterations = 100
    // create an n-dimensional array of doubles
    DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE)
    // cacheList is a dynamic array of strings used to hold all words
    val cacheList = new ArrayBuffer[String]()

    // STEP 2: Turn text input into a list of words
    log.info("Load & Vectorize data....")
    val wordFile = new ClassPathResource("words.txt").getFile // Open the file
    // Get the data of all unique word vectors
    val vectors: primitives.Pair[InMemoryLookupTable[_ <: SequenceElement], VocabCache[_ <: SequenceElement]] =
      WordVectorSerializer.loadTxt(wordFile)
    val cache = vectors.getSecond
    val weights = vectors.getFirst.getSyn0 // separate weights of unique words into their own list

    (0 until cache.numWords()).foreach(i => cacheList.append(cache.wordAtIndex(i)))

    import org.deeplearning4j.plot.BarnesHutTsne

    // STEP 3: build a dual-tree tsne to use later
    log.info("Build model....")
    val tsne = new BarnesHutTsne.Builder()
      .setMaxIter(iterations)
      .theta(0.5)
      .normalize(false)
      .learningRate(500)
      .useAdaGrad(false)
      .build

    // STEP 4: establish the tsne values and save them to a file
    log.info("Store TSNE Coordinates for Plotting....")
    val outputFile = "target/archive-tmp/tsne-standard-coords.csv"
    new File(outputFile).getParentFile.mkdirs()

    // The model has to be fitted on the word weights before there are coordinates to save.
    tsne.fit(weights)
    tsne.saveAsFile(cacheList.asJava, outputFile)
  }
}
Example 197
package org.dl4scala.examples.feedforward.anomalydetection

import java.awt.{GridLayout, Image}
import java.awt.image.BufferedImage
import javax.swing.{ImageIcon, JFrame, JLabel, JPanel}

import org.nd4j.linalg.api.ndarray.INDArray

import scala.collection.mutable.ArrayBuffer

/**
 * Shows a set of digit images in a Swing window, laid out on a grid.
 *
 * @param imageScale factor by which each 28x28 image is enlarged
 * @param digits     one array per image; entries 0..783 are read as grey levels scaled by 255
 * @param title      window title
 * @param gridWidth  number of grid columns
 */
class MNISTVisualizer(imageScale: Double, digits: ArrayBuffer[INDArray], title: String, gridWidth: Int) {

  /** Uses a grid that is five images wide. */
  def this(imageScale: Double, digits: ArrayBuffer[INDArray], title: String) = {
    this(imageScale, digits, title, 5)
  }

  /** Opens the window; closing it exits the JVM (`EXIT_ON_CLOSE`). */
  def visualize(): Unit = {
    val grid = new JPanel(new GridLayout(0, gridWidth))
    getComponents.foreach(label => grid.add(label))

    val window = new JFrame(title)
    window.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE)
    window.add(grid)
    window.setVisible(true)
    window.pack()
  }

  /** Renders every digit into a scaled grey-level image wrapped in a label. */
  def getComponents: ArrayBuffer[JLabel] = digits.map { pixels =>
    val picture = new BufferedImage(28, 28, BufferedImage.TYPE_BYTE_GRAY)
    val raster = picture.getRaster
    // Pixels are stored row by row: entry (row * 28 + col) belongs to column col of row `row`.
    for (row <- 0 until 28; col <- 0 until 28) {
      raster.setSample(col, row, 0, (255 * pixels.getDouble(row * 28 + col)).toInt)
    }
    val side = (imageScale * 28).toInt
    val enlarged = new ImageIcon(picture).getImage.getScaledInstance(side, side, Image.SCALE_REPLICATE)
    new JLabel(new ImageIcon(enlarged))
  }
}
Example 198
package import com.esotericsoftware.kryo.Kryo import{Input, Output} import import com.github.nearbydelta.deepspark.layer.InputLayer import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ParSeq class GeneralNetwork[In, Out](var inputLayer: InputLayer[In, _]) extends Network[In, Out] { @deprecated(message = "This is for kryo deserialization. Please use this(inputlayer)") def this() = this(null) override def NOut: Int = layerSeq.lastOption match { case Some(x) ⇒ x.NOut case None if inputLayer != null ⇒ inputLayer.NOut case None ⇒ 0 } override def backward(error: ParSeq[DataVec]): ArrayBuffer[() ⇒ Unit] = { val (upper, fseq) = backwardSeq(error) val (x, f) = inputLayer backward upper fseq ++= f.seq fseq } override def broadcast(sc: SparkContext): Unit = { inputLayer.broadcast(sc) super.broadcast(sc) } override def forward(in: In) = { val out = inputLayer.forward(in) forwardSingle(out) } override def forward(in: ParSeq[In]): ParSeq[DataVec] = { val out = inputLayer.forward(in) forwardSeq(out) } override def forward(in: RDD[(Long, In)]): RDD[(Long, DataVec)] = { val out = inputLayer.forward(in) broadcast(in.context) forwardRDD(out) } override def initiateBy(builder: WeightBuilder): this.type = { inputLayer.initiateBy(builder) super.initiateBy(builder) this } override def loss: Double = super.loss + inputLayer.loss override def read(kryo: Kryo, input: Input): Unit = { inputLayer = kryo.readClassAndObject(input).asInstanceOf[InputLayer[In, _]], input) } override def setUpdatable(bool: Boolean): Network[In, Out] = { inputLayer.setUpdatable(bool) super.setUpdatable(bool) } override def unbroadcast(): Unit = { inputLayer.unbroadcast() super.unbroadcast() } override def write(kryo: Kryo, output: Output): Unit = { kryo.writeClassAndObject(output, inputLayer) super.write(kryo, output) } }
Example 199
package import com.fasterxml.aalto.{AsyncByteArrayFeeder, AsyncXMLStreamReader} import scala.collection.mutable.ArrayBuffer trait ExtractStageHelpers { def update(xmlElementsLst: scala.collection.mutable.Set[XMLGroupElement], path: ArrayBuffer[String], newValue: Some[String]): Unit = { val elementsWithoutAnyValueForGivenPath = xmlElementsLst.collect { case e: XMLGroupElement if (e.xPath == path.toList) && e.value.isEmpty => e } XMLGroupElement) => { xmlElementsLst.remove(ele) val newElement = ele.copy(value = newValue) xmlElementsLst.add(newElement) }) } def getCompletedXMLElements(xmlElementsLst: scala.collection.mutable.Set[XMLGroupElement]): scala.collection.mutable.Set[XMLGroupElement] = { val completedElements = xmlElementsLst.collect { case e if !(e.xPath.nonEmpty && e.value.isEmpty) => e } completedElements.foreach({ xmlElementsLst -= _ }) completedElements } }
Example 200
package import com.fasterxml.aalto.{AsyncByteArrayFeeder, AsyncXMLStreamReader} import scala.collection.mutable.ArrayBuffer trait StreamHelper { def update(xmlElementsLst: scala.collection.mutable.Set[XMLElement], path: ArrayBuffer[String], newValue: Some[String]): Unit = { val elementsWithoutAnyValueForGivenPath = xmlElementsLst.collect { case e: XMLElement if (e.xPath == path.toList) && e.value.isEmpty => e } XMLElement) => { xmlElementsLst.remove(ele) val newElement = ele.copy(value = newValue) xmlElementsLst.add(newElement) }) } def getCompletedXMLElements(xmlElementsLst: scala.collection.mutable.Set[XMLElement]): scala.collection.mutable.Set[XMLElement] = { val completedElements = xmlElementsLst.collect { case e if !(e.xPath.nonEmpty && (e.value.isEmpty && e.attributes.isEmpty)) => e } completedElements.foreach({ xmlElementsLst -= _ }) completedElements } def getUpdatedElement(xPath: Seq[String], attributes: Map[String, String], elemText: String) (implicit reader: AsyncXMLStreamReader[AsyncByteArrayFeeder]): String = { val prefix = getPrefix val startElement = attributes.foldLeft(s"<$prefix${xPath.last}") { case (s, (k, v)) => s"""$s $k="$v"""" } + ">" val value = elemText val endElement = getEndElement(xPath, prefix) s"$startElement$value$endElement" } private def getPrefix(implicit reader: AsyncXMLStreamReader[AsyncByteArrayFeeder]): String = Option(reader.getPrefix) match { case Some(pre) if pre.nonEmpty => s"$pre:" case _ => "" } private def getEndElement(xPath: Seq[String], prefix: String) = s"</$prefix${xPath.last}>" }