org.apache.spark.storage.StorageLevel Scala Examples
The following examples show how to use org.apache.spark.storage.StorageLevel.
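Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern they all share: choosing a StorageLevel and passing it to persist. The application name, master URL, and data are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

object StorageLevelQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("StorageLevelQuickStart")
      .master("local[*]")
      .getOrCreate()

    val numbers = spark.sparkContext.parallelize(1 to 1000)

    // Keep the RDD deserialized in memory, spilling partitions to disk if they do not fit.
    numbers.persist(StorageLevel.MEMORY_AND_DISK)

    // The storage level only takes effect once an action materializes the RDD.
    println(numbers.sum())

    numbers.unpersist()
    spark.stop()
  }
}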
Example 1
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
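The excerpt above keeps only the lazily created SparkSession singleton from SqlNetworkWordCount. For context, the sketch below shows roughly how that singleton is used from a socket stream persisted with MEMORY_AND_DISK_SER; it is a sketch based on the standard Spark example, not the original file, and the host, port, and Record case class are assumptions. It relies on the imports shown in the snippet above.

case class Record(word: String)

object SqlNetworkWordCountSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Receive text over a socket; serialized, disk-backed storage as in the original example.
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))

    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Reuse a single SparkSession across batches via the singleton.
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      val wordsDataFrame = rdd.map(w => Record(w)).toDF()
      wordsDataFrame.createOrReplaceTempView("words")
      spark.sql("select word, count(*) as total from words group by word").show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}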
Example 2
Source File: CustomReceiver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
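The receive() method above is shown without its enclosing class. In Spark's CustomReceiver example it sits inside a Receiver subclass whose constructor fixes the storage level, roughly as sketched here; this is only the surrounding declaration, not the complete file.

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Start a thread that connects and calls receive() until the receiver is stopped.
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // Nothing to do here: receive() exits on its own once isStopped() returns true.
  }

  // private def receive() { ... } as shown above
}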
Example 3
Source File: FlumeEventCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.flume._
import org.apache.spark.util.IntParam

object FlumeEventCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(
        "Usage: FlumeEventCount <host> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val Array(host, IntParam(port)) = args

    val batchInterval = Milliseconds(2000)

    // Create the context and set the batch size
    val sparkConf = new SparkConf().setAppName("FlumeEventCount")
    val ssc = new StreamingContext(sparkConf, batchInterval)

    // Create a flume stream
    val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2)

    // Print out the count of events received from this server in each batch
    stream.count().map(cnt => "Received " + cnt + " flume events.").print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 4
Source File: RawNetworkGrep.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.util.IntParam

object RawNetworkGrep {
  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args
    val sparkConf = new SparkConf().setAppName("RawNetworkGrep")

    // Create the context
    val ssc = new StreamingContext(sparkConf, Duration(batchMillis))

    val rawStreams = (1 to numStreams).map(_ =>
      ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray
    val union = ssc.union(rawStreams)
    union.filter(_.contains("the")).count().foreachRDD(r =>
      println("Grep count: " + r.collect().mkString))

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 5
Source File: NetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 6
Source File: GraphLoader.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkContext
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] = {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile
}
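A short call-site sketch for the loader above, showing how the two storage-level parameters are overridden. The SparkContext and the input path are assumptions; the argument names mirror the signature in the snippet above.

// Sketch: load an edge list and keep both edge and vertex partitions serialized in memory.
val graph = GraphLoader.edgeListFile(
  sc,
  "hdfs:///data/followers.txt", // hypothetical path
  canonicalOrientation = true,
  numEdgePartitions = 8,
  edgeStorageLevel = StorageLevel.MEMORY_ONLY_SER,
  vertexStorageLevel = StorageLevel.MEMORY_ONLY_SER)

println(s"Loaded ${graph.numEdges} edges and ${graph.numVertices} vertices")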
Example 7
Source File: EdgeRDDSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel

class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext {

  test("cache, getStorageLevel") {
    // test to see if getStorageLevel returns correct value after caching
    withSpark { sc =>
      val verts = sc.parallelize(List((0L, 0), (1L, 1), (1L, 2), (2L, 3), (2L, 3), (2L, 3)))
      val edges = EdgeRDD.fromEdges(sc.parallelize(List.empty[Edge[Int]]))
      assert(edges.getStorageLevel == StorageLevel.NONE)
      edges.cache()
      assert(edges.getStorageLevel == StorageLevel.MEMORY_ONLY)
    }
  }

}
Example 8
Source File: PeriodicGraphCheckpointer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel

private[mllib] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.persist()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.persist()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
}
Example 9
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 10
Source File: KafkaStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() stream.map(_._2).countByValue().foreachRDD { r => r.collect().foreach { kv => result.synchronized { val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(result.synchronized { sent === result }) } } }
Example 11
Source File: FlumeInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.net.InetSocketAddress import java.nio.ByteBuffer import java.util.concurrent.Executors import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.avro.ipc.NettyServer import org.apache.avro.ipc.specific.SpecificResponder import org.apache.flume.source.avro.{AvroFlumeEvent, AvroSourceProtocol, Status} import org.jboss.netty.channel.{ChannelPipeline, ChannelPipelineFactory, Channels} import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.Utils private[streaming] class FlumeInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, storageLevel: StorageLevel, enableDecompression: Boolean ) extends ReceiverInputDStream[SparkFlumeEvent](_ssc) { override def getReceiver(): Receiver[SparkFlumeEvent] = { new FlumeReceiver(host, port, storageLevel, enableDecompression) } } private[streaming] class CompressionChannelPipelineFactory extends ChannelPipelineFactory { def getPipeline(): ChannelPipeline = { val pipeline = Channels.pipeline() val encoder = new ZlibEncoder(6) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) pipeline } } }
Example 12
Source File: FlumeStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 13
Source File: DatasetCacheSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.storage.StorageLevel class DatasetCacheSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("get storage level") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") // default storage level ds1.persist() ds2.cache() assert(ds1.storageLevel == StorageLevel.MEMORY_AND_DISK) assert(ds2.storageLevel == StorageLevel.MEMORY_AND_DISK) // unpersist ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE) // non-default storage level ds1.persist(StorageLevel.MEMORY_ONLY_2) assert(ds1.storageLevel == StorageLevel.MEMORY_ONLY_2) // joined Dataset should not be persisted val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") assert(joined.storageLevel == StorageLevel.NONE) } test("persist and unpersist") { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int]) val cached = ds.cache() // count triggers the caching action. It should not throw. cached.count() // Make sure, the Dataset is indeed cached. assertCached(cached) // Check result. checkDataset( cached, 2, 3, 4) // Drop the cache. cached.unpersist() assert(cached.storageLevel == StorageLevel.NONE, "The Dataset should not be cached.") } test("persist and then rebind right encoder when join 2 datasets") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") ds1.persist() assertCached(ds1) ds2.persist() assertCached(ds2) val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") checkDataset(joined, ("2", 2)) assertCached(joined, 2) ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE, "The Dataset ds1 should not be cached.") ds2.unpersist() assert(ds2.storageLevel == StorageLevel.NONE, "The Dataset ds2 should not be cached.") } test("persist and then groupBy columns asKey, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(_._1) val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) } agged.persist() checkDataset( agged.filter(_._1 == "b"), ("b", 3)) assertCached(agged.filter(_._1 == "b")) ds.unpersist() assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.") agged.unpersist() assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } }
Example 14
Source File: WindowedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.Duration private[streaming] class WindowedDStream[T: ClassTag]( parent: DStream[T], _windowDuration: Duration, _slideDuration: Duration) extends DStream[T](parent.ssc) { if (!_windowDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The window duration of windowed DStream (" + _windowDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } if (!_slideDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The slide duration of windowed DStream (" + _slideDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } // Persist parent level by default, as those RDDs are going to be obviously reused. parent.persist(StorageLevel.MEMORY_ONLY_SER) def windowDuration: Duration = _windowDuration override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = _slideDuration override def parentRememberDuration: Duration = rememberDuration + windowDuration override def persist(level: StorageLevel): DStream[T] = { // Do not let this windowed DStream be persisted as windowed (union-ed) RDDs share underlying // RDDs and persisting the windowed RDDs would store numerous copies of the underlying data. // Instead control the persistence of the parent DStream. parent.persist(level) this } override def compute(validTime: Time): Option[RDD[T]] = { val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime) val rddsInWindow = parent.slice(currentWindow) Some(ssc.sc.union(rddsInWindow)) } }
Example 15
Source File: SocketInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io._ import java.net.{ConnectException, Socket} import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.NextIterator private[streaming] class SocketInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](_ssc) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { private var socket: Socket = _ def onStart() { logInfo(s"Connecting to $host:$port") try { socket = new Socket(host, port) } catch { case e: ConnectException => restart(s"Error connecting to $host:$port", e) return } logInfo(s"Connected to $host:$port") // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // in case restart thread close it twice synchronized { if (socket != null) { socket.close() socket = null logInfo(s"Closed socket to $host:$port") } } } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8)) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
Example 16
Source File: BlockTransferService.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Future, Promise}
import scala.concurrent.duration.Duration
import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient}
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.util.ThreadUtils

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel,
      classTag: ClassTag[_]): Unit = {
    val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag)
    ThreadUtils.awaitResult(future, Duration.Inf)
  }
}
Example 17
Source File: NettyBlockRpcServer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, MapOutputReady, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.scheduler.MapStatus import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) case mapOutputReady: MapOutputReady => val mapStatus: MapStatus = serializer.newInstance().deserialize(ByteBuffer.wrap(mapOutputReady.serializedMapStatus)) blockManager.mapOutputReady( mapOutputReady.shuffleId, mapOutputReady.mapId, mapOutputReady.numReduces, mapStatus) } } override def getStreamManager(): StreamManager = streamManager }
Example 18
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark._ import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockManagerId import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.storage.StorageLevel private[spark] object FutureTaskNotifier extends Logging { def taskCompleted( status: MapStatus, mapId: Int, shuffleId: Int, numReduces: Int, nextStageLocs: Option[Seq[BlockManagerId]], shuffleWriteMetrics: ShuffleWriteMetrics, skipZeroByteNotifications: Boolean): Unit = { if (!nextStageLocs.isEmpty && numReduces == nextStageLocs.get.length) { val drizzleRpcsStart = System.nanoTime sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs, skipZeroByteNotifications) shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart) } else { logInfo( s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces") } } // Push metadata saying that this map task finished, so that the tasks in the next stage // know they can begin pulling the data. private def sendMapStatusToNextTaskLocations( status: MapStatus, mapId: Int, shuffleId: Int, numReduces: Int, nextStageLocs: Option[Seq[BlockManagerId]], skipZeroByteNotifications: Boolean) { val numReduces = nextStageLocs.get.length val uniqueLocations = if (skipZeroByteNotifications) { nextStageLocs.get.zipWithIndex.filter { x => status.getSizeForBlock(x._2) != 0L }.map(_._1).toSet } else { nextStageLocs.get.toSet } uniqueLocations.foreach { blockManagerId => try { SparkEnv.get.blockManager.blockTransferService.mapOutputReady( blockManagerId.host, blockManagerId.port, shuffleId, mapId, numReduces, status) } catch { case e: Exception => logWarning(s"Failed to send map outputs to $blockManagerId", e) } } } }
Example 19
Source File: SparkContextInfoSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import org.scalatest.Assertions import org.apache.spark.storage.StorageLevel class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { test("getPersistentRDDs only returns RDDs that are marked as cached") { sc = new SparkContext("local", "test") assert(sc.getPersistentRDDs.isEmpty === true) val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(sc.getPersistentRDDs.isEmpty === true) rdd.cache() assert(sc.getPersistentRDDs.size === 1) assert(sc.getPersistentRDDs.values.head === rdd) } test("getPersistentRDDs returns an immutable map") { sc = new SparkContext("local", "test") val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() val myRdds = sc.getPersistentRDDs assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) // myRdds2 should have 2 RDDs, but myRdds should not change val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache() val myRdds2 = sc.getPersistentRDDs assert(myRdds2.size === 2) assert(myRdds2(0) === rdd1) assert(myRdds2(1) === rdd2) assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) } test("getRDDStorageInfo only reports on RDDs that actually persist data") { sc = new SparkContext("local", "test") val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.size === 0) rdd.collect() assert(sc.getRDDStorageInfo.size === 1) assert(sc.getRDDStorageInfo.head.isCached) assert(sc.getRDDStorageInfo.head.memSize > 0) assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY) } test("call sites report correct locations") { sc = new SparkContext("local", "test") testPackage.runCallSiteTest(sc) } } package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "makeRDD") assert(file === "SparkContextInfoSuite.scala") line.toInt case _ => fail("Did not match expected call site format") } curCallSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "getCallSite") // this is correct because we called it from outside of Spark assert(file === "SparkContextInfoSuite.scala") assert(line.toInt === rddCreationLine.toInt + 2) case _ => fail("Did not match expected call site format") } } }
Example 20
Source File: package.scala From spark-lp with Apache License 2.0 | 5 votes |
  implicit object DVectorSpace extends VectorSpace[DVector] {

    override def combine(alpha: Double, a: DVector, beta: Double, b: DVector): DVector =
      if (alpha == 1.0 && beta == 1.0) {
        a.zip(b).map {
          case (aPart, bPart) => {
            BLAS.axpy(1.0, aPart, bPart) // bPart += aPart
            bPart
          }
        }
      } else {
        a.zip(b).map {
          case (aPart, bPart) =>
            // NOTE A DenseVector result is assumed here (not sparse safe).
            DenseVectorSpace.combine(alpha, aPart, beta, bPart).toDense
        }
      }

    override def dot(a: DVector, b: DVector): Double = a.dot(b)

    override def entrywiseProd(a: DVector, b: DVector): DVector = {
      a.zip(b).map {
        case (aPart, bPart) => DenseVectorSpace.entrywiseProd(aPart, bPart).toDense
      }
    }

    override def entrywiseNegDiv(a: DVector, b: DVector): DVector = {
      a.zip(b).map {
        case (aPart, bPart) => DenseVectorSpace.entrywiseNegDiv(aPart, bPart)
      }
    }

    override def sum(a: DVector): Double = a.aggregate(0.0)(
      seqOp = (acc: Double, v: DenseVector) => acc + v.values.sum,
      combOp = (acc1: Double, acc2: Double) => acc1 + acc2
    )

    override def min(a: DVector): Double = a.aggregate(Double.PositiveInfinity)(
      (mi, x) => Math.min(mi, x.values.min),
      Math.min
    )

    override def max(a: DVector): Double = a.aggregate(Double.NegativeInfinity)(
      (ma, x) => Math.max(ma, x.values.max),
      Math.max
    )

    override def cache(a: DVector): Unit =
      if (a.getStorageLevel == StorageLevel.NONE) {
        a.cache()
      }
  }
}
Example 21
Source File: LinopMatrixAdjoint.scala From spark-lp with Apache License 2.0 | 5 votes |
override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } , depth ) } }
Example 22
Source File: SpLinopMatrix.scala From spark-lp with Apache License 2.0 | 5 votes |
  override def apply(mat: DMatrix): DMatrix = {
    dvector.zipPartitions(mat)((vectorPartition, matPartition) =>
      vectorPartition.next().values.toIterator.checkedZip(matPartition.toIterator).map {
        case (a: Double, x: Vector) =>
          val xc = x.copy
          BLAS.scal(a, xc)
          xc
      }
    )
  }
}
Example 23
Source File: Dictionary.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.ListBuffer def apply(corpus: RDD[String], tauL: Int, tauF: Int, useProbThld: Double): Dictionary = { //enumerate all the possible words: corpus -> words val words = corpus.flatMap { text => val permutations = ListBuffer[String]() for (i <- 1 to tauL) { for (j <- 0 until text.length) { if (j + i <= text.length) permutations += text.substring(j, j + i) } } permutations }.map(_ -> 1).reduceByKey(_ + _).filter { case (word, freq) => // leave the single characters in dictionary for smoothing reason even if they are low frequency word.length == 1 || freq >= tauF }.persist(StorageLevel.MEMORY_AND_DISK_SER_2) //filter words by the use probability threshold: words -> prunedWords val sumWordFreq = words.map(_._2).sum() val prunedWords = words.map { case (word, freq) => (word, freq, freq / sumWordFreq) }.filter { case (word, _, theta) => // leave the single characters in dictionary for smoothing reason even if they have small theta word.length == 1 || theta >= useProbThld } words.unpersist() prunedWords.persist(StorageLevel.MEMORY_AND_DISK_SER_2) //normalize the word use probability: prunedWords -> normalizedWords val sumPrunedWordFreq = prunedWords.map(_._2).sum() val normalizedWords = prunedWords.map { case (word, freq, _) => word -> freq / sumPrunedWordFreq }.collectAsMap().toMap prunedWords.unpersist() //return the overcomplete dictionary: normalizedWords -> dictionary new Dictionary(normalizedWords) } }
Example 24
Source File: FlumeStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.flume._ class FlumeStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Get data from flume" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var hostname:String =_ var port:Int=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String].toInt val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("hostname of the slave machine to which the flume data will be sent, the hostName must be one of the cluster worker node").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port of the slave machine to which the flume data will be sent, the port should be greater than 10000").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor descriptor = batchDuration :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/FlumeStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val flumeStream = FlumeUtils.createStream(ssc, hostname, port) flumeStream.map(e => new String(e.event.getBody.array(), "UTF-8")) } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 25
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver} import org.apache.spark.streaming.{Seconds, StreamingContext} class SocketTextStream extends ConfigurableStreamingStop { override val authorEmail: String = "[email protected]" override val description: String = "Receive text data from socket" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override var batchDuration: Int = _ var hostname:String =_ var port:String=_ //var schema:String=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String] //schema=MapUtil.get(map,key="schema").asInstanceOf[String] val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true) //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor //descriptor = schema :: descriptor descriptor = batchDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/SocketTextStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession](); val socketDF = spark .readStream .format("socket") .option("host",hostname) .option("port",port) .load() out.write(socketDF) } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port)) dstream.asInstanceOf[DStream[String]] } }
Example 26
Source File: KafkaWordCount.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.basic import java.util import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} object KafkaWordCount { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: <zkQuorum> <group> <topics> <numThreads>") System.exit(1) } val Array(zkQuorum, group, topics, numThreads) = args val sparkConf = new SparkConf().setAppName("KafkaWordCount"). set("spark.streaming.receiver.writeAheadLog.enable", "true"). set("spark.streaming.kafka.maxRatePerPartition", "1000") val ssc = new StreamingContext(sparkConf, Seconds(2)) // 设置 checkpoint,这是考虑到了有 window 操作,window 操作一般是需要进行 checkpoint ssc.checkpoint("checkpoint") val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap // createStream 返回的是一个 Tuple2,具有 key,value,这里只关注 value. // 注意这里是 Receiver-based 方式(还提供了 non-receiver 模式),默认配置下,这种方式是会在 receiver 挂掉 // 丢失数据的,需要设置 Write Ahead, 上面我们已经配置了, 那么存储 level 也可以进行相应调整. val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK_SER).map(_._2) val words = lines.flatMap(_.split(" ")) // 统计的是 10 分钟内的单词数量,每隔 10 秒统计 1 次 val wordCounts = words.map(x => (x, 1L)) .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2), 2). filter(x => x._2 > 0) wordCounts.print() ssc.start() ssc.awaitTermination() } } // Produces some random words between 1 and 100. object KafkaWordCountProducer { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: <metadataBrokerList> <topic> " + "<messagesPerSec> <wordsPerMessage>") System.exit(1) } // 需要注意的是这里是 broker list,为 host:port,host:port 形式 val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args // Zookeeper connection properties val props = new util.HashMap[String, Object]() props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers) props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer") props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer") val producer = new KafkaProducer[String, String](props) // Send some messages while (true) { (1 to messagesPerSec.toInt).foreach { messageNum => val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(100).toString) .mkString(" ") val message = new ProducerRecord[String, String](topic, null, str) producer.send(message) } Thread.sleep(1000) } } }
Example 27
Source File: PipeOptimizePersistAndName.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.optimize

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext

class PipeOptimizePersistAndName[A](
    rddname: String = null,
    newLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends PipeElementPassthrough[RDD[A]] {

  def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: SddfPipeContext => {
        input.persist(newLevel)
        if (rddname != null) {
          input.name = rddname
          pc.persistedRDDs += (rddname -> input)
          analysable.values += ("name" -> rddname)
        }
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeOptimizePersistAndName {
  def apply[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) = {
    new PipeOptimizePersistAndName[A](rddname, newLevel)
  }
}
Example 28
Source File: package.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.vs

import org.apache.spark.mllib.linalg.BLAS
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.VectorSpace
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace
import org.apache.spark.storage.StorageLevel

package object dvector {

  implicit object DVectorSpace extends VectorSpace[DVector] {

    override def combine(alpha: Double, a: DVector, beta: Double, b: DVector): DVector =
      if (alpha == 1.0 && beta == 0.0) {
        // When minimizing rather than maximizing, the TFOCS implementation frequently requests a
        // no-op linear combination where alpha == 1.0 and beta == 0.0. This case is specifically
        // optimized.
        a
      } else {
        a.zip(b).map {
          case (aPart, bPart) =>
            // NOTE A DenseVector result is assumed here (not sparse safe).
            DenseVectorSpace.combine(alpha, aPart, beta, bPart).toDense
        }
      }

    override def dot(a: DVector, b: DVector): Double = a.dot(b)

    override def cache(a: DVector): Unit =
      if (a.getStorageLevel == StorageLevel.NONE) {
        a.cache()
      }
  }
}
Example 29
Source File: SmoothDual.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.generic.double import org.apache.spark.mllib.optimization.tfocs.{ Mode, ProxCapableFunction, ProxMode, ProxValue, SmoothFunction, Value, VectorSpace } import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class SmoothDual[X](objectiveF: ProxCapableFunction[X], mu: Double, x0: X)( implicit vs: VectorSpace[X]) extends SmoothFunction[X] { vs.cache(x0) override def apply(ATz: X, mode: Mode): Value[X] = { val offsetCenter = vs.combine(mu, ATz, 1.0, x0) val ProxValue(proxF, Some(proxMinimizer)) = objectiveF(offsetCenter, mu, ProxMode(mode.f, true)) // Cache proxMinimizer when it will be required more than once. if (mode.f) vs.cache(proxMinimizer) val f = if (mode.f) { // TODO This might be optimized as a single spark job. val diff = vs.combine(1.0, x0, -1.0, proxMinimizer) Some(vs.dot(ATz, proxMinimizer) - proxF.get - (0.5 / mu) * vs.dot(diff, diff)) } else { None } val g = if (mode.g) { Some(vs.combine(-1.0, proxMinimizer, 0.0, proxMinimizer)) } else { None } Value(f, g) } }
Example 30
Source File: LinopMatrix.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector

import org.apache.spark.mllib.linalg.{ BLAS, DenseVector }
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint
import org.apache.spark.mllib.optimization.tfocs.LinearOperator
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.storage.StorageLevel

class LinopMatrix(private val matrix: DMatrix) extends LinearOperator[DenseVector, DVector] {

  if (matrix.getStorageLevel == StorageLevel.NONE) {
    matrix.cache()
  }

  override def apply(x: DenseVector): DVector = {
    val bcX = matrix.context.broadcast(x)
    // Take the dot product of each matrix row with x.
    // NOTE A DenseVector result is assumed here (not sparse safe).
    matrix.mapPartitions(partitionRows =>
      Iterator.single(new DenseVector(partitionRows.map(row => BLAS.dot(row, bcX.value)).toArray)))
  }

  override def t: LinearOperator[DVector, DenseVector] = new LinopMatrixAdjoint(matrix)
}
Example 31
Source File: LinopMatrixAdjoint.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector import org.apache.spark.mllib.linalg.BLAS import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.CheckedIteratorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.LinearOperator import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class LinopMatrixAdjoint(@transient private val matrix: DMatrix) extends LinearOperator[DVector, DenseVector] { if (matrix.getStorageLevel == StorageLevel.NONE) { matrix.cache() } private lazy val n = matrix.first().size override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } ) } override def t: LinearOperator[DenseVector, DVector] = new LinopMatrix(matrix) }
Example 32
Source File: SmoothLogLLogistic.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.{ Mode, SmoothFunction, Value } import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class SmoothLogLLogistic(y: DVector) extends SmoothFunction[DVector] { if (y.getStorageLevel == StorageLevel.NONE) { y.cache() } override def apply(mu: DVector, mode: Mode): Value[DVector] = { val f = if (mode.f) { // TODO Performance might be improved by reimplementing as a single aggregate rather than // mapping through an intermediate DVector and summing, which breaks per-element pipelining. Some(y.zipElements(mu, (y_i, mu_i) => { val yFactor = if (mu_i > 0.0) y_i - 1.0 else if (mu_i < 0.0) y_i else 0.0 yFactor * mu_i - math.log1p(math.exp(-math.abs(mu_i))) }).sum) } else { None } val g = if (mode.g) { Some(y.zipElements(mu, (y_i, mu_i) => { // Compute the log logistic loss gradient elementwise. val muFactor = if (mu_i > 0.0) 1.0 else math.exp(mu_i) y_i - muFactor / (1.0 + math.exp(-math.abs(mu_i))) })) } else { None } Value(f, g) } }
Example 33
Source File: ProxShiftRPlus.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double

import org.apache.spark.mllib.optimization.tfocs.{ ProxCapableFunction, ProxMode, ProxValue }
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.storage.StorageLevel

class ProxShiftRPlus(c: DVector) extends ProxCapableFunction[DVector] {

  if (c.getStorageLevel == StorageLevel.NONE) {
    c.cache()
  }

  override def apply(x: DVector, t: Double, mode: ProxMode): ProxValue[DVector] = {
    val minimizer = x.zipElements(c, (x_i, c_i) => math.max(0, x_i - t * c_i))
    // If both f and minimizer are requested, the minimizer will be read twice so cache it.
    if (mode.f && mode.minimizer) minimizer.cache()
    val f = if (mode.f) Some(c.dot(minimizer)) else None
    ProxValue(f, Some(minimizer))
  }

  override def apply(x: DVector): Double = {
    val rPlus = x.aggregateElements(0.0)(
      seqOp = (sum, x_i) => sum + (if (x_i < 0) Double.PositiveInfinity else 0.0),
      combOp = _ + _)
    if (rPlus.isPosInfinity) Double.PositiveInfinity else x.dot(c)
  }
}
Example 34
Source File: SmoothHuber.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.{ Mode, SmoothFunction, Value } import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class SmoothHuber(x0: DVector, tau: Double) extends SmoothFunction[DVector] { if (x0.getStorageLevel == StorageLevel.NONE) { x0.cache() } override def apply(x: DVector, mode: Mode): Value[DVector] = { val diff = x.diff(x0) val tau = this.tau // If both f and g are requested then diff will be read twice, so cache it. if (mode.f && mode.g) diff.cache() val f = if (mode.f) { // TODO If f is required but not g, then performance might be improved by reimplementing as // a single aggregate using 'x' and 'x0' without an intermediate 'diff' DVector, which breaks // per-element pipelining. Some(diff.aggregateElements(0.0)( seqOp = (sum, diff_i) => { // Find the Huber loss, corresponding to the adjusted l2 loss when the magnitude is <= tau // and to the adjusted l1 loss when the magnitude is > tau. val huberValue = if (math.abs(diff_i) <= tau) { 0.5 * diff_i * diff_i / tau } else { math.abs(diff_i) - tau / 2.0 } sum + huberValue }, combOp = _ + _)) } else { None } val g = if (mode.g) { // Compute the Huber loss gradient elementwise. Some(diff.mapElements(diff_i => diff_i / math.max(math.abs(diff_i), tau))) } else { None } Value(f, g) } }
Example 35
Source File: SmoothQuad.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.{ Mode, SmoothFunction, Value }
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.storage.StorageLevel

class SmoothQuad(x0: DVector) extends SmoothFunction[DVector] {

  if (x0.getStorageLevel == StorageLevel.NONE) {
    x0.cache()
  }

  override def apply(x: DVector, mode: Mode): Value[DVector] = {

    // Compute the squared error gradient (just the difference between vectors).
    val g = x.diff(x0)

    // If both f and g are requested then g will be read twice, so cache it.
    if (mode.f && mode.g) g.cache()

    val f = if (mode.f) {
      // Compute the squared error.
      // TODO If f is required but not g, then performance might be improved by reimplementing as
      // a single aggregate using 'x' and 'x0' without an intermediate 'g' DVector, which breaks
      // per-element pipelining.
      Some(g.aggregate(0.0)((sum, gPart) => sum + math.pow(Vectors.norm(gPart, 2), 2), _ + _) / 2.0)
    } else {
      None
    }

    Value(f, Some(g))
  }
}
Example 36
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter import com.amazonaws.services.kinesis.model.Record import com.google.gson.Gson import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} object Trials extends App { import org.apache.log4j.{Level, Logger} Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) //session setup System.setProperty("hadoop.home.dir", "C:\\winutils") val sparkSession = SparkSession.builder() .master("local[*]") .appName("test") .getOrCreate() val sc = sparkSession.sparkContext val ssc = new StreamingContext(sc, Seconds(10)) val sqlContext = sparkSession.sqlContext //creates an array of strings from raw byte array def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",") //converts records to map of key value pair and then json def recordHandler = (record: Record) => { val gson = new Gson val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage) gson.toJson(map) } case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String) val stream_cab = KinesisInputDStream.builder .streamingContext(ssc) .streamName("cab_rides") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) val stream_weather = KinesisInputDStream.builder .streamingContext(ssc) .streamName("weather") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) //creating dataframe, can be stored as temp view val cabSchema = Encoders.product[CabPrice].schema stream_cab.foreachRDD(rdd => { import sqlContext.implicits._ //val xx: Dataset[String] = rdd.toDS() val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS()) df.show() }) ssc.start() ssc.awaitTermination() }
Example 37
Source File: PersistStreamByInterval.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup import com.datastax.examples.meetup.model.MeetupRsvp import com.datastax.examples.meetup.model.EventInterval import com.datastax.examples.meetup.websocket._ import com.datastax.spark.connector._ import com.datastax.spark.connector.streaming._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, Minutes, StreamingContext} import org.apache.spark.streaming.StreamingContext._ class PersistStreamByInterval extends Serializable { val tableColumns = SomeColumns("event", "interval", "dimension", "subtotal") def start(ssc: StreamingContext, websocket: String, keyspace: String, table: String) { val stream = ssc.receiverStream[MeetupRsvp](new WebSocketReceiver(websocket, StorageLevel.MEMORY_ONLY_SER)) //stream.checkpoint(Seconds(60)) //stream.repartition(2) // Filter Accepted RSVP val rsvpAccepted = stream.filter(_.response == "yes") // Number of attendees by Country val rsvpByCountry = rsvpAccepted .map( rsvp => (rsvp.group.group_country, rsvp.guests + 1) ) .reduceByKey(_ + _) .map{ case (country, attendees) => ("attending", EventInterval.All, country, attendees) } rsvpByCountry.saveToCassandra(keyspace, table, tableColumns) // Trending Topics val trendingTopics = rsvpAccepted .flatMap( rsvp => rsvp.group.group_topics ) .map( topic => (topic.topic_name, 1) ) .reduceByKeyAndWindow((a:Int,b:Int) => a+b, Minutes(5), Seconds(10)) .filter( t => t._2 > 5 ) // min threshold = 5 .transform( (rdd, time) => rdd.map { case (topic, count) => ("trending", EventInterval.Seconds(time), topic, count)} ) trendingTopics.saveToCassandra(keyspace, table, tableColumns) ssc.start() ssc.awaitTermination() } }
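A minimal, hypothetical driver for the class above; the Cassandra host, WebSocket URL, keyspace and table names are placeholders rather than values from the project:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object PersistStreamByIntervalApp extends App {
  val conf = new SparkConf()
    .setAppName("meetup-rsvp-demo")
    .set("spark.cassandra.connection.host", "127.0.0.1")   // connector setting, placeholder host
  val ssc = new StreamingContext(new SparkContext(conf), Seconds(10))
  // start() blocks: it calls ssc.start() and ssc.awaitTermination() itself.
  new PersistStreamByInterval().start(ssc, "ws://stream.meetup.com/2/rsvps", "demo_ks", "demo_table")
}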
Example 38
Source File: WebSocketReceiver.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup.websocket

import com.datastax.examples.meetup.model._
import org.apache.spark.storage.StorageLevel
import scalawebsocket.WebSocket
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.Logging
import org.json4s._
import org.json4s.jackson.JsonMethods._

class WebSocketReceiver(url: String, storageLevel: StorageLevel)
  extends Receiver[MeetupRsvp](storageLevel) with Logging {

  @volatile private var webSocket: WebSocket = _

  def onStart() {
    try {
      logInfo("Connecting to WebSocket: " + url)
      val newWebSocket = WebSocket().open(url).onTextMessage({ msg: String => parseJson(msg) })
      setWebSocket(newWebSocket)
      logInfo("Connected to WebSocket: " + url)
    } catch {
      case e: Exception => restart("Error starting WebSocket stream", e)
    }
  }

  def onStop() {
    setWebSocket(null)
    logInfo("WebSocket receiver stopped")
  }

  private def setWebSocket(newWebSocket: WebSocket) = synchronized {
    if (webSocket != null) { webSocket.shutdown() }
    webSocket = newWebSocket
  }

  private def parseJson(jsonStr: String): Unit = {
    implicit lazy val formats = DefaultFormats
    try {
      val json = parse(jsonStr)
      val rsvp = json.extract[MeetupRsvp]
      store(rsvp)
    } catch {
      case e: MappingException => logError("Unable to map JSON message to MeetupRsvp object: " + e.msg)
      case e: Exception => logError("Unable to map JSON message to MeetupRsvp object")
    }
  }
}
Example 39
Source File: BHTSNE.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.impl import breeze.linalg._ import breeze.stats.distributions.Rand import com.github.saurfang.spark.tsne.tree.SPTree import com.github.saurfang.spark.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.storage.StorageLevel import org.slf4j.LoggerFactory import scala.util.Random object BHTSNE { private def logger = LoggerFactory.getLogger(BHTSNE.getClass) def tsne( input: RowMatrix, noDims: Int = 2, maxIterations: Int = 1000, perplexity: Double = 30, theta: Double = 0.5, reportLoss: Int => Boolean = {i => i % 10 == 0}, callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, seed: Long = Random.nextLong() ): DenseMatrix[Double] = { if(input.rows.getStorageLevel == StorageLevel.NONE) { logger.warn("Input is not persisted and performance could be bad") } Rand.generator.setSeed(seed) val tsneParam = TSNEParam() import tsneParam._ val n = input.numRows().toInt val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) :/ 1e4 val iY = DenseMatrix.zeros[Double](n, noDims) val gains = DenseMatrix.ones[Double](n, noDims) // approximate p_{j|i} val p_ji = X2P(input, 1e-5, perplexity) val P = TSNEHelper.computeP(p_ji, n).glom() .map(rows => rows.map { case (i, data) => (i, data.map(_._1).toSeq, DenseVector(data.map(_._2 * exaggeration_factor).toArray)) }) .cache() var iteration = 1 while(iteration <= maxIterations) { val bcY = P.context.broadcast(Y) val bcTree = P.context.broadcast(SPTree(Y)) val initialValue = (DenseMatrix.zeros[Double](n, noDims), DenseMatrix.zeros[Double](n, noDims), 0.0) val (posF, negF, sumQ) = P.treeAggregate(initialValue)( seqOp = (c, v) => { // c: (pos, neg, sumQ), v: Array[(i, Seq(j), vec(Distance))] TSNEGradient.computeEdgeForces(v, bcY.value, c._1) val q = TSNEGradient.computeNonEdgeForces(bcTree.value, bcY.value, theta, c._2, v.map(_._1): _*) (c._1, c._2, c._3 + q) }, combOp = (c1, c2) => { // c: (grad, loss) (c1._1 + c2._1, c1._2 + c2._2, c1._3 + c2._3) }) val dY: DenseMatrix[Double] = posF :- (negF :/ sumQ) TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) if(reportLoss(iteration)) { val loss = P.treeAggregate(0.0)( seqOp = (c, v) => { TSNEGradient.computeLoss(v, bcY.value, sumQ) }, combOp = _ + _ ) logger.debug(s"Iteration $iteration finished with $loss") callback(iteration, Y.copy, Some(loss)) } else { logger.debug(s"Iteration $iteration finished") callback(iteration, Y.copy, None) } bcY.destroy() bcTree.destroy() //undo early exaggeration if(iteration == early_exaggeration) { P.foreach { rows => rows.foreach { case (_, _, vec) => vec.foreachPair { case (i, v) => vec.update(i, v / exaggeration_factor) } } } } iteration += 1 } Y } }
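A hypothetical invocation of the method above. The caller is expected to persist the RowMatrix rows first, otherwise the warning at the top of tsne is logged; sc and the input path are assumptions for the sketch:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.storage.StorageLevel

// Parse a dense feature file and keep it persisted before running t-SNE.
val features = sc.textFile("data/features.csv")   // placeholder path
  .map(line => Vectors.dense(line.split(",").map(_.toDouble)))
  .persist(StorageLevel.MEMORY_AND_DISK)
val embedding = BHTSNE.tsne(new RowMatrix(features), noDims = 2, perplexity = 30)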
Example 40
Source File: SimpleTSNE.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.impl import breeze.linalg._ import breeze.stats.distributions.Rand import com.github.saurfang.spark.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.storage.StorageLevel import org.slf4j.LoggerFactory import scala.util.Random object SimpleTSNE { private def logger = LoggerFactory.getLogger(SimpleTSNE.getClass) def tsne( input: RowMatrix, noDims: Int = 2, maxIterations: Int = 1000, perplexity: Double = 30, callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, seed: Long = Random.nextLong()): DenseMatrix[Double] = { if(input.rows.getStorageLevel == StorageLevel.NONE) { logger.warn("Input is not persisted and performance could be bad") } Rand.generator.setSeed(seed) val tsneParam = TSNEParam() import tsneParam._ val n = input.numRows().toInt val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) val iY = DenseMatrix.zeros[Double](n, noDims) val gains = DenseMatrix.ones[Double](n, noDims) // approximate p_{j|i} val p_ji = X2P(input, 1e-5, perplexity) val P = TSNEHelper.computeP(p_ji, n).glom().cache() var iteration = 1 while(iteration <= maxIterations) { val bcY = P.context.broadcast(Y) val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() val bcNumerator = P.context.broadcast({ numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) }) val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( seqOp = (c, v) => { // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, iteration <= early_exaggeration) (c._1, c._2 + l) }, combOp = (c1, c2) => { // c: (grad, loss) (c1._1 + c2._1, c1._2 + c2._2) }) bcY.destroy() bcNumerator.destroy() numerator.unpersist() TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) logger.debug(s"Iteration $iteration finished with $loss") callback(iteration, Y.copy, Some(loss)) iteration += 1 } Y } }
Example 41
Source File: RedisInputDStream.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.streaming import com.redislabs.provider.redis.RedisConfig import org.apache.curator.utils.ThreadUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.dstream.ReceiverInputDStream import redis.clients.jedis._ import scala.reflect.{ClassTag, classTag} import scala.util.control.NonFatal keys.foreach{ key => executorPool.submit(new MessageHandler(redisConfig.connectionForKey(key), key)) } } finally { executorPool.shutdown() } } def onStop() { } private class MessageHandler(conn: Jedis, key: String) extends Runnable { def run() { try { while(!isStopped) { val response = conn.blpop(2, key) if (response == null || response.isEmpty) { // no-op } else if (classTag[T] == classTag[String]) { store(response.get(1).asInstanceOf[T]) } else if (classTag[T] == classTag[(String, String)]) { store((response.get(0), response.get(1)).asInstanceOf[T]) } else { throw new scala.Exception("Unknown Redis Streaming type") } } } catch { case NonFatal(e) => restart("Error receiving data", e) } finally { onStop() } } } }
Example 42
Source File: redisStreamingFunctions.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.streaming import com.redislabs.provider.redis.{ReadWriteConfig, RedisConfig} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream def createRedisStreamWithoutListname(keys: Array[String], storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2) (implicit redisConf: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): RedisInputDStream[String] = { new RedisInputDStream(ssc, keys, storageLevel, redisConf, classOf[String]) } def createRedisXStream(consumersConfig: Seq[ConsumerConfig], storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2) (implicit redisConfig: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): InputDStream[StreamItem] = { val readWriteConfig = ReadWriteConfig.fromSparkConf(ssc.sparkContext.getConf) val receiver = new RedisStreamReceiver(consumersConfig, redisConfig, readWriteConfig, storageLevel) ssc.receiverStream(receiver) } } trait RedisStreamingFunctions { implicit def toRedisStreamingContext(ssc: StreamingContext): RedisStreamingContext = new RedisStreamingContext(ssc) }
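A hedged usage sketch for the enriched streaming context shown above, assuming the implicit conversion is brought into scope by the project's streaming package; the key name, batch interval and sc are placeholders:

import com.redislabs.provider.redis.streaming._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(sc, Seconds(5))
// The implicit RedisConfig parameter falls back to the SparkConf-derived default.
val events = ssc.createRedisStreamWithoutListname(
  Array("queue:events"), StorageLevel.MEMORY_AND_DISK_2)
events.print()
ssc.start()
ssc.awaitTermination()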
Example 43
Source File: GraphLoader.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { logWarning("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
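A hypothetical call to the loader above that keeps both edge and vertex partitions at a disk-backed level; sc and the input path are placeholders:

val graph = GraphLoader.edgeListFile(
  sc,
  "hdfs:///data/edges.txt",
  canonicalOrientation = true,
  edgeStorageLevel = StorageLevel.MEMORY_AND_DISK,
  vertexStorageLevel = StorageLevel.MEMORY_AND_DISK)
println(s"Loaded ${graph.numEdges} edges and ${graph.numVertices} vertices")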
Example 44
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 45
Source File: GraphLoaderPlus.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } if (lineArray.length == 2) { val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } else { val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong val weight = lineArray(2).toInt if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, weight) } else { builder.add(srcId, dstId, weight) } } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoaderPlus.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
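Unlike the stock loader, GraphLoaderPlus also accepts an optional third whitespace-separated column as an integer edge weight. A hedged sketch of the expected input and call; sc and the path are placeholders:

// Edge list with optional weights, e.g.:
//   1 2
//   2 3 5
val weighted = GraphLoaderPlus.edgeListFile(sc, "hdfs:///data/weighted_edges.txt",
  edgeStorageLevel = StorageLevel.MEMORY_AND_DISK)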
Example 46
Source File: SparkStreamAdapterExample.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.example.spark //#spark-stream-adapter import com.rbmhtechnology.eventuate._ import com.rbmhtechnology.eventuate.adapter.spark.SparkStreamAdapter import org.apache.spark._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream //# import akka.actor._ import com.rbmhtechnology.eventuate.log.EventLogWriter import com.rbmhtechnology.eventuate.log.leveldb.LeveldbEventLog import scala.collection.immutable._ import scala.io.Source object SparkStreamAdapterExample extends App { implicit val system: ActorSystem = ActorSystem(ReplicationConnection.DefaultRemoteSystemName) val logName: String = "L" val endpoint: ReplicationEndpoint = new ReplicationEndpoint(id = "1", logNames = Set(logName), logFactory = logId => LeveldbEventLog.props(logId), connections = Set()) val log: ActorRef = endpoint.logs(logName) val writer: EventLogWriter = new EventLogWriter("writer", log) endpoint.activate() //#spark-stream-adapter val sparkConfig = new SparkConf(true) .setAppName("adapter") .setMaster("local[4]") val sparkContext = new SparkContext(sparkConfig) val sparkStreamingContext = new StreamingContext(sparkContext, Seconds(1)) // Create an Eventuate Spark stream adapter val sparkStreamAdapter = new SparkStreamAdapter( sparkStreamingContext, system.settings.config) // Create a DStream from event log L by connecting to its replication endpoint val stream: DStream[DurableEvent] = sparkStreamAdapter.eventStream( id = "s1", host = "127.0.0.1", port = 2552, logName = "L", fromSequenceNr = 1L, storageLevel = StorageLevel.MEMORY_ONLY) // For processing in strict event storage order, use repartition(1) stream.repartition(1).foreachRDD(rdd => rdd.foreach(println)) // Start event stream processing sparkStreamingContext.start() //# // Generate new events from stdin val lines = Source.stdin.getLines() def prompt(): Unit = { if (lines.hasNext) lines.next() match { case "exit" => sparkStreamingContext.stop(stopSparkContext = true) system.terminate() case line => writer.write(Seq(line)) prompt() } } prompt() }
Example 47
Source File: CustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { println("Connecting to " + host + ":" + port) socket = new Socket(host, port) println("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() println("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
Example 48
Source File: TFLArrivalPredictionsByLine.scala From Learning-Spark-SQL with MIT License | 5 votes |
import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.jfarcand.wcs.{TextListener, WebSocket} import scala.util.parsing.json.JSON import scalaj.http.Http import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; class TFLArrivalPredictionsByLine() extends Receiver[String](StorageLevel.MEMORY_ONLY) with Runnable { private val tflUrl = "https://api.tfl.gov.uk/Line/circle/Arrivals?stopPointId=940GZZLUERC&app_id=a73727f3&app_key=dc8150560a2422afae2b70cf291c4327" @transient private var thread: Thread = _ override def onStart(): Unit = { thread = new Thread(this) thread.start() } override def onStop(): Unit = { thread.interrupt() } override def run(): Unit = { while (true){ receive(); Thread.sleep(60*1000); } } private def receive(): Unit = { val httpClient = new DefaultHttpClient(); val getRequest = new HttpGet(tflUrl); getRequest.addHeader("accept", "application/json"); val response = httpClient.execute(getRequest); if (response.getStatusLine().getStatusCode() != 200) { throw new RuntimeException("Failed : HTTP error code : " + response.getStatusLine().getStatusCode()); } val br = new BufferedReader( new InputStreamReader((response.getEntity().getContent()))); var output=br.readLine(); while(output!=null){ println(output) output=br.readLine() } } }
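The receive loop above only prints each response line; to make the fetched JSON available downstream, a receiver would normally hand the lines to Spark with store(). A minimal, hedged variant of that loop using the same names from the snippet:

var output = br.readLine()
while (output != null) {
  store(output)          // forward each line into the DStream instead of printing it
  output = br.readLine()
}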
Example 49
Source File: TFLCustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object TFLCustomReceiver { private val url = "https://api.tfl.gov.uk/Line/circle/Arrivals?stopPointId=940GZZLUERC&app_id=a73727f3&app_key=dc8150560a2422afae2b70cf291c4327" def main(args: Array[String]) { // Create the context with a 1 second batch size val sparkConf = new SparkConf().setAppName("TFLCustomReceiver") val ssc = new StreamingContext(sparkConf, Seconds(300)) val lines = ssc.receiverStream(new TFLCustomReceiver(url)) lines.print() ssc.start() ssc.awaitTermination() } } class TFLCustomReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { def onStart() { // Start the thread that receives data over a connection new Thread("Http Receiver") { override def run() { receive() } }.start() } def onStop() { // There is nothing much to do as the thread calling receive() // is designed to stop by itself if isStopped() returns false } private def receive() { var userInput: String = null var httpClient: DefaultHttpClient = null var getRequest: HttpGet = null try { // Connect to host:port httpClient = new DefaultHttpClient(); getRequest = new HttpGet(url); getRequest.addHeader("accept", "application/json"); while(!isStopped) { val response = httpClient.execute(getRequest); if (response.getStatusLine().getStatusCode() != 200) { throw new RuntimeException("Failed : HTTP error code : "+ response.getStatusLine().getStatusCode()); } val reader = new BufferedReader(new InputStreamReader((response.getEntity().getContent()))); userInput = reader.readLine() while(userInput != null) { store(userInput) //println(userInput) userInput = reader.readLine() } reader.close() Thread.sleep(60*1000) } httpClient.close() // Restart in an attempt to connect again when server is active again //restart("Trying to connect again") } catch { case e: java.net.ConnectException => // restart if could not connect to server restart("Error connecting to " + url, e) case t: Throwable => // restart if there is any other error restart("Error receiving data", t) } } }
Example 50
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 51
Source File: Sessionize.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter06 import java.io._ import java.time.ZoneOffset import java.time.LocalDateTime import java.time.format.DateTimeFormatter import org.apache.spark.{SparkConf,SparkContext} import org.apache.spark.storage.StorageLevel object Sessionize extends App { val sc = new SparkContext("local[8]", "Sessionize", new SparkConf()) val checkoutPattern = ".*>checkout.*".r.pattern // a basic page view structure case class PageView(ts: String, path: String) extends Serializable with Ordered[PageView] { override def toString: String = { s"($ts #$path)" } def compare(other: PageView) = ts compare other.ts } // represent a session case class Session[A <: PageView](id: String, visits: Seq[A]) extends Serializable { override def toString: String = { val vsts = visits.mkString("[", ",", "]") s"($id -> $vsts)" } } def toEpochSeconds(str: String) = { LocalDateTime.parse(str, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")).toEpochSecond(ZoneOffset.UTC) } val sessions = sc.textFile("data/clickstream") .map(line => {val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20)))}) .groupByKey.map(x => { new Session(x._1, x._2.toSeq.sorted) } ) .cache // sessions.take(100).foreach(println) def findAllCheckoutSessions(s: Session[PageView]) = { s.visits.tails.filter { _ match { case PageView(ts1, "mycompanycom>homepage") :: PageView(ts2, page) :: tail if (page != "mycompanycom>homepage" ) => true; case _ => false } } .foldLeft(Seq[Session[PageView]]()) { case (r, x) => { x.find(y => checkoutPattern.matcher(y.path).matches) match { case Some(checkout) if (toEpochSeconds(checkout.ts) > toEpochSeconds(x.head.ts) + 60) => r.:+(new Session(s.id, x.slice(0, x.indexOf(checkout)))) case _ => r } } } } val prodLandingSessions = sessions.flatMap(findAllCheckoutSessions) prodLandingSessions.collect.foreach(println) sc.stop() }
Example 52
Source File: FlumeWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.flume._ object FlumeWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/flume_check") val hostPort=args(0).split(":") System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]") val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY) val words = lines .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
Example 53
Source File: KafkaWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.kafka._ object KafkaWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/kafka_check") System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example") val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY) val words = lines .flatMap(_._2.toLowerCase.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
Example 54
Source File: ReUseWithCheckpoint.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.scalatest.FunSuite class ReUseWithCheckpoint extends FunSuite { private val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val checkpointEnabled = true private val storageLevel = StorageLevel.MEMORY_AND_DISK test("should use checkpoint for re-usability of RDD") { //given val sortedRDD = spark.makeRDD(List(1, 2, 5, 77, 888)) if (storageLevel != StorageLevel.NONE) { sortedRDD.persist(storageLevel) } if (checkpointEnabled) { sortedRDD.sparkContext.setCheckpointDir("hdfs://tmp/checkpoint") sortedRDD.checkpoint() } //when performALotOfExpensiveComputations(sortedRDD) //then sortedRDD.collect().toList } def performALotOfExpensiveComputations(sortedRDD: RDD[Int]): Unit = { //.... sortedRDD.count() //failure sortedRDD.collect() } }
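The test combines persist and checkpoint. A short sketch of why that order is used; the checkpoint directory and storage level are placeholders:

// Persist first so writing the checkpoint does not recompute the lineage,
// materialize once, then the cached copy can be released: later actions
// read from the checkpoint files rather than recomputing.
sortedRDD.persist(StorageLevel.MEMORY_AND_DISK)
sortedRDD.sparkContext.setCheckpointDir("/tmp/checkpoint")
sortedRDD.checkpoint()
sortedRDD.count()        // runs the lineage and writes the checkpoint
sortedRDD.unpersist()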
Example 55
Source File: StressReceiver.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.apache.log4j.Logger class StressReceiver[T]( index: Int, rowGenerator: RowGenerator[T], config: Config, blockIntervalInMs: Int, storageLevel: StorageLevel) extends Receiver[T](storageLevel) { class EmitterThread(receiver: StressReceiver[_]) extends Thread(s"Emitter$index") { override def run(): Unit = { val rowIterator = rowGenerator.generatePartition(config.seed, index) val throughPutPerBlockInterval = (blockIntervalInMs / (config.streamingBatchIntervalSeconds * 1000.0) * config.receiverThroughputPerBatch).toLong while (rowIterator.hasNext) { val batchBegin = System.currentTimeMillis() for (x <- 1l to throughPutPerBlockInterval if rowIterator.hasNext) { store(rowIterator.next()) } val batchEnd = System.currentTimeMillis() val napTime = blockIntervalInMs - (batchEnd - batchBegin) if (napTime > 0) Thread.sleep(napTime) } receiver.stop("Iterator Empty") } } def onStart() = { new EmitterThread(this).start() } def onStop() = { } }
Example 56
Source File: StreamingTask.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress import java.util.concurrent.TimeUnit import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming._ import com.datastax.sparkstress.RowGenerator.PerfRowGenerator import com.datastax.sparkstress.RowTypes._ import com.datastax.sparkstress.SparkStressImplicits._ import com.datastax.sparkstress.StressTask._ import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{StreamingContext, _} import scala.reflect.ClassTag abstract class StreamingTask[rowType]( val config: Config, val ss: SparkSession) (implicit ct:ClassTag[rowType]) extends StressTask { val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds)) val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch) val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10 val terminationTime: Long = { if (config.terminationTimeMinutes == 0) { estimatedReqRuntime } else { val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes) if (estimatedReqRuntime <= newTerminationTime) { println(s"Using the estimated runtime (${estimatedReqRuntime} secs}) required to stream ${config.totalOps} since it is <= the requested runtime (${newTerminationTime} secs).") estimatedReqRuntime } else { println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.") newTerminationTime } } } def setupCQL() = { val cc = CassandraConnector(ss.sparkContext.getConf) cc.withSessionDo { session => if (config.deleteKeyspace) { println(s"Destroying Keyspace") session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}") } val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor) val tbcql = getTableCql(config.table) println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""") session.execute(kscql) session.execute(s"USE ${config.keyspace}") for (cql <- tbcql) session.execute(cql) } printf("Done Setting up CQL Keyspace/Table\n") } def getTableCql(tbName: String): Seq[String] override def getGenerator: RowGenerator[PerfRowClass] = generator override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table) }
Example 57
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.mqtt.MQTTUtils object YearlyDistributionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>") System.exit(1) } val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => rec.split(",")) .map(rec => (rec(1).split(" ")(0), 1)) .updateStateByKey(statefulCount) .map(pair => (pair._2, pair._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("YearlyDistribution") ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 58
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 59
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 60
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import com.ning.http.client.AsyncCompletionHandler import com.ning.http.client.AsyncHttpClient import com.ning.http.client.Response class HttpInputDStreamAsync( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiverAsync(storageLevel, url) } } class HttpReceiverAsync( storageLevel: StorageLevel, url: String) extends Receiver[String](storageLevel) with Logging { var asyncHttpClient: AsyncHttpClient = _ def onStop() { asyncHttpClient.close() logInfo("Disconnected from Http Server") } def onStart() { asyncHttpClient = new AsyncHttpClient() asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { override def onCompleted(response: Response): Response = { store(response.getResponseBody) return response } override def onThrowable(t: Throwable) { restart("Error! Problems while connecting", t) } }); logInfo("Http Connection initiated") } } object HttpUtilsAsync { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String): DStream[String] = { new HttpInputDStreamAsync(ssc, storageLevel, url) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url) } }
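A hypothetical use of the helper object above; ssc is an existing StreamingContext and the URL is a placeholder:

val responses = HttpUtilsAsync.createStream(ssc, url = "https://example.org/stream")
responses.print()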
Example 61
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 62
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 63
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 64
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
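A hypothetical polling stream built with the helper above, fetching the endpoint every 30 seconds at the default MEMORY_AND_DISK_SER_2 level; ssc and the URL are placeholders:

val feed = HttpUtils.createStream(ssc, url = "https://example.org/feed.json", interval = 30)
feed.print()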
Example 65
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 66
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
Example 67
Source File: KmeansTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mllib import org.apache.log4j.{Level, Logger} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import org.scalaml.{Logging, Resource} import org.scalaml.Predef._ import org.scalaml.stats.TSeries._ import org.scalaml.trading.YahooFinancials import org.scalaml.workflow.data.DataSource import org.scalatest.FunSuite import org.scalatest.concurrent.ScalaFutures import scala.concurrent.Future final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource { import scala.concurrent.ExecutionContext.Implicits.global protected[this] val name = "Spark MLlib K-Means" private val K = 8 private val NRUNS = 4 private val MAXITERS = 60 private val PATH = "spark/CSCO.csv" private val CACHE = false test(s"$name evaluation") { show(s"Evaluation") Logger.getRootLogger.setLevel(Level.ERROR) // The Spark configuration has to be customize to your environment val sparkConf = new SparkConf().setMaster("local") .setAppName("Kmeans") .set("spark.executor.memory", "4096m") implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file val kmeanClustering: Option[Kmeans] = extract.map(input => { val volatilityVol = zipToSeries(input._1, input._2).take(500) val config = new KmeansConfig(K, MAXITERS, NRUNS) val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY) Kmeans(config, rddConfig, volatilityVol) }) // Wraps into a future to enforce time out in case of a straggler val ft = Future[Boolean] { predict(kmeanClustering) } whenReady(ft) { result => assert(result) } sc.stop } private def predict(kmeanClustering: Option[Kmeans]): Boolean = { kmeanClustering.map(kmeansCluster => { val obs = Array[Double](0.1, 0.9) val clusterId1 = kmeansCluster |> obs show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1") val obs2 = Array[Double](0.56, 0.11) val clusterId2 = kmeansCluster |> obs2 val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2" show(s"$name result: $result") }) true } private def extract: Option[(DblVec, DblVec)] = { import scala.util._ val extractors = List[Array[String] => Double]( YahooFinancials.volatility, YahooFinancials.volume ) DataSource(getPath(PATH).get, true).map(_.|>) match { case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption case Failure(e) => failureHandler(e) None } } } // --------------------------------- EOF -------------------------------------------------
Example 68
Source File: InputTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.sdk.pipeline.input

import org.apache.spark.storage.StorageLevel
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class InputTest extends WordSpec with Matchers {

  "Input" should {
    val input = new InputMock(Map())
    val expected = StorageLevel.DISK_ONLY
    val result = input.storageLevel("DISK_ONLY")
    "Return the associated storageLevel" in {
      result should be(expected)
    }
  }

  "classSuffix must be " in {
    val expected = "Input"
    val result = Input.ClassSuffix
    result should be(expected)
  }
}
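The test exercises a string-to-StorageLevel lookup. A hedged sketch of how such a mapping is commonly written with Spark's own parser (this is not necessarily what InputMock does internally):

import org.apache.spark.storage.StorageLevel

def storageLevelFrom(name: String): StorageLevel = StorageLevel.fromString(name)
// storageLevelFrom("DISK_ONLY") == StorageLevel.DISK_ONLY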
Example 69
Source File: WebSocketReceiver.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.websocket

import akka.event.slf4j.SLF4JLogging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class WebSocketReceiver(url: String, storageLevel: StorageLevel)
  extends Receiver[String](storageLevel) with SLF4JLogging {

  private var webSocket: Option[WebSocket] = None

  def onStart() {
    try {
      log.info("Connecting to WebSocket: " + url)
      val newWebSocket = WebSocket().open(url)
        .onTextMessage({ msg: String => store(msg) })
        .onBinaryMessage({ msg: Array[Byte] => store(new Predef.String(msg)) })
      setWebSocket(Option(newWebSocket))
      log.info("Connected to WebSocket: " + url)
    } catch {
      case e: Exception => restart("Error starting WebSocket stream", e)
    }
  }

  def onStop() {
    setWebSocket()
    log.info("WebSocket receiver stopped")
  }

  private def setWebSocket(newWebSocket: Option[WebSocket] = None) = synchronized {
    if (webSocket.isDefined) webSocket.get.shutdown()
    webSocket = newWebSocket
  }
}
Example 70
Source File: TwitterDataSource.scala From opencpu-spark-executor with Apache License 2.0 | 5 votes |
package io.onetapbeyond.opencpu.spark.executor.examples import scala.util.Random import scala.collection.JavaConverters._ import org.apache.spark.storage.StorageLevel object TwitterDataSource { def build():List[String] = { List("Amazing performance by Bolt.", "The accident left many people injured.", "Great news!", "Another bad hair day for Trump!", "Big winner in this weeks lottery.", "Yet another losing day at the races.", "Beautiful photos from space show world in best light.", "The worst winter I ever spent was a summer in San Francisco.") } }
Example 71
Source File: CustomReceiver.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
Example 72
Source File: FlumeEventCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.flume._ import org.apache.spark.util.IntParam object FlumeEventCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: FlumeEventCount <host> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) // Create the context and set the batch size val sparkConf = new SparkConf().setAppName("FlumeEventCount") val ssc = new StreamingContext(sparkConf, batchInterval) // Create a flume stream val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2) // Print out the count of events received from this server in each batch stream.count().map(cnt => "Received " + cnt + " flume events." ).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 73
Source File: RawNetworkGrep.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.util.IntParam object RawNetworkGrep { def main(args: Array[String]) { if (args.length != 4) { System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args val sparkConf = new SparkConf().setAppName("RawNetworkGrep") // Create the context val ssc = new StreamingContext(sparkConf, Duration(batchMillis)) val rawStreams = (1 to numStreams).map(_ => ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => println("Grep count: " + r.collect().mkString)) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 74
Source File: SqlNetworkWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 75
Source File: NetworkWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in the input stream of \n delimited text (eg. generated by 'nc').
    // Note that a storage level without replication is sufficient only when running locally;
    // replication is necessary in a distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
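The comment in the example distinguishes a local run from a cluster deployment. A short sketch of the two choices, assuming ssc, host and port are defined as in the example (the _2 suffix on a StorageLevel constant means received blocks are replicated to a second executor):

// Local testing: a single serialized copy in memory/on disk is enough
val localLines = ssc.socketTextStream(host, port, StorageLevel.MEMORY_AND_DISK_SER)

// Cluster deployment: replicate receiver blocks so a lost executor does not lose data
val replicatedLines = ssc.socketTextStream(host, port, StorageLevel.MEMORY_AND_DISK_SER_2)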
Example 76
Source File: GraphLoader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.SparkContext import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
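edgeListFile defaults both storage levels to MEMORY_ONLY. A sketch of overriding them for graphs that may not fit in memory, assuming a live SparkContext sc (the input path is hypothetical):

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.storage.StorageLevel

val graph = GraphLoader.edgeListFile(
  sc,
  "hdfs:///data/followers.txt",                      // hypothetical edge-list path
  edgeStorageLevel = StorageLevel.MEMORY_AND_DISK,   // spill edge partitions to disk when memory is tight
  vertexStorageLevel = StorageLevel.MEMORY_AND_DISK)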
Example 77
Source File: EdgeRDDImpl.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 78
Source File: EdgeRDDSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.SparkFunSuite import org.apache.spark.storage.StorageLevel class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext { test("cache, getStorageLevel") { // test to see if getStorageLevel returns correct value after caching withSpark { sc => val verts = sc.parallelize(List((0L, 0), (1L, 1), (1L, 2), (2L, 3), (2L, 3), (2L, 3))) val edges = EdgeRDD.fromEdges(sc.parallelize(List.empty[Edge[Int]])) assert(edges.getStorageLevel == StorageLevel.NONE) edges.cache() assert(edges.getStorageLevel == StorageLevel.MEMORY_ONLY) } } }
Example 79
Source File: PeriodicGraphCheckpointer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
Example 80
Source File: KinesisInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 81
Source File: KafkaStreamSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() stream.map(_._2).countByValue().foreachRDD { r => r.collect().foreach { kv => result.synchronized { val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(result.synchronized { sent === result }) } } }
Example 82
Source File: FlumeStreamSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 83
Source File: DatasetCacheSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.storage.StorageLevel class DatasetCacheSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("get storage level") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") // default storage level ds1.persist() ds2.cache() assert(ds1.storageLevel == StorageLevel.MEMORY_AND_DISK) assert(ds2.storageLevel == StorageLevel.MEMORY_AND_DISK) // unpersist ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE) // non-default storage level ds1.persist(StorageLevel.MEMORY_ONLY_2) assert(ds1.storageLevel == StorageLevel.MEMORY_ONLY_2) // joined Dataset should not be persisted val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") assert(joined.storageLevel == StorageLevel.NONE) } test("persist and unpersist") { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int]) val cached = ds.cache() // count triggers the caching action. It should not throw. cached.count() // Make sure, the Dataset is indeed cached. assertCached(cached) // Check result. checkDataset( cached, 2, 3, 4) // Drop the cache. cached.unpersist() assert(cached.storageLevel == StorageLevel.NONE, "The Dataset should not be cached.") } test("persist and then rebind right encoder when join 2 datasets") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") ds1.persist() assertCached(ds1) ds2.persist() assertCached(ds2) val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") checkDataset(joined, ("2", 2)) assertCached(joined, 2) ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE, "The Dataset ds1 should not be cached.") ds2.unpersist() assert(ds2.storageLevel == StorageLevel.NONE, "The Dataset ds2 should not be cached.") } test("persist and then groupBy columns asKey, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(_._1) val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) } agged.persist() checkDataset( agged.filter(_._1 == "b"), ("b", 3)) assertCached(agged.filter(_._1 == "b")) ds.unpersist() assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.") agged.unpersist() assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } }
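As the suite shows, persist() and cache() on a Dataset default to MEMORY_AND_DISK, and storageLevel reports NONE once the data is unpersisted. A minimal sketch with an explicit, serialized level, assuming an active SparkSession with spark.implicits._ in scope:

import org.apache.spark.storage.StorageLevel

val ds = Seq("a", "b", "c").toDS()
ds.persist(StorageLevel.MEMORY_ONLY_SER)             // explicit serialized caching, no replication
assert(ds.storageLevel == StorageLevel.MEMORY_ONLY_SER)
ds.unpersist()
assert(ds.storageLevel == StorageLevel.NONE)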
Example 84
Source File: WindowedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.Duration private[streaming] class WindowedDStream[T: ClassTag]( parent: DStream[T], _windowDuration: Duration, _slideDuration: Duration) extends DStream[T](parent.ssc) { if (!_windowDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The window duration of windowed DStream (" + _windowDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } if (!_slideDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The slide duration of windowed DStream (" + _slideDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } // Persist parent level by default, as those RDDs are going to be obviously reused. parent.persist(StorageLevel.MEMORY_ONLY_SER) def windowDuration: Duration = _windowDuration override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = _slideDuration override def parentRememberDuration: Duration = rememberDuration + windowDuration override def persist(level: StorageLevel): DStream[T] = { // Do not let this windowed DStream be persisted as windowed (union-ed) RDDs share underlying // RDDs and persisting the windowed RDDs would store numerous copies of the underlying data. // Instead control the persistence of the parent DStream. parent.persist(level) this } override def compute(validTime: Time): Option[RDD[T]] = { val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime) val rddsInWindow = parent.slice(currentWindow) Some(ssc.sc.union(rddsInWindow)) } }
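Because of the persist override above, asking a windowed stream to persist redirects the request to its parent rather than storing the overlapping window RDDs themselves. A sketch of how that looks from user code, assuming an existing StreamingContext ssc and the usual streaming imports:

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds

val lines = ssc.socketTextStream("localhost", 9999)
val windowed = lines.window(Seconds(30), Seconds(10))
windowed.persist(StorageLevel.MEMORY_AND_DISK_SER)   // effectively persists `lines`, not the window RDDs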
Example 85
Source File: SocketInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io._ import java.net.{ConnectException, Socket} import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.NextIterator private[streaming] class SocketInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](_ssc) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { private var socket: Socket = _ def onStart() { logInfo(s"Connecting to $host:$port") try { socket = new Socket(host, port) } catch { case e: ConnectException => restart(s"Error connecting to $host:$port", e) return } logInfo(s"Connected to $host:$port") // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // in case restart thread close it twice synchronized { if (socket != null) { socket.close() socket = null logInfo(s"Closed socket to $host:$port") } } } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8)) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
Example 86
Source File: BlockTransferService.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 87
Source File: NettyBlockRpcServer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 88
Source File: SparkContextInfoSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark import org.scalatest.Assertions import org.apache.spark.storage.StorageLevel class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { test("getPersistentRDDs only returns RDDs that are marked as cached") { sc = new SparkContext("local", "test") assert(sc.getPersistentRDDs.isEmpty === true) val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(sc.getPersistentRDDs.isEmpty === true) rdd.cache() assert(sc.getPersistentRDDs.size === 1) assert(sc.getPersistentRDDs.values.head === rdd) } test("getPersistentRDDs returns an immutable map") { sc = new SparkContext("local", "test") val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() val myRdds = sc.getPersistentRDDs assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) // myRdds2 should have 2 RDDs, but myRdds should not change val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache() val myRdds2 = sc.getPersistentRDDs assert(myRdds2.size === 2) assert(myRdds2(0) === rdd1) assert(myRdds2(1) === rdd2) assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) } test("getRDDStorageInfo only reports on RDDs that actually persist data") { sc = new SparkContext("local", "test") val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.size === 0) rdd.collect() assert(sc.getRDDStorageInfo.size === 1) assert(sc.getRDDStorageInfo.head.isCached) assert(sc.getRDDStorageInfo.head.memSize > 0) assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY) } test("call sites report correct locations") { sc = new SparkContext("local", "test") testPackage.runCallSiteTest(sc) } } package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "makeRDD") assert(file === "SparkContextInfoSuite.scala") line.toInt case _ => fail("Did not match expected call site format") } curCallSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "getCallSite") // this is correct because we called it from outside of Spark assert(file === "SparkContextInfoSuite.scala") assert(line.toInt === rddCreationLine.toInt + 2) case _ => fail("Did not match expected call site format") } } }
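The same bookkeeping applies to explicitly chosen levels: getStorageLevel reflects whatever was passed to persist, and the RDD appears in getPersistentRDDs keyed by its id. A small sketch, assuming a live SparkContext sc:

import org.apache.spark.storage.StorageLevel

val rdd = sc.parallelize(1 to 100)
rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)
assert(rdd.getStorageLevel == StorageLevel.MEMORY_AND_DISK_SER)
assert(sc.getPersistentRDDs.contains(rdd.id))
rdd.unpersist()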
Example 89
Source File: BlockTransferService.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
Example 90
Source File: NettyBlockRpcServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 91
Source File: SparkContextInfoSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark import org.scalatest.{Assertions, FunSuite} import org.apache.spark.storage.StorageLevel class SparkContextInfoSuite extends FunSuite with LocalSparkContext { test("getPersistentRDDs only returns RDDs that are marked as cached") { sc = new SparkContext("local", "test") assert(sc.getPersistentRDDs.isEmpty === true) val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(sc.getPersistentRDDs.isEmpty === true) rdd.cache() assert(sc.getPersistentRDDs.size === 1) assert(sc.getPersistentRDDs.values.head === rdd) } test("getPersistentRDDs returns an immutable map") { sc = new SparkContext("local", "test") val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() val myRdds = sc.getPersistentRDDs assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) // myRdds2 should have 2 RDDs, but myRdds should not change val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache() val myRdds2 = sc.getPersistentRDDs assert(myRdds2.size === 2) assert(myRdds2(0) === rdd1) assert(myRdds2(1) === rdd2) assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) } test("getRDDStorageInfo only reports on RDDs that actually persist data") { sc = new SparkContext("local", "test") val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.size === 0) rdd.collect() assert(sc.getRDDStorageInfo.size === 1) assert(sc.getRDDStorageInfo.head.isCached) assert(sc.getRDDStorageInfo.head.memSize > 0) assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY) } test("call sites report correct locations") { sc = new SparkContext("local", "test") testPackage.runCallSiteTest(sc) } } package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => { assert(func === "makeRDD") assert(file === "SparkContextInfoSuite.scala") line.toInt } case _ => fail("Did not match expected call site format") } curCallSite match { case CALL_SITE_REGEX(func, file, line) => { assert(func === "getCallSite") // this is correct because we called it from outside of Spark assert(file === "SparkContextInfoSuite.scala") assert(line.toInt === rddCreationLine.toInt + 2) } case _ => fail("Did not match expected call site format") } } }
Example 92
Source File: BandingCollisionStrategy.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.collision import scala.util.hashing.MurmurHash3 import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import com.github.karlhigley.spark.neighbors.lsh.{ BitSignature, HashTableEntry, IntSignature } def apply(hashTables: RDD[_ <: HashTableEntry[_]]): RDD[(Product, Point)] = { val bandEntries = hashTables.flatMap(entry => { val elements = entry.sigElements val banded = elements.grouped(elements.size / bands).zipWithIndex banded.map { case (bandSig, bandNum) => { // Arrays are mutable and can't be used in RDD keys // Use a hash value (i.e. an int) as a substitute val bandSigHash = MurmurHash3.arrayHash(bandSig) val key = (entry.table, bandNum, bandSigHash).asInstanceOf[Product] (key, (entry.id, entry.point)) } } }) bandEntries } }
Example 93
Source File: SimpleCollisionStrategy.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.collision import scala.util.hashing.MurmurHash3 import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import com.github.karlhigley.spark.neighbors.lsh.{ BitSignature, HashTableEntry, IntSignature } def apply(hashTables: RDD[_ <: HashTableEntry[_]]): RDD[(Product, Point)] = { val entries = hashTables.map(entry => { // Arrays are mutable and can't be used in RDD keys // Use a hash value (i.e. an int) as a substitute val key = (entry.table, MurmurHash3.arrayHash(entry.sigElements)).asInstanceOf[Product] (key, (entry.id, entry.point)) }) entries } }
Example 94
Source File: SparkTest.scala From spark-records with Apache License 2.0 | 5 votes |
package examples.fancy_numbers import com.swoop.spark.records._ import com.swoop.spark.test.SparkSqlSpec import org.apache.spark.sql.Dataset import org.apache.spark.storage.StorageLevel class SparkTest extends ExampleSpec with SparkSqlSpec with TestNegative5To100 { lazy val dc = SimpleDriverContext(sc) lazy val jc = dc.jobContext(SimpleJobContext) lazy val ds = recordsDataset(-5 to 100, jc) lazy val records = ds.collect "in an integration test" - { implicit val env = FlatRecordEnvironment() val sqlContext = sqlc import sqlContext.implicits._ behave like fancyRecordBuilder(records, jc) "should build records with Spark" in { ds.count should be(105) } "should filter error records" in { ds.errorRecords.count should be(6) } "should extract data from records" in { ds.recordData.count should be(99) } "should extract issues" in { ds.allIssues.count should be(8) ds.errorIssues.count should be(6) } "should demonstrate issueCounts() output" in { ds.issueCounts.show(false) } "should demonstrate errorIssueCounts() output" in { ds.errorIssueCounts.show(false) } "should demonstrate messageCounts() output" in { ds.messageCounts.show(false) } "should demonstrate errorMessageCounts() output" in { ds.errorMessageCounts.show(false) } "should demonstrate errorDetailCounts() output" in { ds.errorIssues.errorDetailCounts().show } "should demonstrate unknownErrorDetailCounts() output" in { ds.errorIssues.unknownErrorDetailCounts("examples.fancy_numbers").show } "should demonstrate errorDetails() output" in { ds.errorIssues.errorDetails().show } "should demonstrate unknownErrorDetails() output" in { ds.errorIssues.unknownErrorDetails("examples.fancy_numbers").show } } def recordsDataset(numbers: Seq[Int], jc: JobContext): Dataset[FancyNumberRecord] = { val sqlContext = sqlc import sqlContext.implicits._ sqlc.createDataset(numbers) .mapPartitions(inputs => Example.buildRecords(inputs, jc)) .persist(StorageLevel.MEMORY_ONLY) } }
Example 95
Source File: CachingData_7_5.scala From LearningSparkV2 with Apache License 2.0 | 5 votes |
package main.scala.chapter7 import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel object CachingData_7_5 { def printConfig(session: SparkSession, key:String) = { // get conf val v = session.conf.getOption(key) println(s"${key} -> ${v}\n") } def timer[A](blockOfCode: => A) = { val startTime = System.nanoTime val result = blockOfCode val endTime = System.nanoTime val delta = endTime - startTime (result, delta/1000000d) } def main(args: Array[String]) { // create a session val spark = SparkSession.builder .master("local[*]") .appName("CachingData") .getOrCreate() import spark.implicits._ printConfig(spark, "\"spark.sql.join.preferSortMergeJoin\"") val df = spark.range(1 * 10000000).toDF("id").withColumn("square", $"id" * $"id") // cache DataFrame df.cache() val (res, tm) = timer(df.count()) println(s"***** Count=${res} and time=${tm}") println("***** Get the second time around") val (res2, tm2) = timer(df.count()) println(s"***** Count=${res2} and time=${tm2}") // Persist on Disk //df.persist(StorageLevel.MEMORY_ONLY) df.persist(StorageLevel.DISK_ONLY) val (res3, tm3) = timer(df.count()) println(s"***** Count=${res3} and time=${tm3}") // create an temporary SQL view df.createOrReplaceTempView("dfTable") spark.sql("cache table dfTable") val (res4, tm4) = timer(spark.sql("select count(*) from dfTable").show()) println(s"***** Count=${res4} and time=${tm4}") // uncomment to view the SparkUI otherwise the program terminates and shutdowsn the UI // Thread.sleep(200000000) // unpersist df.unpersist() // } }
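The example caches the temporary view with a CACHE TABLE statement, which uses the default storage level. Since Spark 2.3 the catalog API also accepts an explicit StorageLevel; a hedged sketch of that variant, reusing the spark session, the StorageLevel import, and the dfTable view from the example above:

spark.catalog.cacheTable("dfTable", StorageLevel.DISK_ONLY)   // explicit level via the catalog API
spark.sql("SELECT count(*) FROM dfTable").show()
spark.catalog.uncacheTable("dfTable")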
Example 96
Source File: SparkBlockedVector.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.algebra import breeze.linalg.{DenseVector, NumericOps} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.immutable.NumericRange def vertcat(vectors: SparkBlockedVector*): SparkBlockedVector = { //sanity check assert(vectors.map(_.colBlocks).distinct.length == 1, "In case of vertical concatenation of matrices their columns sizes must be equal") val sizes = vectors.map(_.rowBlocks) new SparkBlockedVector(vectors.zipWithIndex.map(couple => { val offset = sizes.slice(0, couple._2).sum couple._1._data.map(c => (c._1+offset, c._2)) }).reduceLeft((a,b) => a.union(b))) } }
Example 97
Source File: AMQPInputDStream.scala From streaming-amqp with Apache License 2.0 | 5 votes |
package io.radanalytics.streaming.amqp import org.apache.qpid.proton.message.Message import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.amqp.ReliableAMQPReceiver import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import scala.reflect.ClassTag class AMQPInputDStream[T: ClassTag]( ssc: StreamingContext, host: String, port: Int, username: Option[String], password: Option[String], address: String, messageConverter: Message => Option[T], useReliableReceiver: Boolean, storageLevel: StorageLevel ) extends ReceiverInputDStream[T](ssc) { def getReceiver(): Receiver[T] = { if (!useReliableReceiver) { new AMQPReceiver(host, port, username, password, address, messageConverter, storageLevel) } else { new ReliableAMQPReceiver(host, port, username, password, address, messageConverter, storageLevel) } } }
Example 98
Source File: AMQPServerStreamSuite.scala From streaming-amqp with Apache License 2.0 | 5 votes |
package io.radanalytics.streaming.amqp import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.amqp.AMQPUtils import org.apache.spark.streaming.{Duration, Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkFunSuite} import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually import scala.concurrent.duration._ class AMQPServerStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter { private val batchDuration: Duration = Seconds(1) private val master: String = "local[2]" private val appName: String = this.getClass().getSimpleName() private val address: String = "my_address" private val checkpointDir: String = "/tmp/spark-streaming-amqp-tests" private var conf: SparkConf = _ private var ssc: StreamingContext = _ private var amqpTestUtils: AMQPTestUtils = _ before { conf = new SparkConf().setMaster(master).setAppName(appName) conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") ssc = new StreamingContext(conf, batchDuration) ssc.checkpoint(checkpointDir) amqpTestUtils = new AMQPTestUtils() amqpTestUtils.setup() } after { if (ssc != null) { ssc.stop() } if (amqpTestUtils != null) { amqpTestUtils.teardown() } } test("AMQP receive server") { val sendMessage = "Spark Streaming & AMQP" val max = 10 val delay = 100l amqpTestUtils.startAMQPServer(sendMessage, max, delay) val converter = new AMQPBodyFunction[String] val receiveStream = AMQPUtils.createStream(ssc, amqpTestUtils.host, amqpTestUtils.port, amqpTestUtils.username, amqpTestUtils.password, address, converter, StorageLevel.MEMORY_ONLY) var receivedMessage: List[String] = List() receiveStream.foreachRDD(rdd => { if (!rdd.isEmpty()) { receivedMessage = receivedMessage ::: rdd.collect().toList } }) ssc.start() eventually(timeout(10000 milliseconds), interval(1000 milliseconds)) { assert(receivedMessage.length == max) } ssc.stop() amqpTestUtils.stopAMQPServer() } }
Example 99
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{
  Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV,
  DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd,
  max => Bmax, min => Bmin, sum => Bsum
}
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    // 1 Create the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2 Load the sample data
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))

    // 3 Set the training parameters and build the model
    // opts: batch size, number of epochs, cross-validation fraction
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4 Test the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }

}
Example 100
Source File: DeltaLoad.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo import com.adidas.analytics.algo.DeltaLoad._ import com.adidas.analytics.algo.core.Algorithm import com.adidas.analytics.algo.shared.DateComponentDerivation import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration import com.adidas.analytics.util.DataFrameUtils._ import com.adidas.analytics.util._ import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.{Logger, LoggerFactory} private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = { // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records) val partitionWindow = Window .partitionBy(businessKey.map(col): _*) .orderBy(technicalKey.map(component => col(component).desc): _*) // Ranking & projection val rankedDeltaRecords = deltaRecords .withColumn(rankingColumnName, row_number().over(partitionWindow)) .filter(upsertRecordsModesFilterFunction) rankedDeltaRecords .filter(rankedDeltaRecords(rankingColumnName) === 1) .selectExpr(resultColumns: _*) } protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = { logger.info("Adding partitioning information if needed") try { dataFrames.map { df => if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){ df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions)) } else df } } catch { case e: Throwable => logger.error("Cannot add partitioning information for data frames.", e) //TODO: Handle failure case properly throw new RuntimeException("Unable to transform data frames.", e) } } } object DeltaLoad { private val logger: Logger = LoggerFactory.getLogger(getClass) def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = { new DeltaLoad(spark, dfs, configLocation) } }
Example 101
Source File: AkkaUtilsSuite.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.akka import scala.concurrent.duration._ import akka.actor.{Props, SupervisorStrategy} import org.apache.spark.SparkFunSuite import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class AkkaUtilsSuite extends SparkFunSuite { test("createStream") { val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000)) try { // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test") val test2: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2) val test3: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, supervisorStrategy = SupervisorStrategy.defaultStrategy) val test4: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) val test5: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) val test6: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null, SupervisorStrategy.defaultStrategy) } finally { ssc.stop() } } } class TestActor extends ActorReceiver { override def receive: Receive = { case m: String => store(m) case m => store(m, 10.seconds) } }
Example 102
Source File: PubNubWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming.pubnub import com.google.gson.JsonParser import com.pubnub.api.PNConfiguration import com.pubnub.api.enums.PNReconnectionPolicy import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Milliseconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.pubnub.{PubNubUtils, SparkPubNubMessage} object PubNubWordCount { def main(args: Array[String]): Unit = { if (args.length != 3) { // scalastyle:off println System.err.println( """ |Usage: PubNubWordCount <subscribeKey> <channel> | | <subscribeKey> subscribe key | <channel> channel | <aggregationPeriodMS> aggregation period in milliseconds | """.stripMargin ) // scalastyle:on System.exit(1) } val Seq(subscribeKey, channel, aggregationPeriod) = args.toSeq val sparkConf = new SparkConf().setAppName("PubNubWordCount").setMaster("local[2]") val ssc = new StreamingContext(sparkConf, Milliseconds(aggregationPeriod.toLong)) val config = new PNConfiguration config.setSubscribeKey(subscribeKey) config.setSecure(true) config.setReconnectionPolicy(PNReconnectionPolicy.LINEAR) val pubNubStream: ReceiverInputDStream[SparkPubNubMessage] = PubNubUtils.createStream( ssc, config, Seq(channel), Seq(), None, StorageLevel.MEMORY_AND_DISK_SER_2) val wordCounts = pubNubStream .flatMap( message => new JsonParser().parse(message.getPayload) .getAsJsonObject.get("text").getAsString.split("\\s") ) .map(word => (word, 1)) .reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 103
Source File: PubNubUtils.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.pubnub import java.util.{Set => JSet} import collection.JavaConverters._ import com.pubnub.api.PNConfiguration import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaReceiverInputDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.ReceiverInputDStream object PubNubUtils { def createStream( jssc: JavaStreamingContext, configuration: PNConfiguration, channels: JSet[String], channelGroups: JSet[String], timeToken: Option[Long], storageLevel: StorageLevel): JavaReceiverInputDStream[SparkPubNubMessage] = { createStream( jssc.ssc, configuration, Seq.empty ++ channels.asScala, Seq.empty ++ channelGroups.asScala, timeToken, storageLevel ) } }
Example 104
Source File: CloudantChangesConfig.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.cloudant import org.apache.spark.storage.StorageLevel import org.apache.bahir.cloudant.common.JsonStoreConfigManager class CloudantChangesConfig(protocol: String, host: String, dbName: String, indexName: String = null, viewName: String = null) (username: String, password: String, partitions: Int, maxInPartition: Int, minInPartition: Int, requestTimeout: Long, bulkSize: Int, schemaSampleSize: Int, createDBOnSave: Boolean, endpoint: String, selector: String, timeout: Int, storageLevel: StorageLevel, useQuery: Boolean, queryLimit: Int, batchInterval: Int, numberOfRetries: Int) extends CloudantConfig(protocol, host, dbName, indexName, viewName)(username, password, partitions, maxInPartition, minInPartition, requestTimeout, bulkSize, schemaSampleSize, createDBOnSave, endpoint, useQuery, queryLimit, numberOfRetries) { override val defaultIndex: String = endpoint def getBatchInterval : Int = { batchInterval } def getSelector : String = { if (selector != null && !selector.isEmpty) { selector } else { val version = getClient.serverVersion if (version.matches("1.*")) { null } else { // Exclude design docs and deleted=true docs "{ \"_id\": { \"$regex\": \"^(?!_design/)\" }, " + "\"_deleted\": { \"$exists\": false } }" } } } def getStorageLevelForStreaming : StorageLevel = { if (storageLevel == null) { StorageLevel.MEMORY_ONLY } else { storageLevel } } def getContinuousChangesUrl: String = { var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=continuous&heartbeat=3000" if (getSelector != null) { url = url + "&filter=_selector" } url } def getChangesReceiverUrl: String = { var url = dbUrl + "/" + defaultIndex + "?include_docs=true&feed=normal" + "&seq_interval=" + bulkSize + "&timeout=" + timeout if (getSelector != null) { url = url + "&filter=_selector" } url } // Use _all_docs endpoint for getting the total number of docs def getTotalUrl: String = { dbUrl + "/" + JsonStoreConfigManager.ALL_DOCS_INDEX } } object CloudantChangesConfig { // Error message from internal _changes receiver var receiverErrorMsg: String = "" }
Example 105
Source File: ChangesReceiver.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.cloudant.internal import java.io.{BufferedReader, InputStreamReader} import java.util.concurrent.TimeUnit import com.google.gson.JsonParser import okhttp3._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.apache.bahir.cloudant.CloudantChangesConfig import org.apache.bahir.cloudant.common._ class ChangesReceiver(config: CloudantChangesConfig) extends Receiver[String](StorageLevel.MEMORY_AND_DISK) { def onStart() { // Start the thread that receives data over a connection new Thread("Cloudant Receiver") { override def run() { receive() } }.start() } private def receive(): Unit = { val okHttpClient: OkHttpClient = new OkHttpClient.Builder() .connectTimeout(5, TimeUnit.SECONDS) .readTimeout(60, TimeUnit.SECONDS) .build val url = config.getChangesReceiverUrl.toString val builder = new Request.Builder().url(url) if (config.username != null) { val credential = Credentials.basic(config.username, config.password) builder.header("Authorization", credential) } if(config.getSelector != null) { val jsonType = MediaType.parse("application/json; charset=utf-8") val selector = "{\"selector\":" + config.getSelector + "}" val selectorBody = RequestBody.create(jsonType, selector) builder.post(selectorBody) } val request = builder.build val response = okHttpClient.newCall(request).execute val status_code = response.code if (status_code == 200) { val changesInputStream = response.body.byteStream var json = new ChangesRow() if (changesInputStream != null) { val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream)) while ((json = ChangesRowScanner.readRowFromReader(bufferedReader)) != null) { if (!isStopped() && json != null && !json.getDoc.has("_deleted")) { store(json.getDoc.toString) } } } } else { val responseAsJson = new JsonParser().parse(response.body.string) val errorMsg = "Error retrieving _changes feed data from database " + "'" + config.getDbname + "' with response code " + status_code + ": " + responseAsJson.toString reportError(errorMsg, new CloudantException(errorMsg)) CloudantChangesConfig.receiverErrorMsg = errorMsg } } override def onStop(): Unit = { } }
Example 106
Source File: CloudantReceiver.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.cloudant import java.io.{BufferedReader, InputStreamReader} import java.util.concurrent.TimeUnit import okhttp3._ import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.apache.bahir.cloudant.common._ class CloudantReceiver(sparkConf: SparkConf, cloudantParams: Map[String, String]) extends Receiver[String](StorageLevel.MEMORY_AND_DISK) { // CloudantChangesConfig requires `_changes` endpoint option lazy val config: CloudantChangesConfig = { JsonStoreConfigManager.getConfig(sparkConf, cloudantParams + ("cloudant.endpoint" -> JsonStoreConfigManager.CHANGES_INDEX) ).asInstanceOf[CloudantChangesConfig] } def onStart() { // Start the thread that receives data over a connection new Thread("Cloudant Receiver") { override def run() { receive() } }.start() } private def receive(): Unit = { val okHttpClient: OkHttpClient = new OkHttpClient.Builder() .connectTimeout(5, TimeUnit.SECONDS) .readTimeout(60, TimeUnit.SECONDS) .build val url = config.getChangesReceiverUrl.toString val builder = new Request.Builder().url(url) if (config.username != null) { val credential = Credentials.basic(config.username, config.password) builder.header("Authorization", credential) } if(config.getSelector != null) { val jsonType = MediaType.parse("application/json; charset=utf-8") val selector = "{\"selector\":" + config.getSelector + "}" val selectorBody = RequestBody.create(jsonType, selector) builder.post(selectorBody) } val request = builder.build val response = okHttpClient.newCall(request).execute val status_code = response.code if (status_code == 200) { val changesInputStream = response.body.byteStream var json = new ChangesRow() if (changesInputStream != null) { val bufferedReader = new BufferedReader(new InputStreamReader(changesInputStream)) while ((json = ChangesRowScanner.readRowFromReader(bufferedReader)) != null) { if (!isStopped() && json != null && !json.getDoc.has("_deleted")) { store(json.getDoc.toString) } } } } else { val errorMsg = "Error retrieving _changes feed " + config.getDbname + ": " + status_code reportError(errorMsg, new CloudantException(errorMsg)) } } def onStop(): Unit = { } }
Example 107
Source File: TwitterAlgebirdHLL.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import com.twitter.algebird.HyperLogLog._ import com.twitter.algebird.HyperLogLogMonoid import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ // scalastyle:off val BIT_SIZE = 12 val filters = args val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(5)) val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER) val users = stream.map(status => status.getUser.getId) val hll = new HyperLogLogMonoid(BIT_SIZE) var globalHll = hll.zero var userSet: Set[Long] = Set() val approxUsers = users.mapPartitions(ids => { ids.map(id => hll.create(id)) }).reduce(_ + _) val exactUsers = users.map(id => Set(id)).reduce(_ ++ _) approxUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() globalHll += partial println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt)) println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt)) } }) exactUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() userSet ++= partial println("Exact distinct users this batch: %d".format(partial.size)) println("Exact distinct users overall: %d".format(userSet.size)) println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1 ) * 100)) } }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 108
Source File: TwitterInputDStream.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import twitter4j._ import twitter4j.auth.Authorization import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.Receiver private[streaming] class TwitterInputDStream( _ssc: StreamingContext, twitterAuth: Option[Authorization], query: Option[FilterQuery], storageLevel: StorageLevel ) extends ReceiverInputDStream[Status](_ssc) { private def createOAuthAuthorization(): Authorization = { new OAuthAuthorization(new ConfigurationBuilder().build()) } private val authorization = twitterAuth.getOrElse(createOAuthAuthorization()) override def getReceiver(): Receiver[Status] = { new TwitterReceiver(authorization, query, storageLevel) } } private[streaming] class TwitterReceiver( twitterAuth: Authorization, query: Option[FilterQuery], storageLevel: StorageLevel ) extends Receiver[Status](storageLevel) with Logging { @volatile private var twitterStream: TwitterStream = _ @volatile private var stopped = false def onStart() { try { val newTwitterStream = new TwitterStreamFactory().getInstance(twitterAuth) newTwitterStream.addListener(new StatusListener { def onStatus(status: Status): Unit = { store(status) } // Unimplemented def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) {} def onTrackLimitationNotice(i: Int) {} def onScrubGeo(l: Long, l1: Long) {} def onStallWarning(stallWarning: StallWarning) {} def onException(e: Exception) { if (!stopped) { restart("Error receiving tweets", e) } } }) if (query.isDefined) { newTwitterStream.filter(query.get) } else { newTwitterStream.sample() } setTwitterStream(newTwitterStream) logInfo("Twitter receiver started") stopped = false } catch { case e: Exception => restart("Error starting Twitter stream", e) } } def onStop() { stopped = true setTwitterStream(null) logInfo("Twitter receiver stopped") } private def setTwitterStream(newTwitterStream: TwitterStream) = synchronized { if (twitterStream != null) { twitterStream.shutdown() } twitterStream = newTwitterStream } }
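TwitterReceiver simply forwards the StorageLevel passed to TwitterUtils.createStream into the Receiver constructor. A usage sketch with an explicit, replicated serialized level (Twitter credentials are assumed to be configured via twitter4j system properties):

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.twitter.TwitterUtils

// Store tweets serialized and replicated to two executors for fault tolerance.
val tweets = TwitterUtils.createStream(ssc, None, Seq("spark"), StorageLevel.MEMORY_AND_DISK_SER_2)
tweets.map(_.getText).print()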
Example 109
Source File: TwitterStreamSuite.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import java.util.UUID import scala.collection.mutable import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually import org.scalatest.time import org.scalatest.time.Span import twitter4j.{FilterQuery, Status, TwitterFactory} import twitter4j.auth.{Authorization, NullAuthorization} import org.apache.spark.ConditionalSparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class TwitterStreamSuite extends ConditionalSparkFunSuite with Eventually with BeforeAndAfter with Logging { def shouldRunTest(): Boolean = sys.env.get("ENABLE_TWITTER_TESTS").contains("1") var ssc: StreamingContext = _ before { ssc = new StreamingContext("local[2]", this.getClass.getSimpleName, Seconds(1)) } after { if (ssc != null) { ssc.stop() } } test("twitter input stream") { val filters = Seq("filter1", "filter2") val query = new FilterQuery().language("fr,es") val authorization: Authorization = NullAuthorization.getInstance() // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters) val test3: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization)) val test5: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization), filters) val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream( ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test7: ReceiverInputDStream[Status] = TwitterUtils.createFilteredStream( ssc, Some(authorization), Some(query), StorageLevel.MEMORY_AND_DISK_SER_2) } testIf("messages received", () => TwitterStreamSuite.this.shouldRunTest()) { val userId = TwitterFactory.getSingleton.updateStatus( UUID.randomUUID().toString ).getUser.getId val receiveStream = TwitterUtils.createFilteredStream( ssc, None, Some(new FilterQuery().follow(userId)) ) @volatile var receivedMessages: mutable.Set[Status] = mutable.Set() receiveStream.foreachRDD { rdd => for (element <- rdd.collect()) { receivedMessages += element } receivedMessages } ssc.start() val nbOfMsg = 2 var publishedMessages: List[String] = List() (1 to nbOfMsg).foreach( _ => { publishedMessages = UUID.randomUUID().toString :: publishedMessages } ) eventually(timeout(Span(15, time.Seconds)), interval(Span(1000, time.Millis))) { publishedMessages.foreach( m => if (!receivedMessages.map(m => m.getText).contains(m.toString)) { TwitterFactory.getSingleton.updateStatus(m) } ) assert( publishedMessages.map(m => m.toString).toSet .subsetOf(receivedMessages.map(m => m.getText)) ) } } }
Example 110
Source File: PragmaticSentimentTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.sda.pragmatic import com.johnsnowlabs.nlp.annotators.common.Sentence import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs} import org.apache.spark.storage.StorageLevel import org.scalatest._ import org.scalatest.tagobjects.Slow class PragmaticSentimentBigTestSpec extends FlatSpec { "Parquet based data" should "be sentiment detected properly" taggedAs Slow in { import java.util.Date val data = ContentProvider.parquetData.limit(1000) val documentAssembler = new DocumentAssembler() .setInputCol("text") val assembled = documentAssembler.transform(data) val sentimentDetector = new SentimentDetector() val readyData = AnnotatorBuilder.withFullPOSTagger(AnnotatorBuilder.withFullLemmatizer(assembled)) val result = sentimentDetector .setInputCols(Array("token", "sentence")) .setOutputCol("my_sda_scores") .setDictionary(ExternalResource("src/test/resources/sentiment-corpus/default-sentiment-dict.txt", ReadAs.TEXT, Map("delimiter" -> ","))) .setEnableScore(false) .fit(readyData) .transform(readyData) import Annotation.extractors._ val date1 = new Date().getTime result.show(2) info(s"20 show sample of disk based sentiment analysis took: ${(new Date().getTime - date1)/1000} seconds") val date2 = new Date().getTime result.take("my_sda_scores", 5000) info(s"5000 take sample of disk based sentiment analysis took: ${(new Date().getTime - date2)/1000} seconds") val dataFromMemory = readyData.persist(StorageLevel.MEMORY_AND_DISK) info(s"data in memory is of size: ${dataFromMemory.count}") val resultFromMemory = sentimentDetector.fit(dataFromMemory).transform(dataFromMemory) val date3 = new Date().getTime resultFromMemory.show info(s"20 show sample of memory based sentiment analysis took: ${(new Date().getTime - date3)/1000} seconds") val date4 = new Date().getTime resultFromMemory.take("my_sda_scores", 5000) info(s"5000 take sample of memory based sentiment analysis took: ${(new Date().getTime - date4)/1000} seconds") succeed } } class PragmaticSentimentTestSpec extends FlatSpec with PragmaticSentimentBehaviors { val sentimentSentenceTexts = "The staff of the restaurant is nice and the eggplant is bad " + "I recommend others to avoid because it is too expensive" val sentimentSentences = { new Tokenizer().fit(ContentProvider.parquetData).tag(Sentence.fromTexts(sentimentSentenceTexts)).toArray } "an isolated sentiment detector" should behave like isolatedSentimentDetector(sentimentSentences, -4.0) "a spark based sentiment detector" should behave like sparkBasedSentimentDetector( DataBuilder.basicDataBuild("The staff of the restaurant is nice and the eggplant is bad." + " I recommend others to avoid.") ) "A SentimentDetector" should "be readable and writable" in { val sentimentDetector = new SentimentDetector().setDictionary(ExternalResource("src/test/resources/sentiment-corpus/default-sentiment-dict.txt", ReadAs.TEXT, Map("delimiter" -> ","))).fit(DataBuilder.basicDataBuild("dummy")) val path = "./test-output-tmp/sentimentdetector" try { sentimentDetector.write.overwrite.save(path) val sentimentDetectorRead = SentimentDetectorModel.read.load(path) assert(sentimentDetector.model.score(sentimentSentences) == sentimentDetectorRead.model.score(sentimentSentences)) } catch { case _: java.io.IOException => succeed } } }
Example 111
Source File: Sessionize.scala From ml-in-scala with The Unlicense | 5 votes |
package org.akozlov.chapter06

import java.io._
import java.time.ZoneOffset
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object Sessionize extends App {
  val sc = new SparkContext("local[8]", "Sessionize", new SparkConf())

  val checkoutPattern = ".*>checkout.*".r.pattern

  // a basic page view structure
  case class PageView(ts: String, path: String) extends Serializable with Ordered[PageView] {
    override def toString: String = {
      s"($ts #$path)"
    }
    def compare(other: PageView) = ts compare other.ts
  }

  // represent a session
  case class Session[A <: PageView](id: String, visits: Seq[A]) extends Serializable {
    override def toString: String = {
      val vsts = visits.mkString("[", ",", "]")
      s"($id -> $vsts)"
    }
  }

  def toEpochSeconds(str: String) = {
    LocalDateTime.parse(str, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")).toEpochSecond(ZoneOffset.UTC)
  }

  val sessions = sc.textFile("data/clickstream")
    .map(line => { val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20))) })
    .groupByKey.map(x => { new Session(x._1, x._2.toSeq.sorted) })
    .cache

  // sessions.take(100).foreach(println)

  def findAllCheckoutSessions(s: Session[PageView]) = {
    s.visits.tails.filter {
      _ match {
        case PageView(ts1, "mycompanycom>homepage") :: PageView(ts2, page) :: tail if (page != "mycompanycom>homepage") => true
        case _ => false
      }
    }
    .foldLeft(Seq[Session[PageView]]()) {
      case (r, x) => {
        x.find(y => checkoutPattern.matcher(y.path).matches) match {
          case Some(checkout) if (toEpochSeconds(checkout.ts) > toEpochSeconds(x.head.ts) + 60) =>
            r.:+(new Session(s.id, x.slice(0, x.indexOf(checkout))))
          case _ => r
        }
      }
    }
  }

  val prodLandingSessions = sessions.flatMap(findAllCheckoutSessions)

  prodLandingSessions.collect.foreach(println)

  sc.stop()
}
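The .cache call on sessions above is shorthand for persist(StorageLevel.MEMORY_ONLY), even though StorageLevel is imported explicitly. A hedged alternative is to request a serialized level that can spill to disk; note an RDD's level can only be assigned once, so this replaces .cache rather than being called in addition to it:

val sessions = sc.textFile("data/clickstream")
  .map(line => { val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20))) })
  .groupByKey.map(x => new Session(x._1, x._2.toSeq.sorted))
  .persist(StorageLevel.MEMORY_AND_DISK_SER)  // serialized in memory, spills to disk when needed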
Example 112
Source File: FlumeWordCount.scala From ml-in-scala with The Unlicense | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.flume._

object FlumeWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/flume_check")
    val hostPort = args(0).split(":")
    System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]")
    val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY)
    val words = lines
      .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 113
Source File: KafkaWordCount.scala From ml-in-scala with The Unlicense | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._

object KafkaWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/kafka_check")
    System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example")
    val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY)
    val words = lines
      .flatMap(_._2.toLowerCase.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 114
Source File: ApspResult.scala From spark-all-pairs-shortest-path with Apache License 2.0 | 5 votes |
import java.io.Serializable

import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.BlockMatrix
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel

class ApspResult (
    var size: Long,
    var distMatrix: BlockMatrix)
  extends Serializable with Logging {

  validateResult(distMatrix)

  private def validateResult(result: BlockMatrix): Unit = {
    require(result.numRows == result.numCols,
      "The shortest distance matrix is not square.")
    require(size == result.numRows,
      s"The size of the shortest distance matrix does not match $size.")
    if (result.blocks.getStorageLevel == StorageLevel.NONE) {
      logWarning("The APSP result is not cached. Lookup could be slow")
    }
  }

  def lookupDist(srcId: Long, dstId: Long): Double = {
    val sizePerBlock = distMatrix.rowsPerBlock
    val rowBlockId = (srcId / sizePerBlock).toInt
    val colBlockId = (dstId / sizePerBlock).toInt
    val block = distMatrix.blocks.filter { case ((i, j), _) => (i == rowBlockId) & (j == colBlockId) }
      .first._2
    block.toArray((dstId % sizePerBlock).toInt * block.numRows + (srcId % sizePerBlock).toInt)
  }

  def toLocal(): Matrix = {
    distMatrix.toLocalMatrix()
  }
}
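validateResult only warns when the distance matrix's blocks are not cached. A minimal sketch of satisfying that check before constructing the result, assuming a BlockMatrix named distMatrix and a matrix dimension n:

// Cache the underlying blocks RDD so ApspResult lookups do not recompute the matrix.
distMatrix.blocks.persist(StorageLevel.MEMORY_AND_DISK)
// or simply: distMatrix.cache()  // caches the blocks RDD at MEMORY_ONLY
val result = new ApspResult(n, distMatrix)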
Example 115
Source File: GraphBuilderApp.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.kohsuke.args4j.{ CmdLineParser, Option } private[pagerank] def runFromInputs(options: Options, spark: SparkSession, input: RDD[String]): Unit = { // read TSV edges // coalesce to smaller number of partitions // convert to internal edges data type val weightedEdges = input .map(parseEdge) .persist(StorageLevel.MEMORY_ONLY_2) // normalize edges val edges = GraphUtils.normalizeOutEdgeWeights(weightedEdges) // unpersist temporary datasets weightedEdges.unpersist() // build PageRank graph val graph = PageRankGraph.fromEdgesWithUniformPriors( edges, tmpStorageLevel = StorageLevel.MEMORY_ONLY_2, edgesStorageLevel = StorageLevel.MEMORY_AND_DISK, verticesStorageLevel = StorageLevel.MEMORY_AND_DISK ) // save graph PageRankGraph.save(spark, graph, options.output) // run additional graph statistics (optional) // TODO(jd): does not exist yet // run graph validation (optional) if (options.validateGraphStructure) { graph.validateStructure().foreach { errors => println(errors.mkString("\n")) } } } private[pagerank] def parseEdge(str: String): Edge = { val Array(srcId, dstId, weight) = str.split("\t") Edge(srcId.toLong, dstId.toLong, weight.toDouble) } }
Example 116
Source File: PageRankApp.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.kohsuke.args4j.{CmdLineParser, Option => ArgOption} private[pagerank] def runFromInputs( spark: SparkSession, options: Options, inputGraph: PageRankGraph, priorsOpt: Option[VertexRDD]): Unit = { import spark.implicits._ // replace priors, if another vector was supplied val graph = priorsOpt match { case None => inputGraph case Some(priors) => inputGraph.updateVertexValues(priors) } PageRank.run( graph, teleportProb = options.teleportProb, maxIterations = options.maxIterations, convergenceThresholdOpt = options.convergenceThresholdOpt ) .toDS() .write .parquet(options.output) } private[pagerank] def extractStatistic[T](stats: Seq[String], key: String)(parse: (String) => T): T = { stats.map(_.split(",")).find(_.apply(0) == key) match { case Some(pair) => parse(pair(1)) case None => throw new IllegalArgumentException(s"Statistic not found with key: $key") } } }
Example 117
Source File: ConvergenceCheckApp.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.kohsuke.args4j.{ CmdLineParser, Option => ArgOption } object ConvergenceCheckApp extends SparkApp { class Options { @ArgOption(name = "--inputA", usage = "Version A of the PageRank vector", required = true) var inputA: String = _ @ArgOption(name = "--inputB", usage = "Version B of the PageRank vector", required = true) var inputB: String = _ } def run(args: Array[String], spark: SparkSession): Unit = { import spark.implicits._ def read(path: String): VertexRDD = spark.read.parquet(path).as[Vertex].rdd val options = new Options() new CmdLineParser(options).parseArgument(args: _*) val a = read(options.inputA) val b = read(options.inputB) val delta = sumOfDifferences(a, b) println("Sum of the vertices: ") println(f" A: ${a.map(_.value).sum()}") println(f" B: ${b.map(_.value).sum()}") println(f"Sum of component-wise differences: $delta%.15f") } private[pagerank] def sumOfDifferences(left: VertexRDD, right: VertexRDD): Value = { val leftPair = left.map(v => (v.id, v.value)) val rightPair = right.map(v => (v.id, v.value)) leftPair .join(rightPair) .map { case (_, (l, r)) => math.abs(l - r) } .sum() } }
Example 118
Source File: PageRankAppTest.scala From spark-pagerank with MIT License | 5 votes |
package com.soundcloud.spark.pagerank import java.io.File import org.apache.commons.io.FileUtils import org.apache.spark.storage.StorageLevel import org.scalatest.{ BeforeAndAfter, Matchers, FunSuite } class PageRankAppTest extends FunSuite with BeforeAndAfter with Matchers with GraphTesting with SparkTesting { val path = "target/test/PageRankAppTest" before { FileUtils.deleteDirectory(new File(path)) } // TODO(jd): design a better integration test as this just runs the app without assertions test("integration test") { val options = new PageRankApp.Options() options.output = path val numVertices = 5 val prior = 1.0 / numVertices val stats = Seq(s"numVertices,$numVertices") val edges = spark.sparkContext.parallelize(Seq[OutEdgePair]( // node 1 is dangling (2, OutEdge(1, 1.0)), (3, OutEdge(1, 1.0)), (4, OutEdge(2, 0.5)), (4, OutEdge(3, 0.5)), (5, OutEdge(3, 0.5)), (5, OutEdge(4, 0.5)) )) val vertices = spark.sparkContext.parallelize(Seq[RichVertexPair]( (1, VertexMetadata(prior, true)), (2, VertexMetadata(prior, false)), (3, VertexMetadata(prior, false)), (4, VertexMetadata(prior, false)), (5, VertexMetadata(prior, false)) )) val graph = PageRankGraph( numVertices, edges.persist(StorageLevel.MEMORY_ONLY), vertices.persist(StorageLevel.MEMORY_ONLY) ) PageRankApp.runFromInputs( spark, options, graph, priorsOpt = None ) } }
Example 119
Source File: ConcatColumnBenchmark.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.jmh import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{DataFrame, SparkSession, functions} import org.apache.spark.storage.StorageLevel import org.opencypher.morpheus.impl.MorpheusFunctions import org.opencypher.morpheus.impl.expressions.EncodeLong._ import org.openjdk.jmh.annotations._ @State(Scope.Benchmark) @BenchmarkMode(Array(Mode.AverageTime)) class ConcatColumnBenchmark { implicit var sparkSession: SparkSession = _ var df: DataFrame = _ @Setup def setUp(): Unit = { sparkSession = SparkSession.builder().master("local[*]").getOrCreate() val fromRow = 100000000L val numRows = 1000000 val rangeDf = sparkSession.range(fromRow, fromRow + numRows).toDF("i") val indexCol = rangeDf.col("i") df = rangeDf .withColumn("s", indexCol.cast(StringType)) .withColumn("b", indexCol.encodeLongAsMorpheusId) .partitionAndCache } @Benchmark def concatWs(): Int = { val result = df.withColumn("c", functions.concat_ws("|", df.col("i"), df.col("s"), df.col("b"))) result.select("c").collect().length } @Benchmark def serialize(): Int = { val result = df.withColumn("c", MorpheusFunctions.serialize(df.col("i"), df.col("s"), df.col("b"))) result.select("c").collect().length } implicit class DataFrameSetup(df: DataFrame) { def partitionAndCache: DataFrame = { val cached = df.repartition(10).persist(StorageLevel.MEMORY_ONLY) cached.count() cached } } }
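partitionAndCache above pins the benchmark DataFrame with the default MEMORY_ONLY level and forces materialization with count(). A hedged variant for inputs larger than executor memory, reusing the same imports as the benchmark; the implicit class and method name are illustrative, not part of morpheus:

implicit class DataFrameDiskSetup(df: DataFrame) {
  // Assumed alternative to partitionAndCache: serialized storage that can spill to disk.
  def partitionAndCacheWithSpill: DataFrame = {
    val cached = df.repartition(10).persist(StorageLevel.MEMORY_AND_DISK_SER)
    cached.count()  // force evaluation so later benchmark iterations hit the cache
    cached
  }
}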
Example 120
Source File: CachedDataSourceTest.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.api.io import org.apache.spark.storage.StorageLevel import org.opencypher.morpheus.api.io.util.CachedDataSource._ import org.opencypher.morpheus.impl.MorpheusConverters._ import org.opencypher.morpheus.impl.table.SparkTable.DataFrameTable import org.opencypher.morpheus.testing.MorpheusTestSuite import org.opencypher.morpheus.testing.fixture.GraphConstructionFixture import org.opencypher.okapi.api.graph.{Namespace, PropertyGraph} import org.opencypher.okapi.relational.api.graph.RelationalCypherGraph import org.opencypher.okapi.relational.impl.graph.ScanGraph import org.scalatest.BeforeAndAfterEach class CachedDataSourceTest extends MorpheusTestSuite with GraphConstructionFixture with BeforeAndAfterEach { override val testNamespace: Namespace = morpheus.catalog.sessionNamespace private val testDataSource = morpheus.catalog.source(testNamespace) override protected def beforeEach(): Unit = { super.beforeEach() testDataSource.store(testGraphName, initGraph(s"CREATE (:A)")) } override protected def afterEach(): Unit = { if (testDataSource.hasGraph(testGraphName)) { unpersist(testDataSource.graph(testGraphName).asMorpheus) testDataSource.delete(testGraphName) } super.afterEach() } it("should cache the graph on first read") { val g0 = testDataSource.graph(testGraphName) assert(g0, StorageLevel.NONE) val cachedDataSource = testDataSource.withCaching val g1 = cachedDataSource.graph(testGraphName) assert(g1, StorageLevel.MEMORY_AND_DISK) assert(g0, StorageLevel.MEMORY_AND_DISK) // side effect for session ds cachedDataSource.hasGraph(testGraphName) should equal(true) testDataSource.hasGraph(testGraphName) should equal(true) } it("should cache the graph on first read with specific storage level") { val cachedDs = testDataSource.withCaching(StorageLevel.MEMORY_ONLY) val g = cachedDs.graph(testGraphName) assert(g, StorageLevel.MEMORY_ONLY) } it("should delete a graph and remove from cache") { val cachedDs = testDataSource.withCaching val g = cachedDs.graph(testGraphName) assert(g, StorageLevel.MEMORY_AND_DISK) cachedDs.delete(testGraphName) assert(g, StorageLevel.NONE) cachedDs.hasGraph(testGraphName) should equal(false) testDataSource.hasGraph(testGraphName) should equal(false) } private def assert(g: PropertyGraph, storageLevel: StorageLevel): Unit = { g.asInstanceOf[ScanGraph[DataFrameTable]].scans .map(_.table.df) .foreach(_.storageLevel should equal(storageLevel)) } private def unpersist(graph: RelationalCypherGraph[DataFrameTable]): Unit = { graph.tables.foreach(_.df.unpersist) } }
Example 121
Source File: CustomReceiver.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
Example 122
Source File: FlumeEventCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.flume._ import org.apache.spark.util.IntParam object FlumeEventCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: FlumeEventCount <host> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) // Create the context and set the batch size val sparkConf = new SparkConf().setAppName("FlumeEventCount") val ssc = new StreamingContext(sparkConf, batchInterval) // Create a flume stream val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2) // Print out the count of events received from this server in each batch stream.count().map(cnt => "Received " + cnt + " flume events." ).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 123
Source File: RawNetworkGrep.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.util.IntParam object RawNetworkGrep { def main(args: Array[String]) { if (args.length != 4) { System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args val sparkConf = new SparkConf().setAppName("RawNetworkGrep") // Create the context val ssc = new StreamingContext(sparkConf, Duration(batchMillis)) val rawStreams = (1 to numStreams).map(_ => ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => println("Grep count: " + r.collect().mkString)) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 124
Source File: SqlNetworkWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 125
Source File: NetworkWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 126
Source File: GraphLoader.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkContext
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] = {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile
}
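edgeListFile lets callers choose separate storage levels for the edge and vertex RDDs. A small usage sketch with serialized caching for both; the input path is a placeholder:

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.storage.StorageLevel

val graph = GraphLoader.edgeListFile(
  sc,
  "hdfs:///data/edges.tsv",  // placeholder path
  canonicalOrientation = true,
  edgeStorageLevel = StorageLevel.MEMORY_ONLY_SER,
  vertexStorageLevel = StorageLevel.MEMORY_ONLY_SER)
println(graph.numEdges)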
Example 127
Source File: EdgeRDDImpl.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 128
Source File: EdgeRDDSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel

class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext {

  test("cache, getStorageLevel") {
    // test to see if getStorageLevel returns correct value after caching
    withSpark { sc =>
      val verts = sc.parallelize(List((0L, 0), (1L, 1), (1L, 2), (2L, 3), (2L, 3), (2L, 3)))
      val edges = EdgeRDD.fromEdges(sc.parallelize(List.empty[Edge[Int]]))
      assert(edges.getStorageLevel == StorageLevel.NONE)
      edges.cache()
      assert(edges.getStorageLevel == StorageLevel.MEMORY_ONLY)
    }
  }

}
Example 129
Source File: PeriodicGraphCheckpointer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel

private[mllib] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.persist()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.persist()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
}
Example 130
Source File: KinesisInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 131
Source File: KafkaStreamSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() stream.map(_._2).countByValue().foreachRDD { r => r.collect().foreach { kv => result.synchronized { val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(result.synchronized { sent === result }) } } }
Example 132
Source File: FlumeStreamSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 133
Source File: DatasetCacheSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.storage.StorageLevel class DatasetCacheSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("get storage level") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") // default storage level ds1.persist() ds2.cache() assert(ds1.storageLevel == StorageLevel.MEMORY_AND_DISK) assert(ds2.storageLevel == StorageLevel.MEMORY_AND_DISK) // unpersist ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE) // non-default storage level ds1.persist(StorageLevel.MEMORY_ONLY_2) assert(ds1.storageLevel == StorageLevel.MEMORY_ONLY_2) // joined Dataset should not be persisted val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") assert(joined.storageLevel == StorageLevel.NONE) } test("persist and unpersist") { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int]) val cached = ds.cache() // count triggers the caching action. It should not throw. cached.count() // Make sure, the Dataset is indeed cached. assertCached(cached) // Check result. checkDataset( cached, 2, 3, 4) // Drop the cache. cached.unpersist() assert(cached.storageLevel == StorageLevel.NONE, "The Dataset should not be cached.") } test("persist and then rebind right encoder when join 2 datasets") { val ds1 = Seq("1", "2").toDS().as("a") val ds2 = Seq(2, 3).toDS().as("b") ds1.persist() assertCached(ds1) ds2.persist() assertCached(ds2) val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") checkDataset(joined, ("2", 2)) assertCached(joined, 2) ds1.unpersist() assert(ds1.storageLevel == StorageLevel.NONE, "The Dataset ds1 should not be cached.") ds2.unpersist() assert(ds2.storageLevel == StorageLevel.NONE, "The Dataset ds2 should not be cached.") } test("persist and then groupBy columns asKey, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(_._1) val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) } agged.persist() checkDataset( agged.filter(_._1 == "b"), ("b", 3)) assertCached(agged.filter(_._1 == "b")) ds.unpersist() assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.") agged.unpersist() assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } }
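Dataset.persist accepts the same levels as RDDs, and storageLevel reports NONE after unpersist, as the suite above asserts. A minimal standalone sketch, assuming a SparkSession named spark:

import org.apache.spark.storage.StorageLevel

val ds = spark.range(100).persist(StorageLevel.DISK_ONLY)
ds.count()                                    // materializes the cache
assert(ds.storageLevel == StorageLevel.DISK_ONLY)
ds.unpersist()
assert(ds.storageLevel == StorageLevel.NONE)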
Example 134
Source File: WindowedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.Duration

private[streaming] class WindowedDStream[T: ClassTag](
    parent: DStream[T],
    _windowDuration: Duration,
    _slideDuration: Duration)
  extends DStream[T](parent.ssc) {

  if (!_windowDuration.isMultipleOf(parent.slideDuration)) {
    throw new Exception("The window duration of windowed DStream (" + _windowDuration + ") " +
      "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")")
  }

  if (!_slideDuration.isMultipleOf(parent.slideDuration)) {
    throw new Exception("The slide duration of windowed DStream (" + _slideDuration + ") " +
      "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")")
  }

  // Persist parent level by default, as those RDDs are going to be obviously reused.
  parent.persist(StorageLevel.MEMORY_ONLY_SER)

  def windowDuration: Duration = _windowDuration

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = _slideDuration

  override def parentRememberDuration: Duration = rememberDuration + windowDuration

  override def persist(level: StorageLevel): DStream[T] = {
    // Do not let this windowed DStream be persisted as windowed (union-ed) RDDs share underlying
    // RDDs and persisting the windowed RDDs would store numerous copies of the underlying data.
    // Instead control the persistence of the parent DStream.
    parent.persist(level)
    this
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime)
    val rddsInWindow = parent.slice(currentWindow)
    Some(ssc.sc.union(rddsInWindow))
  }
}
Example 135
Source File: SocketInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io._ import java.net.{ConnectException, Socket} import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.NextIterator private[streaming] class SocketInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](_ssc) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { private var socket: Socket = _ def onStart() { logInfo(s"Connecting to $host:$port") try { socket = new Socket(host, port) } catch { case e: ConnectException => restart(s"Error connecting to $host:$port", e) return } logInfo(s"Connected to $host:$port") // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // in case restart thread close it twice synchronized { if (socket != null) { socket.close() socket = null logInfo(s"Closed socket to $host:$port") } } } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8)) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
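SocketReceiver stores each received line at the StorageLevel passed through socketTextStream. A small usage sketch with an explicit, non-replicated serialized level; host and port are placeholders:

import org.apache.spark.storage.StorageLevel

// The default for socketTextStream is MEMORY_AND_DISK_SER_2; a single-copy level
// like this is usually only appropriate for local testing.
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
lines.print()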
Example 136
Source File: BlockTransferService.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Future, Promise}
import scala.concurrent.duration.Duration
import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient}
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.util.ThreadUtils

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel,
      classTag: ClassTag[_]): Unit = {
    val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag)
    ThreadUtils.awaitResult(future, Duration.Inf)
  }
}
Example 137
Source File: NettyBlockRpcServer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 138
Source File: SparkContextInfoSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import org.scalatest.Assertions import org.apache.spark.storage.StorageLevel class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { test("getPersistentRDDs only returns RDDs that are marked as cached") { sc = new SparkContext("local", "test") assert(sc.getPersistentRDDs.isEmpty === true) val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(sc.getPersistentRDDs.isEmpty === true) rdd.cache() assert(sc.getPersistentRDDs.size === 1) assert(sc.getPersistentRDDs.values.head === rdd) } test("getPersistentRDDs returns an immutable map") { sc = new SparkContext("local", "test") val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() val myRdds = sc.getPersistentRDDs assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) // myRdds2 should have 2 RDDs, but myRdds should not change val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache() val myRdds2 = sc.getPersistentRDDs assert(myRdds2.size === 2) assert(myRdds2(0) === rdd1) assert(myRdds2(1) === rdd2) assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) } test("getRDDStorageInfo only reports on RDDs that actually persist data") { sc = new SparkContext("local", "test") val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.size === 0) rdd.collect() assert(sc.getRDDStorageInfo.size === 1) assert(sc.getRDDStorageInfo.head.isCached) assert(sc.getRDDStorageInfo.head.memSize > 0) assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY) } test("call sites report correct locations") { sc = new SparkContext("local", "test") testPackage.runCallSiteTest(sc) } } package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "makeRDD") assert(file === "SparkContextInfoSuite.scala") line.toInt case _ => fail("Did not match expected call site format") } curCallSite match { case CALL_SITE_REGEX(func, file, line) => assert(func === "getCallSite") // this is correct because we called it from outside of Spark assert(file === "SparkContextInfoSuite.scala") assert(line.toInt === rddCreationLine.toInt + 2) case _ => fail("Did not match expected call site format") } } }
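The same bookkeeping exercised by this suite can be used to inspect caching from application code. A short hedged sketch, assuming an active SparkContext named sc:

import org.apache.spark.storage.StorageLevel

val rdd = sc.parallelize(1 to 1000).persist(StorageLevel.MEMORY_ONLY)
rdd.count()  // materialize the cache
// Enumerate RDDs that were explicitly marked for caching
sc.getPersistentRDDs.foreach { case (id, r) => println(s"RDD $id -> ${r.getStorageLevel.description}") }
// Report only RDDs that actually hold persisted data
sc.getRDDStorageInfo.foreach(info => println(s"${info.name}: cached=${info.isCached}, memSize=${info.memSize}"))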
Example 139
Source File: SparkTachyonHdfsLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 140
Source File: SparkTachyonPi.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import scala.math.random

import org.apache.spark._
import org.apache.spark.storage.StorageLevel

object SparkTachyonPi {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkTachyonPi")
    val spark = new SparkContext(sparkConf)

    val slices = if (args.length > 0) args(0).toInt else 2
    val n = 100000 * slices

    val rdd = spark.parallelize(1 to n, slices)
    rdd.persist(StorageLevel.OFF_HEAP)
    val count = rdd.map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)

    spark.stop()
  }
}
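Both Tachyon examples above persist their RDDs with StorageLevel.OFF_HEAP, which in the Spark 1.x codebases these examples come from delegated block storage to an external Tachyon store. The following is a minimal sketch, not part of the original example, of how the same Pi computation might fall back to an on-heap serialized level when no external store is configured; the choice of MEMORY_ONLY_SER is an assumption.

// Sketch: same computation, persisted serialized on the JVM heap instead of off-heap.
import scala.math.random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object OnHeapPi {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("OnHeapPi"))
    val slices = if (args.length > 0) args(0).toInt else 2
    val n = 100000 * slices

    // MEMORY_ONLY_SER keeps a single serialized copy per partition in memory.
    val rdd = sc.parallelize(1 to n, slices).persist(StorageLevel.MEMORY_ONLY_SER)
    val count = rdd.map { _ =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)
    sc.stop()
  }
}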
Example 141
Source File: CustomReceiver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import java.io.{InputStreamReader, BufferedReader, InputStream} import java.net.Socket import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8")) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
Example 142
Source File: FlumeEventCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.flume._ import org.apache.spark.util.IntParam object FlumeEventCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: FlumeEventCount <host> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) // Create the context and set the batch size val sparkConf = new SparkConf().setAppName("FlumeEventCount") val ssc = new StreamingContext(sparkConf, batchInterval) // Create a flume stream val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2) // Print out the count of events received from this server in each batch stream.count().map(cnt => "Received " + cnt + " flume events." ).print() ssc.start() ssc.awaitTermination() } }
Example 143
Source File: TwitterAlgebirdHLL.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import com.twitter.algebird.HyperLogLogMonoid import com.twitter.algebird.HyperLogLog._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf // scalastyle:off val BIT_SIZE = 12 val filters = args val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL") val ssc = new StreamingContext(sparkConf, Seconds(5)) val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER) val users = stream.map(status => status.getUser.getId) val hll = new HyperLogLogMonoid(BIT_SIZE) var globalHll = hll.zero var userSet: Set[Long] = Set() val approxUsers = users.mapPartitions(ids => { ids.map(id => hll(id)) }).reduce(_ + _) val exactUsers = users.map(id => Set(id)).reduce(_ ++ _) approxUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() globalHll += partial println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt)) println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt)) } }) exactUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() userSet ++= partial println("Exact distinct users this batch: %d".format(partial.size)) println("Exact distinct users overall: %d".format(userSet.size)) println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1 ) * 100)) } }) ssc.start() ssc.awaitTermination() } }
Example 144
Source File: RawNetworkGrep.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.util.IntParam object RawNetworkGrep { def main(args: Array[String]) { if (args.length != 4) { System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args val sparkConf = new SparkConf().setAppName("RawNetworkGrep") // Create the context val ssc = new StreamingContext(sparkConf, Duration(batchMillis)) val rawStreams = (1 to numStreams).map(_ => ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => println("Grep count: " + r.collect().mkString)) ssc.start() ssc.awaitTermination() } }
Example 145
Source File: NetworkWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the words in the
    // input stream of \n delimited text (e.g. generated by 'nc').
    // Note that an unreplicated storage level is only appropriate when running
    // locally; in a distributed setting, replication is necessary for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
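The comment in the example above distinguishes the single-copy levels from the replicated "_2" variants that receivers use for fault tolerance in a distributed deployment. A short REPL-style sketch of the difference follows; the commented-out call mirrors the socketTextStream call above and assumes the same ssc and args.

// Sketch: single-copy vs. replicated storage levels for a receiver.
import org.apache.spark.storage.StorageLevel

val local = StorageLevel.MEMORY_AND_DISK_SER        // one copy: fine for local testing
val replicated = StorageLevel.MEMORY_AND_DISK_SER_2 // two copies: tolerates loss of one executor

println(local.replication)       // 1
println(replicated.replication)  // 2

// Distributed variant of the call in the example above:
// val lines = ssc.socketTextStream(args(0), args(1).toInt, replicated)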
Example 146
Source File: FlumePollingEventCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.flume._ import org.apache.spark.util.IntParam import java.net.InetSocketAddress object FlumePollingEventCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: FlumePollingEventCount <host> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) // Create the context and set the batch size val sparkConf = new SparkConf().setAppName("FlumePollingEventCount") val ssc = new StreamingContext(sparkConf, batchInterval) // Create a flume stream that polls the Spark Sink running in a Flume agent val stream = FlumeUtils.createPollingStream(ssc, host, port) // Print out the count of events received from this server in each batch stream.count().map(cnt => "Received " + cnt + " flume events." ).print() ssc.start() ssc.awaitTermination() } }
Example 147
Source File: MQTTWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.eclipse.paho.client.mqttv3._ import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.mqtt._ import org.apache.spark.SparkConf object MQTTWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: MQTTWordCount <MqttbrokerUrl> <topic>") System.exit(1) } val Seq(brokerUrl, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("MQTTWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) val words = lines.flatMap(x => x.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 148
Source File: GraphLoader.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
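edgeListFile exposes edgeStorageLevel and vertexStorageLevel so callers can decide how the loaded graph is cached. A minimal usage sketch follows, assuming an existing SparkContext named sc; the HDFS path and partition count are hypothetical.

// Sketch: load an edge list with serialized in-memory storage for edges and vertices.
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.storage.StorageLevel

val graph = GraphLoader.edgeListFile(
  sc,
  "hdfs:///data/edges.txt",             // hypothetical input path
  canonicalOrientation = false,
  numEdgePartitions = 8,                // hypothetical partition count
  edgeStorageLevel = StorageLevel.MEMORY_ONLY_SER,
  vertexStorageLevel = StorageLevel.MEMORY_ONLY_SER)

println("Loaded " + graph.edges.count() + " edges")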
Example 149
Source File: EdgeRDDImpl.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 150
Source File: EdgeRDDSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel

class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext {

  test("cache, getStorageLevel") {
    // test to see if getStorageLevel returns correct value after caching
    withSpark { sc =>
      val verts = sc.parallelize(List((0L, 0), (1L, 1), (1L, 2), (2L, 3), (2L, 3), (2L, 3)))
      val edges = EdgeRDD.fromEdges(sc.parallelize(List.empty[Edge[Int]]))
      assert(edges.getStorageLevel == StorageLevel.NONE)
      edges.cache()
      assert(edges.getStorageLevel == StorageLevel.MEMORY_ONLY)
    }
  }
}
Example 151
Source File: TwitterInputDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import twitter4j._ import twitter4j.auth.Authorization import twitter4j.conf.ConfigurationBuilder import twitter4j.auth.OAuthAuthorization import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ import org.apache.spark.storage.StorageLevel import org.apache.spark.Logging import org.apache.spark.streaming.receiver.Receiver private[streaming] class TwitterInputDStream( @transient ssc_ : StreamingContext, twitterAuth: Option[Authorization], filters: Seq[String], storageLevel: StorageLevel ) extends ReceiverInputDStream[Status](ssc_) { private def createOAuthAuthorization(): Authorization = { new OAuthAuthorization(new ConfigurationBuilder().build()) } private val authorization = twitterAuth.getOrElse(createOAuthAuthorization()) override def getReceiver(): Receiver[Status] = { new TwitterReceiver(authorization, filters, storageLevel) } } private[streaming] class TwitterReceiver( twitterAuth: Authorization, filters: Seq[String], storageLevel: StorageLevel ) extends Receiver[Status](storageLevel) with Logging { @volatile private var twitterStream: TwitterStream = _ @volatile private var stopped = false def onStart() { try { val newTwitterStream = new TwitterStreamFactory().getInstance(twitterAuth) newTwitterStream.addListener(new StatusListener { def onStatus(status: Status): Unit = { store(status) } // Unimplemented def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) {} def onTrackLimitationNotice(i: Int) {} def onScrubGeo(l: Long, l1: Long) {} def onStallWarning(stallWarning: StallWarning) {} def onException(e: Exception) { if (!stopped) { restart("Error receiving tweets", e) } } }) val query = new FilterQuery if (filters.size > 0) { query.track(filters.toArray) newTwitterStream.filter(query) } else { newTwitterStream.sample() } setTwitterStream(newTwitterStream) logInfo("Twitter receiver started") stopped = false } catch { case e: Exception => restart("Error starting Twitter stream", e) } } def onStop() { stopped = true setTwitterStream(null) logInfo("Twitter receiver stopped") } private def setTwitterStream(newTwitterStream: TwitterStream) = synchronized { if (twitterStream != null) { twitterStream.shutdown() } twitterStream = newTwitterStream } }
Example 152
Source File: TwitterStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import org.scalatest.BeforeAndAfter import twitter4j.Status import twitter4j.auth.{NullAuthorization, Authorization} import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging { val batchDuration = Seconds(1) private val master: String = "local[2]" private val framework: String = this.getClass.getSimpleName test("twitter input stream") { val ssc = new StreamingContext(master, framework, batchDuration) val filters = Seq("filter1", "filter2") val authorization: Authorization = NullAuthorization.getInstance() // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters) val test3: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization)) val test5: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization), filters) val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream( ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2) // Note that actually testing the data receiving is hard as authentication keys are // necessary for accessing Twitter live stream ssc.stop() } }
Example 153
Source File: FlumeStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.net.{InetSocketAddress, ServerSocket} import java.nio.ByteBuffer import scala.collection.JavaConversions._ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.concurrent.duration._ import scala.language.postfixOps import com.google.common.base.Charsets import org.apache.avro.ipc.NettyTransceiver import org.apache.avro.ipc.specific.SpecificRequestor import org.apache.commons.lang3.RandomUtils import org.apache.flume.source.avro import org.apache.flume.source.avro.{AvroFlumeEvent, AvroSourceProtocol} import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{Logging, SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} import org.apache.spark.util.Utils class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null var transceiver: NettyTransceiver = null after { if (ssc != null) { ssc.stop() } if (transceiver != null) { transceiver.close() } } test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 154
Source File: ZeroMQStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.zeromq import akka.actor.SupervisorStrategy import akka.util.ByteString import akka.zeromq.Subscribe import org.apache.spark.SparkFunSuite import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class ZeroMQStreamSuite extends SparkFunSuite { val batchDuration = Seconds(1) private val master: String = "local[2]" private val framework: String = this.getClass.getSimpleName test("zeromq input stream") { val ssc = new StreamingContext(master, framework, batchDuration) val publishUrl = "abc" val subscribe = new Subscribe(null.asInstanceOf[ByteString]) val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]] // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[String] = ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects) val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream( ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2) val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream( ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy) // TODO: Actually test data receiving ssc.stop() } }
Example 155
Source File: MQTTInputDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt import org.eclipse.paho.client.mqttv3.IMqttDeliveryToken import org.eclipse.paho.client.mqttv3.MqttCallback import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.Receiver private[streaming] class MQTTInputDStream( @transient ssc_ : StreamingContext, brokerUrl: String, topic: String, storageLevel: StorageLevel ) extends ReceiverInputDStream[String](ssc_) { private[streaming] override def name: String = s"MQTT stream [$id]" def getReceiver(): Receiver[String] = { new MQTTReceiver(brokerUrl, topic, storageLevel) } } private[streaming] class MQTTReceiver( brokerUrl: String, topic: String, storageLevel: StorageLevel ) extends Receiver[String](storageLevel) { def onStop() { } def onStart() { // Set up persistence for messages val persistence = new MemoryPersistence() // Initializing Mqtt Client specifying brokerUrl, clientID and MqttClientPersistance val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), persistence) // Callback automatically triggers as and when new message arrives on specified topic val callback = new MqttCallback() { // Handles Mqtt message override def messageArrived(topic: String, message: MqttMessage) { store(new String(message.getPayload(), "utf-8")) } override def deliveryComplete(token: IMqttDeliveryToken) { } override def connectionLost(cause: Throwable) { restart("Connection lost ", cause) } } // Set up callback for MqttClient. This needs to happen before // connecting or subscribing, otherwise messages may be lost client.setCallback(callback) // Connect to MqttBroker client.connect() // Subscribe to Mqtt topic client.subscribe(topic) } }
Example 156
Source File: MQTTUtils.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {
  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
}
Example 157
Source File: KafkaStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() with mutable.SynchronizedMap[String, Long] stream.map(_._2).countByValue().foreachRDD { r => val ret = r.collect() ret.toMap.foreach { kv => val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(sent === result) } } }
Example 158
Source File: OTShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.joins.{BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.ComposeRDDFunctions._ import org.apache.spark.sql.hive.online._ import org.apache.spark.storage.{OLABlockId, StorageLevel} case class OTShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil def retrieveState(): RDD[HashedRelation] = prevBatch match { case Some(bId) => val numParts = controller.olaBlocks(opId, bId) OLABlockRDD.create[HashedRelation](sparkContext, opId.id, Array((numParts, bId)), numParts) case None => sys.error(s"Unexpected prevBatch = $prevBatch") } override def doExecute() = { prevBatch match { case None => val buildRdd = buildPlan.execute() controller.olaBlocks((opId, currentBatch)) = buildRdd.partitions.length buildRdd.zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) SparkEnv.get.blockManager.putSingle( OLABlockId(opId.id, currentBatch, index), hashed, StorageLevel.MEMORY_AND_DISK) hashJoin(streamIter, hashed) } case Some(_) => retrieveState().zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = buildIter.next() hashJoin(streamIter, hashed) } } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = OTShuffledHashJoin(leftKeys, rightKeys, buildSide, left, right)(controller, newTrace, opId) }
Example 159
Source File: BagelSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.bagel import org.scalatest.{BeforeAndAfter, Assertions} import org.scalatest.concurrent.Timeouts import org.scalatest.time.SpanSugar._ import org.apache.spark._ import org.apache.spark.storage.StorageLevel class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable class TestMessage(val targetId: String) extends Message[String] with Serializable class BagelSuite extends SparkFunSuite with Assertions with BeforeAndAfter with Timeouts { var sc: SparkContext = _ after { if (sc != null) { sc.stop() sc = null } } test("halting by voting") { sc = new SparkContext("local", "test") val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(true, 0)))) val msgs = sc.parallelize(Array[(String, TestMessage)]()) val numSupersteps = 5 val result = Bagel.run(sc, verts, msgs, sc.defaultParallelism) { (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) } for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) } } test("halting by message silence") { sc = new SparkContext("local", "test") val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(false, 0)))) val msgs = sc.parallelize(Array("a" -> new TestMessage("a"))) val numSupersteps = 5 val result = Bagel.run(sc, verts, msgs, sc.defaultParallelism) { (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => val msgsOut = msgs match { case Some(ms) if (superstep < numSupersteps - 1) => ms case _ => Array[TestMessage]() } (new TestVertex(self.active, self.age + 1), msgsOut) } for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) } } test("large number of iterations") { // This tests whether jobs with a large number of iterations finish in a reasonable time, // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang failAfter(30 seconds) { sc = new SparkContext("local", "test") val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) val msgs = sc.parallelize(Array[(String, TestMessage)]()) val numSupersteps = 50 val result = Bagel.run(sc, verts, msgs, sc.defaultParallelism) { (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) } for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) } } } test("using non-default persistence level") { failAfter(10 seconds) { sc = new SparkContext("local", "test") val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) val msgs = sc.parallelize(Array[(String, TestMessage)]()) val numSupersteps = 20 val result = Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) { (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) } for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) } } } }
Example 160
Source File: WindowedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.rdd.{PartitionerAwareUnionRDD, RDD, UnionRDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.Duration import scala.reflect.ClassTag private[streaming] class WindowedDStream[T: ClassTag]( parent: DStream[T], _windowDuration: Duration, _slideDuration: Duration) extends DStream[T](parent.ssc) { if (!_windowDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The window duration of windowed DStream (" + _windowDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } if (!_slideDuration.isMultipleOf(parent.slideDuration)) { throw new Exception("The slide duration of windowed DStream (" + _slideDuration + ") " + "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")") } // Persist parent level by default, as those RDDs are going to be obviously reused. parent.persist(StorageLevel.MEMORY_ONLY_SER) def windowDuration: Duration = _windowDuration override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = _slideDuration override def parentRememberDuration: Duration = rememberDuration + windowDuration override def persist(level: StorageLevel): DStream[T] = { // Do not let this windowed DStream be persisted as windowed (union-ed) RDDs share underlying // RDDs and persisting the windowed RDDs would store numerous copies of the underlying data. // Instead control the persistence of the parent DStream. parent.persist(level) this } override def compute(validTime: Time): Option[RDD[T]] = { val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime) val rddsInWindow = parent.slice(currentWindow) val windowRDD = if (rddsInWindow.flatMap(_.partitioner).distinct.length == 1) { logDebug("Using partition aware union for windowing at " + validTime) new PartitionerAwareUnionRDD(ssc.sc, rddsInWindow) } else { logDebug("Using normal union for windowing at " + validTime) new UnionRDD(ssc.sc, rddsInWindow) } Some(windowRDD) } }
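The persist override above deliberately redirects persistence to the parent DStream, because each windowed RDD is just a union over the parent's RDDs and persisting the windows themselves would duplicate the underlying data. A minimal hedged sketch of what this means for callers follows; lines is an assumed source DStream[String], not part of the original code.

// Sketch: persisting a windowed stream; per the override above, the requested
// storage level is applied to the parent (reduced) DStream, not the window.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds

val windowedCounts = lines                 // assumed DStream[String] from some receiver
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
  .window(Seconds(30), Seconds(10))

windowedCounts.persist(StorageLevel.MEMORY_ONLY_SER)  // effectively persists the parent's RDDs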
Example 161
Source File: SocketInputDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.util.control.NonFatal import org.apache.spark.streaming.StreamingContext import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import scala.reflect.ClassTag import java.io._ import java.net.{UnknownHostException, Socket} import org.apache.spark.Logging import org.apache.spark.streaming.receiver.Receiver private[streaming] class SocketInputDStream[T: ClassTag]( @transient ssc_ : StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](ssc_) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { def onStart() { // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // There is nothing much to do as the thread calling receive() // is designed to stop by itself isStopped() returns false } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
Example 162
Source File: BlockTransferService.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 163
Source File: NettyBlockRpcServer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 164
Source File: SparkContextInfoSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark import org.scalatest.Assertions import org.apache.spark.storage.StorageLevel class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { test("getPersistentRDDs only returns RDDs that are marked as cached") { sc = new SparkContext("local", "test") assert(sc.getPersistentRDDs.isEmpty === true) val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(sc.getPersistentRDDs.isEmpty === true) rdd.cache() assert(sc.getPersistentRDDs.size === 1) assert(sc.getPersistentRDDs.values.head === rdd) } test("getPersistentRDDs returns an immutable map") { sc = new SparkContext("local", "test") val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() val myRdds = sc.getPersistentRDDs assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) // myRdds2 should have 2 RDDs, but myRdds should not change val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache() val myRdds2 = sc.getPersistentRDDs assert(myRdds2.size === 2) assert(myRdds2(0) === rdd1) assert(myRdds2(1) === rdd2) assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY) assert(myRdds.size === 1) assert(myRdds(0) === rdd1) assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY) } test("getRDDStorageInfo only reports on RDDs that actually persist data") { sc = new SparkContext("local", "test") val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.size === 0) rdd.collect() assert(sc.getRDDStorageInfo.size === 1) assert(sc.getRDDStorageInfo.head.isCached) assert(sc.getRDDStorageInfo.head.memSize > 0) assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY) } test("call sites report correct locations") { sc = new SparkContext("local", "test") testPackage.runCallSiteTest(sc) } } package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => { assert(func === "makeRDD") assert(file === "SparkContextInfoSuite.scala") line.toInt } case _ => fail("Did not match expected call site format") } curCallSite match { case CALL_SITE_REGEX(func, file, line) => { assert(func === "getCallSite") // this is correct because we called it from outside of Spark assert(file === "SparkContextInfoSuite.scala") assert(line.toInt === rddCreationLine.toInt + 2) } case _ => fail("Did not match expected call site format") } } }
Example 165
Source File: StorageLevelTests.scala From frameless with Apache License 2.0 | 5 votes |
package frameless

import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel._
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}

class StorageLevelTests extends TypedDatasetSuite {
  val storageLevelGen: Gen[StorageLevel] = Gen.oneOf(Seq(NONE, DISK_ONLY, DISK_ONLY_2, MEMORY_ONLY,
    MEMORY_ONLY_2, MEMORY_ONLY_SER, MEMORY_ONLY_SER_2, MEMORY_AND_DISK,
    MEMORY_AND_DISK_2, MEMORY_AND_DISK_SER, MEMORY_AND_DISK_SER_2, OFF_HEAP))

  test("storageLevel") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], storageLevelGen) {
      (data: Vector[A], storageLevel: StorageLevel) =>
        val dataset = TypedDataset.create(data)
        if (storageLevel != StorageLevel.NONE)
          dataset.persist(storageLevel)

        dataset.count().run()

        dataset.storageLevel() ?= dataset.dataset.storageLevel
    }

    check(prop[Int])
    check(prop[String])
  }
}
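The property test above persists a TypedDataset with each predefined level and checks that storageLevel() reflects what was requested. A minimal sketch of the same round trip outside a test is shown below, reusing only the calls that appear in the test (TypedDataset.create, persist, count().run(), and the underlying dataset's storageLevel); the implicit SparkSession and the sample data are assumptions.

// Sketch, assuming an implicit SparkSession (as required by frameless) is in scope.
import frameless.TypedDataset
import org.apache.spark.storage.StorageLevel

val ds = TypedDataset.create(Seq(1, 2, 3))   // assumed sample data
ds.persist(StorageLevel.MEMORY_AND_DISK)
ds.count().run()                             // force evaluation, as the test does
println(ds.dataset.storageLevel)             // the underlying Dataset now reports MEMORY_AND_DISK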
Example 166
Source File: SparkTachyonHdfsLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel

object SparkTachyonHdfsLR {
  val D = 10 // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (SGD, stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (L-BFGS, a quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {
    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println
Example 167
Source File: SparkTachyonPi.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import scala.math.random

import org.apache.spark._
import org.apache.spark.storage.StorageLevel

object SparkTachyonPi {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkTachyonPi")
    val spark = new SparkContext(sparkConf)

    val slices = if (args.length > 0) args(0).toInt else 2
    val n = 100000 * slices

    val rdd = spark.parallelize(1 to n, slices)
    rdd.persist(StorageLevel.OFF_HEAP)
    val count = rdd.map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)

    spark.stop()
  }
}
// scalastyle:on println
Example 168
Source File: CustomReceiver.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{InputStreamReader, BufferedReader, InputStream}
import java.net.Socket

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver
import java.io.File
import java.io.FileInputStream

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port) // connect to the host
      logInfo("Connected to " + host + ":" + port)
      // get the input stream of the network connection
      println("isConnected:" + socket.isConnected())
      val socketInput = socket.getInputStream()
      // //val inputFile=new File("../data/mllib/als/testCustomReceiver.data")
      // val in = new FileInputStream(inputFile)
      // val in = new FileInputStream(socketInput)
      val reader = new BufferedReader(new InputStreamReader(socketInput, "UTF-8"))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput) // store the data
        userInput = reader.readLine() // read the next line
        println("userInput:" + userInput)
      }
      reader.close() // close the reader
      socket.close() // close the connection
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 169
Source File: ImageInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import java.io.InputStream import java.net.Socket import org.apache.hadoop.io.BytesWritable import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging class ImageInputDStream(@transient ssc_ : StreamingContext, host: String, port: Int, storageLevel: StorageLevel) extends ReceiverInputDStream[BytesWritable](ssc_) with Logging { override def getReceiver(): Receiver[BytesWritable] = { new ImageRecevier(host, port, storageLevel) } } class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel) extends Receiver[BytesWritable](storageLevel) with Logging { override def onStart(): Unit = { new Thread("Image Socket") { setDaemon(true) override def run(): Unit = { receive() } }.start() } def receive(): Unit = { var socket: Socket = null var in: InputStream = null try { log.info("Connecting to " + host + ":" + port) socket = new Socket(host, port) log.info("Connected to " + host + ":" + port) in = socket.getInputStream val buf = new ArrayBuffer[Byte]() var bytes = new Array[Byte](1024) var len = 0 while (-1 < len) { len = in.read(bytes) if (len > 0) { buf ++= bytes } } val bw = new BytesWritable(buf.toArray) log.error("byte:::::" + bw.getLength) store(bw) log.info("Stopped receiving") restart("Retrying connecting to " + host + ":" + port) } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } finally { if (in != null) { in.close() } if (socket != null) { socket.close() log.info("Closed socket to " + host + ":" + port) } } } override def onStop(): Unit = { } }
Example 170
Source File: RawNetworkGrep.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.util.IntParam

object RawNetworkGrep {
  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>")
      System.exit(1)
    }
    StreamingExamples.setStreamingLogLevels()

    val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args
    val sparkConf = new SparkConf().setAppName("RawNetworkGrep")
    // Create the context with the given batch interval
    val ssc = new StreamingContext(sparkConf, Duration(batchMillis))

    val rawStreams = (1 to numStreams).map(_ =>
      ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray
    val union = ssc.union(rawStreams)
    union.filter(_.contains("the")).count().foreachRDD(r =>
      println("Grep count: " + r.collect().mkString))
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 171
Source File: SqlNetworkWordCount.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Time, Seconds, StreamingContext}
import org.apache.spark.util.IntParam
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
// scalastyle:on println
Example 172
Source File: GraphLoader.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile
}
Example 173
Source File: EdgeRDDImpl.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 174
Source File: EdgeRDDSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel

class EdgeRDDSuite extends SparkFunSuite with LocalSparkContext {

  test("cache, getStorageLevel") {
    // test to see if getStorageLevel returns the correct value after caching
    withSpark { sc =>
      val verts = sc.parallelize(List((0L, 0), (1L, 1), (1L, 2), (2L, 3), (2L, 3), (2L, 3)))
      // build the graph from the edges
      val edges = EdgeRDD.fromEdges(sc.parallelize(List.empty[Edge[Int]]))
      assert(edges.getStorageLevel == StorageLevel.NONE)
      edges.cache()
      assert(edges.getStorageLevel == StorageLevel.MEMORY_ONLY)
    }
  }
}
Example 175
Source File: TwitterInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter

import twitter4j._
import twitter4j.auth.Authorization
import twitter4j.conf.ConfigurationBuilder
import twitter4j.auth.OAuthAuthorization

import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class TwitterInputDStream(
    @transient ssc_ : StreamingContext,
    twitterAuth: Option[Authorization],
    filters: Seq[String],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[Status](ssc_) {

  private def createOAuthAuthorization(): Authorization = {
    new OAuthAuthorization(new ConfigurationBuilder().build())
  }

  private val authorization = twitterAuth.getOrElse(createOAuthAuthorization())

  override def getReceiver(): Receiver[Status] = {
    new TwitterReceiver(authorization, filters, storageLevel)
  }
}

private[streaming]
class TwitterReceiver(
    twitterAuth: Authorization,
    filters: Seq[String],
    storageLevel: StorageLevel
  ) extends Receiver[Status](storageLevel) with Logging {

  @volatile private var twitterStream: TwitterStream = _
  @volatile private var stopped = false

  def onStart() {
    try {
      val newTwitterStream = new TwitterStreamFactory().getInstance(twitterAuth)
      newTwitterStream.addListener(new StatusListener {
        def onStatus(status: Status): Unit = {
          store(status)
        }
        // Unimplemented
        def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) {}
        def onTrackLimitationNotice(i: Int) {}
        def onScrubGeo(l: Long, l1: Long) {}
        def onStallWarning(stallWarning: StallWarning) {}
        def onException(e: Exception) {
          if (!stopped) {
            restart("Error receiving tweets", e)
          }
        }
      })

      val query = new FilterQuery
      if (filters.size > 0) {
        query.track(filters.toArray)
        newTwitterStream.filter(query)
      } else {
        newTwitterStream.sample()
      }
      setTwitterStream(newTwitterStream)
      logInfo("Twitter receiver started")
      stopped = false
    } catch {
      case e: Exception => restart("Error starting Twitter stream", e)
    }
  }

  def onStop() {
    stopped = true
    setTwitterStream(null)
    logInfo("Twitter receiver stopped")
  }

  private def setTwitterStream(newTwitterStream: TwitterStream) = synchronized {
    if (twitterStream != null) {
      twitterStream.shutdown()
    }
    twitterStream = newTwitterStream
  }
}
Example 176
Source File: TwitterStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter

import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
}
Example 177
Source File: FlumePollingStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import java.net.InetSocketAddress

import scala.collection.JavaConversions._
import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
import org.apache.spark.util.{ManualClock, Utils}

  private def testMultipleTimes(test: () => Unit): Unit = {
    var testPassed = false
    var attempt = 0
    while (!testPassed && attempt < maxAttempts) {
      try {
        test()
        testPassed = true
      } catch {
        case e: Exception if Utils.isBindCollision(e) =>
          logWarning("Exception when running flume polling test: " + e)
          attempt += 1
      }
    }
    assert(testPassed, s"Test failed after $attempt attempts!")
  }

  private def testFlumePolling(): Unit = {
    try {
      val port = utils.startSingleSink()
      writeAndVerify(Seq(port))
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  private def testFlumePollingMultipleHost(): Unit = {
    try {
      val ports = utils.startMultipleSinks()
      writeAndVerify(ports)
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
    // Set up the streaming context and input streams
    val ssc = new StreamingContext(conf, batchDuration)
    val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
        utils.eventsPerBatch, 5)
    val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]]
      with SynchronizedBuffer[Seq[SparkFlumeEvent]]
    val outputStream = new TestOutputStream(flumeStream, outputBuffer)
    outputStream.register()

    ssc.start()
    try {
      utils.sendDatAndEnsureAllDataHasBeenReceived()
      val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
      clock.advance(batchDuration.milliseconds)

      // The eventually is required to ensure that all data in the batch has been processed.
      eventually(timeout(10 seconds), interval(100 milliseconds)) {
        val flattenOutputBuffer = outputBuffer.flatten
        val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
          case kv => (kv._1.toString, kv._2.toString)
        }).map(mapAsJavaMap)
        val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8))
        utils.assertOutput(headers, bodies)
      }
    } finally {
      ssc.stop()
    }
  }

}
Example 178
Source File: FlumeStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import scala.collection.JavaConversions._
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets
import org.jboss.netty.channel.ChannelPipeline
import org.jboss.netty.channel.socket.SocketChannel
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
}
Example 179
Source File: ZeroMQStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects,
      StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
}
Example 180
Source File: MQTTUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object MQTTUtils {

private[mqtt] class MQTTUtilsPythonHelper {

  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaDStream[String] = {
    MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel)
  }
}
Example 181
Source File: MQTTStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt

import scala.concurrent.duration._
import scala.language.postfixOps

import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter {

  private val batchDuration = Milliseconds(500)
  private val master = "local[2]"
  private val framework = this.getClass.getSimpleName
  private val topic = "def"
  private var ssc: StreamingContext = _
  private var mqttTestUtils: MQTTTestUtils = _

  before {
    ssc = new StreamingContext(master, framework, batchDuration)
    mqttTestUtils = new MQTTTestUtils
    mqttTestUtils.setup()
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
    if (mqttTestUtils != null) {
      mqttTestUtils.teardown()
      mqttTestUtils = null
    }
  }

  test("mqtt input stream") {
    val sendMessage = "MQTT demo for spark streaming"
    val receiveStream = MQTTUtils.createStream(ssc, "tcp://" + mqttTestUtils.brokerUri, topic,
      StorageLevel.MEMORY_ONLY)

    @volatile var receiveMessage: List[String] = List()
    receiveStream.foreachRDD { rdd =>
      if (rdd.collect.length > 0) {
        receiveMessage = receiveMessage ::: List(rdd.first)
        receiveMessage
      }
    }

    ssc.start()

    // Retry it because we don't know when the receiver will start.
    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      mqttTestUtils.publishData(topic, sendMessage)
      assert(sendMessage.equals(receiveMessage(0)))
    }
    ssc.stop()
  }
}
Example 182
Source File: KafkaStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))
    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)
    val result = new mutable.HashMap[String, Long]() with mutable.SynchronizedMap[String, Long]
    stream.map(_._2).countByValue().foreachRDD { r =>
      val ret = r.collect()
      ret.toMap.foreach { kv =>
        val count = result.getOrElseUpdate(kv._1, 0) + kv._2
        result.put(kv._1, count)
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(sent === result)
    }
  }
}
Example 183
Source File: BagelSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.bagel

import org.scalatest.{BeforeAndAfter, Assertions}
import org.scalatest.concurrent.Timeouts
import org.scalatest.time.SpanSugar._

import org.apache.spark._
import org.apache.spark.storage.StorageLevel

class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable
class TestMessage(val targetId: String) extends Message[String] with Serializable

class BagelSuite extends SparkFunSuite with Assertions with BeforeAndAfter with Timeouts {

  var sc: SparkContext = _

  after {
    if (sc != null) {
      sc.stop()
      sc = null
    }
  }

  test("halting by voting") {
    sc = new SparkContext("local", "test")
    val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(true, 0))))
    val msgs = sc.parallelize(Array[(String, TestMessage)]())
    val numSupersteps = 5
    val result =
      Bagel.run(sc, verts, msgs, sc.defaultParallelism) {
        (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>
          (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]())
      }
    for ((id, vert) <- result.collect) {
      assert(vert.age === numSupersteps)
    }
  }

  test("halting by message silence") {
    sc = new SparkContext("local", "test")
    val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(false, 0))))
    val msgs = sc.parallelize(Array("a" -> new TestMessage("a")))
    val numSupersteps = 5
    val result =
      Bagel.run(sc, verts, msgs, sc.defaultParallelism) {
        (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>
          val msgsOut =
            msgs match {
              case Some(ms) if (superstep < numSupersteps - 1) => ms
              case _ => Array[TestMessage]()
            }
          (new TestVertex(self.active, self.age + 1), msgsOut)
      }
    for ((id, vert) <- result.collect) {
      assert(vert.age === numSupersteps)
    }
  }

  test("large number of iterations") {
    // This tests whether jobs with a large number of iterations finish in a reasonable time,
    // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang
    failAfter(30 seconds) {
      sc = new SparkContext("local", "test")
      val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
      val msgs = sc.parallelize(Array[(String, TestMessage)]())
      val numSupersteps = 50
      val result =
        Bagel.run(sc, verts, msgs, sc.defaultParallelism) {
          (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>
            (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]())
        }
      for ((id, vert) <- result.collect) {
        assert(vert.age === numSupersteps)
      }
    }
  }

  test("using non-default persistence level") {
    failAfter(10 seconds) {
      sc = new SparkContext("local", "test")
      val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
      val msgs = sc.parallelize(Array[(String, TestMessage)]())
      val numSupersteps = 20
      val result =
        Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) {
          (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>
            (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]())
        }
      for ((id, vert) <- result.collect) {
        assert(vert.age === numSupersteps)
      }
    }
  }
}
Example 184
Source File: KinesisInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream(
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[Array[Byte]](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[Array[Byte]] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption)
  }
}
Example 185
Source File: WindowedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.{PartitionerAwareUnionRDD, RDD, UnionRDD}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.Duration

import scala.reflect.ClassTag

private[streaming]
class WindowedDStream[T: ClassTag](
    parent: DStream[T],
    _windowDuration: Duration,
    _slideDuration: Duration)
  extends DStream[T](parent.ssc) {

  if (!_windowDuration.isMultipleOf(parent.slideDuration)) {
    throw new Exception("The window duration of windowed DStream (" + _windowDuration + ") " +
      "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")")
  }

  if (!_slideDuration.isMultipleOf(parent.slideDuration)) {
    throw new Exception("The slide duration of windowed DStream (" + _slideDuration + ") " +
      "must be a multiple of the slide duration of parent DStream (" + parent.slideDuration + ")")
  }

  // Persist the parent by default, as those RDDs are obviously going to be reused.
  parent.persist(StorageLevel.MEMORY_ONLY_SER)

  def windowDuration: Duration = _windowDuration

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = _slideDuration

  override def parentRememberDuration: Duration = rememberDuration + windowDuration

  override def persist(level: StorageLevel): DStream[T] = {
    // Do not let this windowed DStream be persisted as windowed (union-ed) RDDs share underlying
    // RDDs and persisting the windowed RDDs would store numerous copies of the underlying data.
    // Instead control the persistence of the parent DStream.
    parent.persist(level)
    this
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime)
    val rddsInWindow = parent.slice(currentWindow)
    val windowRDD = if (rddsInWindow.flatMap(_.partitioner).distinct.length == 1) {
      logDebug("Using partition aware union for windowing at " + validTime)
      new PartitionerAwareUnionRDD(ssc.sc, rddsInWindow)
    } else {
      logDebug("Using normal union for windowing at " + validTime)
      new UnionRDD(ssc.sc, rddsInWindow)
    }
    Some(windowRDD)
  }
}
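Because persist on a windowed DStream is forwarded to its parent, a caller-side sketch might look like the following; WindowPersistSketch, the host and the port are illustrative assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WindowPersistSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("WindowPersistSketch"), Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    // persist() on the windowed stream is delegated to the parent, so the underlying
    // RDDs are stored once rather than once per overlapping window.
    val windowed = lines.window(Seconds(30), Seconds(10)).persist(StorageLevel.MEMORY_ONLY_SER)
    windowed.count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}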
Example 186
Source File: SocketInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator

import scala.reflect.ClassTag

import java.io._
import java.net.{UnknownHostException, Socket}

import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    @transient ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
Example 187
Source File: NettyBlockRpcServer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConversions._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      messageBytes: Array[Byte],
      responseContext: RpcResponseCallback): Unit = {
    // Decode the incoming message
    val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes)
    logTrace(s"Received request: $message")

    message match {
      // Serves requests to download block files
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          // blockIds holds the BlockIds; fetch the data for each block
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(blocks.iterator)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)

      // Serves RPC requests to upload a block file
      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata))
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        // Store the block locally using the given storage level
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(new Array[Byte](0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 188
Source File: LocalRDDCheckpointData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext}
import org.apache.spark.storage.{RDDBlockId, StorageLevel}
import org.apache.spark.util.Utils

  def transformStorageLevel(level: StorageLevel): StorageLevel = {
    // If this RDD is to be cached off-heap, fail fast since we cannot provide any
    // correctness guarantees about subsequent computations after the first one
    if (level.useOffHeap) {
      throw new SparkException("Local checkpointing is not compatible with off-heap caching.")
    }

    StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication)
  }
}
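The disk-enabling transformation above is the mechanism behind RDD.localCheckpoint adjusting an existing memory-only level; a rough usage sketch (LocalCheckpointSketch is a made-up driver) could look like this.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object LocalCheckpointSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("LocalCheckpointSketch"))
    val rdd = sc.parallelize(1 to 1000, 4)
      .persist(StorageLevel.MEMORY_ONLY)
      .localCheckpoint() // the effective level now also uses disk, per transformStorageLevel above
    rdd.count()
    println(rdd.getStorageLevel) // the disk flag is set alongside memory
    sc.stop()
  }
}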
Example 189
Source File: SparkContextInfoSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.scalatest.Assertions

import org.apache.spark.storage.StorageLevel

class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext {

  test("getPersistentRDDs only returns RDDs that are marked as cached") {
    sc = new SparkContext("local", "test")
    assert(sc.getPersistentRDDs.isEmpty === true)

    val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2)
    // the set of persistent RDDs is still empty before caching
    assert(sc.getPersistentRDDs.isEmpty === true)

    rdd.cache() // persist the RDD in memory
    assert(sc.getPersistentRDDs.size === 1)
    // the first (and only) entry is the RDD we just cached
    assert(sc.getPersistentRDDs.values.head === rdd)
  }

  test("getPersistentRDDs returns an immutable map") {
    sc = new SparkContext("local", "test")
    val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).cache()

    val myRdds = sc.getPersistentRDDs
    // only the RDD marked as persistent is returned
    assert(myRdds.size === 1)
    assert(myRdds(0) === rdd1)
    // cache() uses the default storage level, which is memory only
    assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY)

    // myRdds2 should have 2 RDDs, but myRdds should not change
    val rdd2 = sc.makeRDD(Array(5, 6, 7, 8), 1).cache()
    val myRdds2 = sc.getPersistentRDDs
    assert(myRdds2.size === 2)
    assert(myRdds2(0) === rdd1)
    assert(myRdds2(1) === rdd2)
    assert(myRdds2(0).getStorageLevel === StorageLevel.MEMORY_ONLY)
    assert(myRdds2(1).getStorageLevel === StorageLevel.MEMORY_ONLY)
    assert(myRdds.size === 1)
    assert(myRdds(0) === rdd1)
    assert(myRdds(0).getStorageLevel === StorageLevel.MEMORY_ONLY)
  }

  test("getRDDStorageInfo only reports on RDDs that actually persist data") {
    sc = new SparkContext("local", "test")
    val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache()
    assert(sc.getRDDStorageInfo.size === 0)
    rdd.collect()
    assert(sc.getRDDStorageInfo.size === 1) // one RDDInfo entry is reported
    assert(sc.getRDDStorageInfo.head.isCached) // and it is marked as cached
    assert(sc.getRDDStorageInfo.head.memSize > 0) // with a non-zero in-memory size
    assert(sc.getRDDStorageInfo.head.storageLevel === StorageLevel.MEMORY_ONLY)
  }

  test("call sites report correct locations") {
    sc = new SparkContext("local", "test")
    testPackage.runCallSiteTest(sc)
  }
}

package object testPackage extends Assertions {
  private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r

  def runCallSiteTest(sc: SparkContext) {
    val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val rddCreationSite = rdd.getCreationSite
    println("====" + rddCreationSite)
    val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd"

    val rddCreationLine = rddCreationSite match {
      case CALL_SITE_REGEX(func, file, line) => {
        assert(func === "makeRDD")
        assert(file === "SparkContextInfoSuite.scala")
        line.toInt
      }
      case _ => fail("Did not match expected call site format")
    }

    curCallSite match {
      case CALL_SITE_REGEX(func, file, line) => {
        assert(func === "getCallSite") // this is correct because we called it from outside of Spark
        assert(file === "SparkContextInfoSuite.scala")
        println("==line===" + line.toInt)
        // assert(line.toInt === rddCreationLine.toInt + 2)
      }
      case _ => fail("Did not match expected call site format")
    }
  }
}
Example 190
Source File: MeetupReceiver.scala From meetup-stream with Apache License 2.0 | 5 votes |
package receiver

import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.storage.StorageLevel
import org.apache.spark.Logging
import com.ning.http.client.AsyncHttpClientConfig
import com.ning.http.client._
import scala.collection.mutable.ArrayBuffer
import java.io.OutputStream
import java.io.ByteArrayInputStream
import java.io.InputStreamReader
import java.io.BufferedReader
import java.io.InputStream
import java.io.PipedInputStream
import java.io.PipedOutputStream

class MeetupReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  @transient var client: AsyncHttpClient = _
  @transient var inputPipe: PipedInputStream = _
  @transient var outputPipe: PipedOutputStream = _

  def onStart() {
    val cf = new AsyncHttpClientConfig.Builder()
    cf.setRequestTimeout(Integer.MAX_VALUE)
    cf.setReadTimeout(Integer.MAX_VALUE)
    cf.setPooledConnectionIdleTimeout(Integer.MAX_VALUE)
    client = new AsyncHttpClient(cf.build())
    inputPipe = new PipedInputStream(1024 * 1024)
    outputPipe = new PipedOutputStream(inputPipe)
    val producerThread = new Thread(new DataConsumer(inputPipe))
    producerThread.start()

    client.prepareGet(url).execute(new AsyncHandler[Unit] {

      def onBodyPartReceived(bodyPart: HttpResponseBodyPart) = {
        bodyPart.writeTo(outputPipe)
        AsyncHandler.STATE.CONTINUE
      }

      def onStatusReceived(status: HttpResponseStatus) = {
        AsyncHandler.STATE.CONTINUE
      }

      def onHeadersReceived(headers: HttpResponseHeaders) = {
        AsyncHandler.STATE.CONTINUE
      }

      def onCompleted = {
        println("completed")
      }

      def onThrowable(t: Throwable) = {
        t.printStackTrace()
      }

    })
  }

  def onStop() {
    if (Option(client).isDefined) client.close()
    if (Option(outputPipe).isDefined) {
      outputPipe.flush()
      outputPipe.close()
    }
    if (Option(inputPipe).isDefined) {
      inputPipe.close()
    }
  }

  class DataConsumer(inputStream: InputStream) extends Runnable {

    override def run() {
      val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))
      var input = bufferedReader.readLine()
      while (input != null) {
        store(input)
        input = bufferedReader.readLine()
      }
    }

  }

}
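A plausible way to wire this receiver into a streaming application is sketched below; MeetupReceiverSketch and the RSVP endpoint URL are assumptions for illustration, not part of the original project.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import receiver.MeetupReceiver

object MeetupReceiverSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("MeetupReceiverSketch"), Seconds(2))
    // The receiver itself fixes its storage level to MEMORY_AND_DISK_2 in its constructor.
    val rsvps = ssc.receiverStream(new MeetupReceiver("http://stream.meetup.com/2/rsvps"))
    rsvps.count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}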
Example 191
Source File: IsotonicRegression.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.regression.IsotonicRegressionModel
import org.apache.spark.ml.util._
import org.apache.spark.mllib.odkl.{IsotonicRegression => MLlibIsotonicRegression}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.storage.StorageLevel

@Since("1.5.0")
@Experimental
class IsotonicRegression @Since("1.5.0")(@Since("1.5.0") override val uid: String)
  extends org.apache.spark.ml.regression.IsotonicRegression(uid) {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("isoReg"))

  @Since("1.5.0")
  override def fit(dataset: Dataset[_]): IsotonicRegressionModel = {
    validateAndTransformSchema(dataset.schema, fitting = true)
    // Extract columns from data. If dataset is persisted, do not persist oldDataset.
    val instances = extractWeightedLabeledPoints(dataset)
    val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
    if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)

    val isotonicRegression = new MLlibIsotonicRegression().setIsotonic($(isotonic))
    val oldModel = isotonicRegression.run(instances)

    copyValues(new IsotonicRegressionModel(uid, oldModel).setParent(this))
  }
}

@Since("1.6.0")
object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] {

  @Since("1.6.0")
  override def load(path: String): IsotonicRegression = super.load(path)
}
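The handlePersistence pattern used in fit above can be factored into a small reusable helper; CachingHelpers.withTemporaryCache below is a hypothetical sketch of that idea, not an existing API.

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object CachingHelpers {
  // Persist the input only if the caller has not already done so, and
  // unpersist it again once the work is finished.
  def withTemporaryCache[T, R](input: RDD[T],
                               level: StorageLevel = StorageLevel.MEMORY_AND_DISK)
                              (body: RDD[T] => R): R = {
    val handlePersistence = input.getStorageLevel == StorageLevel.NONE
    if (handlePersistence) input.persist(level)
    try body(input)
    finally if (handlePersistence) input.unpersist(blocking = false)
  }
}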
Example 192
Source File: GraphProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.Types
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession

import scala.reflect.ClassTag

object GraphProviders {
  val defaultStorageLevel = StorageLevel.MEMORY_ONLY

  def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD] = None,
                                                     vertexProvider: Row => Seq[(VertexId, VD)],
                                                     edgeProvider: Row => Seq[Edge[ED]],
                                                     edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                     vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                    (dataFrame: DataFrame): Graph[VD, ED] = {

    def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = {
      dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => {
        rowIterator.flatMap { case row => mappingFunction(row) }
      })
    }

    val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider)
    val edges: RDD[Edge[ED]] = mapRows(edgeProvider)
    defaultVertex match {
      case None => Graph(vertices, edges,
        edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel)
      case Some(defaultVertexValue) => Graph(vertices, edges, defaultVertexValue,
        edgeStorageLevel, vertexStorageLevel)
    }
  }

  def indexedGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD] = None,
                                                      vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)],
                                                      edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]],
                                                      columnsToIndex: Seq[Int],
                                                      edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                      vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                     (dataFrame: DataFrame): Graph[VD, ED] = {
    val index = dataFrame.rdd
      .flatMap(row => columnsToIndex.map(row(_)))
      .distinct().zipWithUniqueId().collect().toMap

    def extractIdFromIndex(vertex: VD) = index(vertex)

    simpleGraphBuilder(defaultVertex,
      vertexProvider(_: Row, extractIdFromIndex _),
      edgeProvider(_: Row, extractIdFromIndex _),
      edgeStorageLevel,
      vertexStorageLevel)(dataFrame)
  }
}
Example 193
Source File: Analysis.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.metrics.Metric
import org.apache.spark.sql.DataFrame
import org.apache.spark.storage.StorageLevel

  @deprecated("Use the AnalysisRunner instead (the onData method there)", "24-09-2019")
  def run(
      data: DataFrame,
      aggregateWith: Option[StateLoader] = None,
      saveStatesWith: Option[StatePersister] = None,
      storageLevelOfGroupedDataForMultiplePasses: StorageLevel = StorageLevel.MEMORY_AND_DISK)
    : AnalyzerContext = {

    AnalysisRunner.doAnalysisRun(data, analyzers, aggregateWith = aggregateWith,
      saveStatesWith = saveStatesWith)
  }
}
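A sketch of the migration suggested by the deprecation note might look as follows; it assumes the standard deequ builder API (onData, addAnalyzer, run) and uses the Size analyzer purely as an example.

import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import org.apache.spark.sql.DataFrame

object AnalysisRunnerSketch {
  // df is assumed to be any DataFrame already available in the application.
  def sizeMetric(df: DataFrame): AnalyzerContext = {
    AnalysisRunner
      .onData(df)
      .addAnalyzer(Size())
      .run()
  }
}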
Example 194
Source File: CustomReceiver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo(s"Connecting to $host : $port")
      socket = new Socket(host, port)
      logInfo(s"Connected to $host : $port")
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while(!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart(s"Error connecting to $host : $port", e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 195
Source File: RawNetworkGrep.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.util.IntParam

object RawNetworkGrep {
  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println("Usage: RawNetworkGrep <numStreams> <host> <port> <batchMillis>")
      System.exit(1)
    }
    StreamingExamples.setStreamingLogLevels()

    val Array(IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args
    val sparkConf = new SparkConf().setAppName("RawNetworkGrep")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Duration(batchMillis))

    val rawStreams = (1 to numStreams).map(_ =>
      ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray
    val union = ssc.union(rawStreams)
    union.filter(_.contains("the")).count().foreachRDD(r =>
      println(s"Grep count: ${r.collect().mkString}"))
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 196
Source File: SqlNetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
Example 197
Source File: NetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
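For a clustered deployment, the comment above suggests a replicated level; a hypothetical variant of the same pipeline could read as follows (ReplicatedNetworkWordCount is an illustrative name, not an existing example).

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ReplicatedNetworkWordCount {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("ReplicatedNetworkWordCount"), Seconds(1))
    // Same pipeline, but with a 2x-replicated serialized level for fault tolerance on a cluster.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER_2)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}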
Example 198
Source File: GraphLoader.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkContext
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile
}
Example 199
Source File: PeriodicGraphCheckpointer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.util

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.PeriodicCheckpointer

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.cache()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.cache()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
}
Example 200
Source File: EdgeRDDImpl.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

}