org.apache.spark.Logging Scala Examples
The following examples show how to use org.apache.spark.Logging. Each example notes its source file, the project it was taken from, and that project's license.
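All of the examples below share the same basic pattern: mix the org.apache.spark.Logging trait into a class or object and call the log methods it provides (logTrace, logDebug, logInfo, logWarning, logError). The trait is part of the public API in Spark 1.x, which is the era these examples come from; in later releases it was moved out of the public API. Here is a minimal sketch of that pattern — the class name and log messages are invented for illustration:

import org.apache.spark.Logging

// Minimal usage sketch (hypothetical class): mix in the trait and log at different levels.
class WordCounter extends Logging {
  def count(words: Seq[String]): Int = {
    logInfo(s"Counting ${words.length} words")
    val nonEmpty = words.count(_.nonEmpty)
    if (nonEmpty == 0) {
      logWarning("No non-empty words found")
    }
    nonEmpty
  }
}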
Example 1
Source File: CatalogSuite.scala From shc with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.Logging
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

class CatalogSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging {
  def catalog = s"""{
    |"table":{"namespace":"default", "name":"table1"},
    |"rowkey":"key1:key2",
    |"columns":{
    |"col00":{"cf":"rowkey", "col":"key1", "type":"string", "length":"6"},
    |"col01":{"cf":"rowkey", "col":"key2", "type":"int"},
    |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
    |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
    |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
    |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
    |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
    |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
    |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
    |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
    |}
    |}""".stripMargin

  test("Catalog meta data check") {
    val m = HBaseTableCatalog(Map(HBaseTableCatalog.tableCatalog -> catalog))
    assert(m.row.varLength == false)
    assert(m.row.length == 10)
  }
}
Example 2
Source File: DataTypeConverter.scala From shc with Apache License 2.0
package org.apache.spark.sql import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog import org.apache.spark.{SparkContext, Logging} class DataTypeConverter extends SHC with Logging{ ignore("Basic setup") { val sc = new SparkContext("local", "HBaseTest", conf) val sqlContext = new SQLContext(sc) val complex = s"""MAP<int, struct<varchar:string>>""" val schema = s"""{"namespace": "example.avro", | "type": "record", "name": "User", | "fields": [ {"name": "name", "type": "string"}, | {"name": "favorite_number", "type": ["int", "null"]}, | {"name": "favorite_color", "type": ["string", "null"]} ] }""".stripMargin val catalog = s"""{ |"table":{"namespace":"default", "name":"htable"}, |"rowkey":"key1:key2", |"columns":{ |"col1":{"cf":"rowkey", "col":"key1", "type":"binary"}, |"col2":{"cf":"rowkey", "col":"key2", "type":"double"}, |"col3":{"cf":"cf1", "col":"col1", "avro":"schema1"}, |"col4":{"cf":"cf1", "col":"col2", "type":"string"}, |"col5":{"cf":"cf1", "col":"col3", "type":"double", |"sedes":"org.apache.spark.sql.execution.datasources.hbase.DoubleSedes"}, |"col6":{"cf":"cf1", "col":"col4", "type":"$complex"} |} |}""".stripMargin val df = sqlContext.read.options( Map("schema1"->schema, HBaseTableCatalog.tableCatalog->catalog)) .format("org.apache.spark.sql.execution.datasources.hbase") .load() df.write.options( Map("schema1"->schema, HBaseTableCatalog.tableCatalog->catalog)) .format("org.apache.spark.sql.execution.datasources.hbase") .save() //val s = df.filter((($"col1" < Array(10.toByte)) and ($"col1" > Array(1.toByte))) or ($"col1" === Array(11.toByte))).select("col1") //val s = df.filter(Column("col1").<(Array(10.toByte)).and(Column("col1").>(Array(1.toByte))).or(Column("col1") === Array(11.toByte))).select("col1") // val s = df.filter((($"col1" < Array(10.toByte)) && ($"col1" > Array(1.toByte))) || ($"col1" === Array(11.toByte))).select("col1") //val s = df.filter(($"col1" < Array(10.toByte) && $"col1" > Array(1.toByte)) || $"col1" === Array(11.toByte) || $"col2" === 2.3).select("col1") // range should be (None, None) val s = df.filter(($"col1" < Array(10.toByte) && $"col1" > Array(1.toByte)) || $"col1" === Array(11.toByte) && $"col2" === 2.3) .select("col1") s.count() df.registerTempTable("table") val c = sqlContext.sql("select count(col1) from table") // c.queryExecution c.show val se = df.filter($"col2" > 12).filter($"col4" < Array(10.toByte)).select("col1") val se1 = df.filter($"col2" > 12).filter($"col4" < Array(10.toByte)).select("col1") se.count() se1.collect.foreach(println(_)) println(df) } }
Example 3
Source File: SignRandomProjectionLSH.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import scala.collection.immutable.BitSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.hashing.MurmurHash3

import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.Logging

class SignRandomProjectionLSH(poolSize: Int = 10000) extends Serializable with Logging {
  val pool = SignRandomProjectionLSH.generatePool(poolSize)

  def computeSignature(vector: SparseVector, length: Int): BitSet = {
    val buf = ArrayBuffer.empty[Int]
    val elements = vector.indices.zip(vector.values)
    for (bit <- 1 to length) {
      val components = elements.map(e => {
        val hash = MurmurHash3.productHash((bit, e._1))
        val poolIndex = ((hash % poolSize) + poolSize) % poolSize
        val result = e._2 * pool(poolIndex)
        result
      })
      val dotProduct = components.reduce(_ + _)
      if (dotProduct > 0) {
        buf += bit
      }
    }
    BitSet(buf.toArray: _*)
  }
}

object SignRandomProjectionLSH {
  def signatureSet(length: Int): Set[BitSet] = {
    BitSet(1 to length: _*).subsets.toSet
  }

  def estimateCosine(a: BitSet, b: BitSet, length: Int): Double = {
    val hammingDistance = (a ^ b).size
    math.cos(hammingDistance.toDouble / length.toDouble * math.Pi)
  }

  private def generatePool(size: Int): Array[Double] = {
    val rand = new Random()
    val buf = ArrayBuffer.fill[Double](size)(rand.nextGaussian)
    buf.toArray
  }
}
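A short usage sketch for the class above (the vectors, pool size, and signature length are arbitrary illustration values, not taken from the original project): build the LSH once, compute a fixed-length signature per vector, then recover an approximate cosine similarity from the Hamming distance of two signatures.

import org.apache.spark.mllib.linalg.SparseVector

val lsh = new SignRandomProjectionLSH(poolSize = 10000)
val v1 = new SparseVector(5, Array(0, 2), Array(1.0, 3.0))
val v2 = new SparseVector(5, Array(0, 4), Array(1.0, -2.0))
val signatureLength = 64

val s1 = lsh.computeSignature(v1, signatureLength)
val s2 = lsh.computeSignature(v2, signatureLength)

// Approximate cosine similarity estimated from the signatures' Hamming distance.
val approxCosine = SignRandomProjectionLSH.estimateCosine(s1, s2, signatureLength)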
Example 4
Source File: Driver.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank import scala.io.Source import org.apache.spark.{SparkContext, SparkConf, Logging} import org.apache.spark.rdd.RDD import org.apache.spark.graphx._ object Driver extends Logging { val serializerClasses: Array[Class[_]] = Array( classOf[Document], classOf[Sentence], classOf[SentenceTokens], classOf[SentenceFeatures], classOf[Featurizer], classOf[SignRandomProjectionLSH], classOf[LexRank] ) private def selectExcerpts(sentences: RDD[Sentence], scores: VertexRDD[Double], length: Int) = { scores .join(sentences.map(s => (s.id, s))) .map { case (_, (score, sentence)) => (sentence.docId, (score, sentence.id, sentence.text)) } .groupByKey() .flatMap { case (docId, sentences) => sentences.toSeq.sortWith(_._1 > _._1).take(length).map(e => (docId, e._3)) } } def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("Summarizer") sparkConf.registerKryoClasses(serializerClasses) val sc = new SparkContext(sparkConf) val config = new Configuration(args) val documents = sc.textFile(config.inputPath, minPartitions = config.partitions).flatMap( _.split('\t').toList match { case List(docId, text @ _*) => Some((docId.trim, text.mkString(" "))) case _ => None } ).map(Document.tupled).filter(d => d.id.length > 0) val segmenter = new DocumentSegmenter val (sentences, tokenized) = segmenter(documents) val tokenizedFilteredByLength = tokenized.filter(t => t.tokens.size > 2) val featurizer = new Featurizer(config.numStopwords) val features = featurizer(tokenizedFilteredByLength) val model = LexRank.build(features) val ranks = model.score(config.cutoff, config.convergence) val excerpts = selectExcerpts(sentences, ranks, config.length) excerpts .map(_.productIterator.toList.mkString("\t")) .saveAsTextFile(config.outputPath) sc.stop() } }
Example 5
Source File: CustomLogger.scala From hyperspark with Apache License 2.0
package util

import org.apache.spark.Logging

class CustomLogger extends Logging {
  protected var params: List[String] = List()

  // Pad each column name to a fixed width so that printed rows line up.
  protected def reformat(ps: List[String]) = {
    def produceBlanks(N: Int) = {
      if (N == 0) "" else (for (i <- 1 to N) yield " ").reduceLeft(_ concat _).concat("\t")
    }
    def fixsize(str: String) = {
      str.concat(produceBlanks(15 - str.size))
    }
    ps.map { x => fixsize(x) }
  }

  def setFormat(parameters: List[String]) {
    params = parameters
    params = reformat(params)
  }

  def getFormatString(): String = {
    val toprint = params.reduceLeft(_ concat _).concat("\n")
    toprint
  }

  def printInfo(msg: String) = {
    print(msg)
    logInfo(msg)
  }

  def printFormat() = {
    printInfo(getFormatString())
  }

  def getValuesString(values: List[Any]): String = {
    reformat(values.map { x => x.toString() }).reduceLeft(_ concat _).concat("\n")
  }

  def printValues(values: List[Any]) = {
    printInfo(getValuesString(values))
  }
}

object CustomLogger {
  def apply() = new CustomLogger()

  // Build a logger with its column format already set.
  def apply(parameters: List[String]): CustomLogger = {
    val logger = new CustomLogger()
    logger.setFormat(parameters)
    logger
  }
}
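A brief usage sketch (the column names and values are invented): set a column format, print the padded header, then print aligned rows of values; each call both prints to stdout and writes to the Spark log.

val logger = new CustomLogger()
logger.setFormat(List("iteration", "makespan", "elapsed"))
logger.printFormat()                    // prints and logs the padded header row
logger.printValues(List(1, 1234, 0.8))  // prints and logs one row of values
logger.printValues(List(2, 1198, 1.6))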
Example 6
Source File: EventTransformer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume import java.io.{ObjectOutput, ObjectInput} import scala.collection.JavaConversions._ import org.apache.spark.util.Utils import org.apache.spark.Logging private[streaming] object EventTransformer extends Logging { def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence], Array[Byte]) = { val bodyLength = in.readInt() val bodyBuff = new Array[Byte](bodyLength) in.readFully(bodyBuff) val numHeaders = in.readInt() val headers = new java.util.HashMap[CharSequence, CharSequence] for (i <- 0 until numHeaders) { val keyLength = in.readInt() val keyBuff = new Array[Byte](keyLength) in.readFully(keyBuff) val key: String = Utils.deserialize(keyBuff) val valLength = in.readInt() val valBuff = new Array[Byte](valLength) in.readFully(valBuff) val value: String = Utils.deserialize(valBuff) headers.put(key, value) } (headers, bodyBuff) } def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence], body: Array[Byte]) { out.writeInt(body.length) out.write(body) val numHeaders = headers.size() out.writeInt(numHeaders) for ((k, v) <- headers) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) val valBuff = Utils.serialize(v.toString) out.writeInt(valBuff.length) out.write(valBuff) } } }
Example 7
Source File: FlumePollingStreamSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume import java.net.InetSocketAddress import scala.collection.JavaConversions._ import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} import scala.concurrent.duration._ import scala.language.postfixOps import com.google.common.base.Charsets.UTF_8 import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually._ import org.apache.spark.{Logging, SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext} import org.apache.spark.util.{ManualClock, Utils} private def testMultipleTimes(test: () => Unit): Unit = { var testPassed = false var attempt = 0 while (!testPassed && attempt < maxAttempts) { try { test() testPassed = true } catch { case e: Exception if Utils.isBindCollision(e) => logWarning("Exception when running flume polling test: " + e) attempt += 1 } } assert(testPassed, s"Test failed after $attempt attempts!") } private def testFlumePolling(): Unit = { try { val port = utils.startSingleSink() writeAndVerify(Seq(port)) utils.assertChannelsAreEmpty() } finally { utils.close() } } private def testFlumePollingMultipleHost(): Unit = { try { val ports = utils.startMultipleSinks() writeAndVerify(ports) utils.assertChannelsAreEmpty() } finally { utils.close() } } def writeAndVerify(sinkPorts: Seq[Int]): Unit = { // Set up the streaming context and input streams //设置流上下文和输入流 val ssc = new StreamingContext(conf, batchDuration) val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port)) val flumeStream: ReceiverInputDStream[SparkFlumeEvent] = FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK, utils.eventsPerBatch, 5) val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]] with SynchronizedBuffer[Seq[SparkFlumeEvent]] val outputStream = new TestOutputStream(flumeStream, outputBuffer) outputStream.register() ssc.start() try { utils.sendDatAndEnsureAllDataHasBeenReceived() val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] clock.advance(batchDuration.milliseconds) // The eventually is required to ensure that all data in the batch has been processed. //最终需要确保批处理中的所有数据已被处理 eventually(timeout(10 seconds), interval(100 milliseconds)) { val flattenOutputBuffer = outputBuffer.flatten val headers = flattenOutputBuffer.map(_.event.getHeaders.map { case kv => (kv._1.toString, kv._2.toString) }).map(mapAsJavaMap) val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8)) utils.assertOutput(headers, bodies) } } finally { ssc.stop() } } }
Example 8
Source File: FlumeStreamSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume

import scala.collection.JavaConversions._
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets
import org.jboss.netty.channel.ChannelPipeline
import org.jboss.netty.channel.socket.SocketChannel
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {

  // Adds zlib compression/decompression handlers to the Netty channel pipeline.
  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
}
Example 9
Source File: MQTTTestUtils.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.mqtt import java.net.{ServerSocket, URI} import scala.language.postfixOps import com.google.common.base.Charsets.UTF_8 import org.apache.activemq.broker.{BrokerService, TransportConnector} import org.apache.commons.lang3.RandomUtils import org.eclipse.paho.client.mqttv3._ import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf} private[mqtt] class MQTTTestUtils extends Logging { private val persistenceDir = Utils.createTempDir() private val brokerHost = "localhost" private val brokerPort = findFreePort() private var broker: BrokerService = _ private var connector: TransportConnector = _ def brokerUri: String = { s"$brokerHost:$brokerPort" } def setup(): Unit = { broker = new BrokerService() broker.setDataDirectoryFile(Utils.createTempDir()) connector = new TransportConnector() connector.setName("mqtt") connector.setUri(new URI("mqtt://" + brokerUri)) broker.addConnector(connector) broker.start() } def teardown(): Unit = { if (broker != null) { broker.stop() broker = null } if (connector != null) { connector.stop() connector = null } Utils.deleteRecursively(persistenceDir) } private def findFreePort(): Int = { val candidatePort = RandomUtils.nextInt(1024, 65536) Utils.startServiceOnPort(candidatePort, (trialPort: Int) => { val socket = new ServerSocket(trialPort) socket.close() (null, trialPort) }, new SparkConf())._2 } def publishData(topic: String, data: String): Unit = { var client: MqttClient = null try { val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) client = new MqttClient("tcp://" + brokerUri, MqttClient.generateClientId(), persistence) client.connect() if (client.isConnected) { val msgTopic = client.getTopic(topic) val message = new MqttMessage(data.getBytes(UTF_8)) message.setQos(1) message.setRetained(true) for (i <- 0 to 10) { try { msgTopic.publish(message) } catch { case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT => // wait for Spark streaming to consume something from the message queue Thread.sleep(50) } } } } finally { if (client != null) { client.disconnect() client.close() client = null } } } }
Example 10
Source File: FiltersSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.client

import scala.collection.JavaConversions._

import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.serde.serdeConstants
import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

class FiltersSuite extends SparkFunSuite with Logging {
  private val shim = new Shim_v0_13

  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
  private val varCharCol = new FieldSchema()
  varCharCol.setName("varchar")
  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
  testTable.setPartCols(varCharCol :: Nil)

  // String filter
  filterTest("string filter",
    (a("stringcol", StringType) > Literal("test")) :: Nil,
    "stringcol > \"test\"")

  // String filter with reversed operands
  filterTest("string filter backwards",
    (Literal("test") > a("stringcol", StringType)) :: Nil,
    "\"test\" > stringcol")

  // Int filter
  filterTest("int filter",
    (a("intcol", IntegerType) === Literal(1)) :: Nil,
    "intcol = 1")

  // Int filter with reversed operands
  filterTest("int filter backwards",
    (Literal(1) === a("intcol", IntegerType)) :: Nil,
    "1 = intcol")

  filterTest("int and string filter",
    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
    "1 = intcol and \"a\" = strcol")

  filterTest("skip varchar",
    (Literal("") === a("varchar", StringType)) :: Nil,
    "")

  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
    test(name) {
      val converted = shim.convertFilters(testTable, filters)
      if (converted != result) {
        fail(
          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
      }
    }
  }

  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
}
Example 11
Source File: SparkSQLDriver.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, List => JList} import scala.collection.JavaConversions._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} private[hive] class SparkSQLDriver( val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.size == 0) { new Schema(new FieldSchema("Response code", "string", "") :: Nil, null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } new Schema(fieldSchemas, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.stringResult() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 12
Source File: SparkSQLOperationManager.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}
import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 13
Source File: ThriftServerTab.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver.ui

import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv}
import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._
import org.apache.spark.ui.{SparkUI, SparkUITab}
import org.apache.spark.{SparkContext, Logging, SparkException}

private[thriftserver] class ThriftServerTab(sparkContext: SparkContext)
  extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging {

  override val name = "JDBC/ODBC Server"

  val parent = getSparkUI(sparkContext)
  val listener = HiveThriftServer2.listener

  attachPage(new ThriftServerPage(this))
  attachPage(new ThriftServerSessionPage(this))
  parent.attachTab(this)

  def detach() {
    getSparkUI(sparkContext).detachTab(this)
  }
}

private[thriftserver] object ThriftServerTab {
  def getSparkUI(sparkContext: SparkContext): SparkUI = {
    sparkContext.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
}
Example 14
Source File: SparkSQLEnv.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream
import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

private[hive] object SparkSQLEnv extends Logging {
  var hiveContext: HiveContext = _
  var sparkContext: SparkContext = _

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 15
Source File: BoundAttribute.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression with NamedExpression { override def toString: String = s"input[$ordinal, $dataType]" // Use special getter for primitive types (for UnsafeRow) //对原始类型使用特殊的getter(对于UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def name: String = s"i[$ordinal]" override def toAttribute: Attribute = throw new UnsupportedOperationException override def qualifiers: Seq[String] = throw new UnsupportedOperationException override def exprId: ExprId = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val javaType = ctx.javaType(dataType) val value = ctx.getValue("i", dataType, ordinal.toString) s""" boolean ${ev.isNull} = i.isNullAt($ordinal); $javaType ${ev.primitive} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value); """ } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: Seq[Attribute], allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexWhere(_.exprId == a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, a.nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
Example 16
Source File: package.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 17
Source File: DriverRegistry.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import java.sql.{Driver, DriverManager}
import scala.collection.mutable

import org.apache.spark.Logging
import org.apache.spark.util.Utils

object DriverRegistry extends Logging {
  private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty

  def register(className: String): Unit = {
    val cls = Utils.getContextOrSparkClassLoader.loadClass(className)
    if (cls.getClassLoader == null) {
      logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required")
    } else if (wrapperMap.get(className).isDefined) {
      logTrace(s"Wrapper for $className already exists")
    } else {
      synchronized {
        if (wrapperMap.get(className).isEmpty) {
          val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver])
          DriverManager.registerDriver(wrapper)
          wrapperMap(className) = wrapper
          logTrace(s"Wrapper for $className registered")
        }
      }
    }
  }

  def getDriverClassName(url: String): String = DriverManager.getDriver(url) match {
    case wrapper: DriverWrapper => wrapper.wrapped.getClass.getCanonicalName
    case driver => driver.getClass.getCanonicalName
  }
}
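As a usage sketch (the PostgreSQL driver class and JDBC URL are only examples, and the driver jar is assumed to be on the classpath), a caller registers a driver class by name and can later look up the effective driver class for a URL:

DriverRegistry.register("org.postgresql.Driver")
val driverClass = DriverRegistry.getDriverClassName("jdbc:postgresql://localhost:5432/testdb")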
Example 18
Source File: CompressibleColumnBuilder.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType private[sql] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } protected def isWorthCompressing(encoder: Encoder[T]) = { encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { var i = 0 while (i < compressionEncoders.length) { compressionEncoders(i).gatherCompressibilityStats(row, ordinal) i += 1 } } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val typeId = nonNullBuffer.getInt() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = column type ID + null count + null positions val headerSize = 4 + 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(typeId) .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } }
Example 19
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. //在执行程序中,该线程唤醒并从HDFS中获取新令牌(如果有的话) private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 20
Source File: SocketInputDStream.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.dstream import scala.util.control.NonFatal import org.apache.spark.streaming.StreamingContext import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import scala.reflect.ClassTag import java.io._ import java.net.{UnknownHostException, Socket} import org.apache.spark.Logging import org.apache.spark.streaming.receiver.Receiver private[streaming] class SocketInputDStream[T: ClassTag]( @transient ssc_ : StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](ssc_) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { def onStart() { // Start the thread that receives data over a connection //启动接收到连接上的数据的线程 new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // There is nothing much to do as the thread calling receive() //没有什么可做的线程调用receive() // is designed to stop by itself isStopped() returns false //是为了阻止自己isstopped()返回false } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
Example 21
Source File: StreamingTab.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.ui

import org.apache.spark.{Logging, SparkException}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ui.{SparkUI, SparkUITab}

import StreamingTab._

private[spark] class StreamingTab(val ssc: StreamingContext)
  extends SparkUITab(getSparkUI(ssc), "streaming") with Logging {

  private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static"

  val parent = getSparkUI(ssc)
  val listener = ssc.progressListener

  ssc.addStreamingListener(listener)
  ssc.sc.addSparkListener(listener)
  attachPage(new StreamingPage(this))
  attachPage(new BatchPage(this))

  def attach() {
    getSparkUI(ssc).attachTab(this)
    getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
  }

  def detach() {
    getSparkUI(ssc).detachTab(this)
    getSparkUI(ssc).removeStaticHandler("/static/streaming")
  }
}

private object StreamingTab {
  def getSparkUI(ssc: StreamingContext): SparkUI = {
    ssc.sc.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
}
Example 22
Source File: StreamingListenerBus.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.scheduler

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.Logging
import org.apache.spark.util.AsynchronousListenerBus

private[spark] class StreamingListenerBus
  extends AsynchronousListenerBus[StreamingListener, StreamingListenerEvent]("StreamingListenerBus")
  with Logging {

  private val logDroppedEvent = new AtomicBoolean(false)

  override def onPostEvent(listener: StreamingListener, event: StreamingListenerEvent): Unit = {
    event match {
      case receiverStarted: StreamingListenerReceiverStarted =>
        listener.onReceiverStarted(receiverStarted)
      case receiverError: StreamingListenerReceiverError =>
        listener.onReceiverError(receiverError)
      case receiverStopped: StreamingListenerReceiverStopped =>
        listener.onReceiverStopped(receiverStopped)
      case batchSubmitted: StreamingListenerBatchSubmitted =>
        listener.onBatchSubmitted(batchSubmitted)
      case batchStarted: StreamingListenerBatchStarted =>
        listener.onBatchStarted(batchStarted)
      case batchCompleted: StreamingListenerBatchCompleted =>
        listener.onBatchCompleted(batchCompleted)
      case _ =>
    }
  }

  override def onDropEvent(event: StreamingListenerEvent): Unit = {
    if (logDroppedEvent.compareAndSet(false, true)) {
      // Only log the following message once to avoid duplicated annoying logs.
      logError("Dropping StreamingListenerEvent because no remaining room in event queue. " +
        "This likely means one of the StreamingListeners is too slow and cannot keep up with the " +
        "rate at which events are being started by the scheduler.")
    }
  }
}
Example 23
Source File: RecurringTimer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util import org.apache.spark.Logging import org.apache.spark.util.{Clock, SystemClock} private[streaming] class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String) extends Logging { private val thread = new Thread("RecurringTimer - " + name) { setDaemon(true) override def run() { loop } } @volatile private var prevTime = -1L @volatile private var nextTime = -1L @volatile private var stopped = false private def loop() { try { while (!stopped) { triggerActionForNextInterval() } triggerActionForNextInterval() } catch { case e: InterruptedException => } } } private[streaming] object RecurringTimer extends Logging { def main(args: Array[String]) { var lastRecurTime = 0L val period = 1000 def onRecur(time: Long) { val currentTime = System.currentTimeMillis() logInfo("" + currentTime + ": " + (currentTime - lastRecurTime)) lastRecurTime = currentTime } val timer = new RecurringTimer(new SystemClock(), period, onRecur, "Test") timer.start() Thread.sleep(30 * 1000) timer.stop(true) } }
Example 24
Source File: RawTextSender.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.{SparkConf, Logging} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match //解析使用模式匹配的参数 val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer //多次重复输入数据以填充缓冲区 val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 25
Source File: FileBasedWriteAheadLogReader.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = false //None被声明为一个对象,而不是一个类,在没有值的时候,使用None,如果有值可以引用,就使用Some来包含这个值,都是Option的子类 private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { //如果已关闭,就肯定不hasNext了 return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { //读出来下一条,如果有,就说明还确实 hasNext val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } //确保下一个调用hasNext加载新的数据 //None被声明为一个对象,而不是一个类,在没有值的时候,使用None,如果有值可以引用,就使用Some来包含这个值,都是Option的子类 nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 26
Source File: RateLimitedOutputStream.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util import scala.annotation.tailrec import java.io.OutputStream import java.util.concurrent.TimeUnit._ import org.apache.spark.Logging private[streaming] class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) extends OutputStream with Logging { require(desiredBytesPerSec > 0) private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS) private val CHUNK_SIZE = 8192 private var lastSyncTime = System.nanoTime private var bytesWrittenSinceSync = 0L override def write(b: Int) { waitToWrite(1) out.write(b) } override def write(bytes: Array[Byte]) { write(bytes, 0, bytes.length) } @tailrec override final def write(bytes: Array[Byte], offset: Int, length: Int) { val writeSize = math.min(length - offset, CHUNK_SIZE) if (writeSize > 0) { waitToWrite(writeSize) out.write(bytes, offset, writeSize) write(bytes, offset + writeSize, length) } } override def flush() { out.flush() } override def close() { out.close() } @tailrec private def waitToWrite(numBytes: Int) { val now = System.nanoTime val elapsedNanosecs = math.max(now - lastSyncTime, 1) val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs if (rate < desiredBytesPerSec) { // It's okay to write; just update some variables and return bytesWrittenSinceSync += numBytes if (now > lastSyncTime + SYNC_INTERVAL) { // Sync interval has passed; let's resync lastSyncTime = now bytesWrittenSinceSync = numBytes } } else { // Calculate how much time we should sleep to bring ourselves to the desired rate. val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec val elapsedTimeInMillis = elapsedNanosecs / 1000000 val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis if (sleepTimeInMillis > 0) { logTrace("Natural rate is " + rate + " per second but desired rate is " + desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.") Thread.sleep(sleepTimeInMillis) } waitToWrite(numBytes) } } }
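A usage sketch for the stream above (the file path, payload, and rate are made up; the class is private[streaming], so real callers live inside that package): wrap any OutputStream and writes are throttled to roughly the requested bytes per second.

import java.io.FileOutputStream

val limited = new RateLimitedOutputStream(new FileOutputStream("/tmp/throttled.bin"), 1024 * 1024)
try {
  val chunk = Array.fill[Byte](16 * 1024)(0)
  for (_ <- 1 to 64) {
    limited.write(chunk)  // about 1 MB in total, written at roughly 1 MB/s
  }
} finally {
  limited.close()
}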
Example 27
Source File: FailureSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming

import java.io.File

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkFunSuite, Logging}
import org.apache.spark.util.Utils

class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging {
  private val batchDuration: Duration = Milliseconds(1000)
  private val numBatches = 30
  private var directory: File = null

  before {
    directory = Utils.createTempDir()
  }

  after {
    if (directory != null) {
      // Delete the temporary directory
      Utils.deleteRecursively(directory)
    }
    // Stop all active streaming contexts
    StreamingContext.getActive().foreach { _.stop() }
  }

  // Repeated failures with map
  test("multiple failures with map") {
    MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration)
  }

  // Repeated failures with updateStateByKey
  test("multiple failures with updateStateByKey") {
    MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration)
  }
}
Example 28
Source File: EventLogDownloadResource.scala From spark1.52 with Apache License 2.0
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } //实现StreamingOutput接口 val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { //ZipOutputStream实现打包 val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 29
Source File: NettyBlockRpcServer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { //消息解码 val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { //提供下载Block文件的功能, case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = //数据blockIds,存放BlockId,获得块数据 openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) //提供上传Block文件的RPC服务 case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. //使用我们的JavaSerializer将StorageLevel序列化为字节 //存储级别 val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) //存储局部块,使用给定的存储级别 blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 30
Source File: MetricsConfig.scala From spark1.52 with Apache License 2.0
package org.apache.spark.metrics import java.io.{FileInputStream, InputStream} import java.util.Properties import scala.collection.mutable import scala.util.matching.Regex import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf} private[spark] class MetricsConfig(conf: SparkConf) extends Logging { private val DEFAULT_PREFIX = "*" private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties" private[metrics] val properties = new Properties() private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null private def setDefaultProperties(prop: Properties) { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") prop.setProperty("*.sink.servlet.path", "/metrics/json") prop.setProperty("master.sink.servlet.path", "/metrics/master/json") prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json") } def initialize() { // Add default properties in case there's no properties file // 添加默认属性的情况下,没有任何属性文件 setDefaultProperties(properties) loadPropertiesFromFile(conf.getOption("spark.metrics.conf")) // Also look for the properties in provided Spark configuration //还要查找提供的Spark配置中的属性 val prefix = "spark.metrics.conf." conf.getAll.foreach { case (k, v) if k.startsWith(prefix) => properties.setProperty(k.substring(prefix.length()), v) case _ => } propertyCategories = subProperties(properties, INSTANCE_REGEX) if (propertyCategories.contains(DEFAULT_PREFIX)) { import scala.collection.JavaConversions._ val defaultProperty = propertyCategories(DEFAULT_PREFIX) for { (inst, prop) <- propertyCategories if (inst != DEFAULT_PREFIX) (k, v) <- defaultProperty if (prop.getProperty(k) == null) } { prop.setProperty(k, v) } } } //使用正则匹配properties中以source.开头的属性,然后将属性中的source反映得到的实例加入HashMap def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] import scala.collection.JavaConversions._ prop.foreach { kv => if (regex.findPrefixOf(kv._1).isDefined) { val regex(prefix, suffix) = kv._1 subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2) } } subProperties } def getInstance(inst: String): Properties = { propertyCategories.get(inst) match { case Some(s) => s case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties) } } private[this] def loadPropertiesFromFile(path: Option[String]): Unit = { var is: InputStream = null try { is = path match { case Some(f) => new FileInputStream(f) case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME) } if (is != null) { properties.load(is) } } catch { case e: Exception => val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME) logError(s"Error loading configuration file $file", e) } finally { if (is != null) { is.close() } } } }
Example 31
Source File: PythonGatewayServer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.python import java.io.DataOutputStream import java.net.Socket import py4j.GatewayServer import org.apache.spark.Logging import org.apache.spark.util.Utils private[spark] object PythonGatewayServer extends Logging { def main(args: Array[String]): Unit = Utils.tryOrExit { // Start a GatewayServer on an ephemeral port val gatewayServer: GatewayServer = new GatewayServer(null, 0) gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { logError("GatewayServer failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") } // Communicate the bound port back to the caller via the caller-specified callback port //System.getenv()和System.getProperties()的区别 //System.getenv() 返回系统环境变量值 设置系统环境变量:当前登录用户主目录下的".bashrc"文件中可以设置系统环境变量 //System.getProperties() 返回Java进程变量值 通过命令行参数的"-D"选项 val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") val callbackSocket = new Socket(callbackHost, callbackPort) val dos = new DataOutputStream(callbackSocket.getOutputStream) dos.writeInt(boundPort) dos.close() callbackSocket.close() // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { // Do nothing } logDebug("Exiting due to broken pipe from Python driver") System.exit(0) } }
Example 32
Source File: MesosExternalShuffleService.scala From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.mesos

import java.net.SocketAddress
import scala.collection.mutable

import org.apache.spark.{Logging, SecurityManager, SparkConf}
import org.apache.spark.deploy.ExternalShuffleService
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler
import org.apache.spark.network.shuffle.protocol.BlockTransferMessage
import org.apache.spark.network.shuffle.protocol.mesos.RegisterDriver
import org.apache.spark.network.util.TransportConf

private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager)
  extends ExternalShuffleService(conf, securityManager) {

  protected override def newShuffleBlockHandler(
      conf: TransportConf): ExternalShuffleBlockHandler = {
    new MesosExternalShuffleBlockHandler(conf)
  }
}

private[spark] object MesosExternalShuffleService extends Logging {
  def main(args: Array[String]): Unit = {
    ExternalShuffleService.main(args,
      (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm))
  }
}
Example 33
Source File: MesosClusterDispatcher.scala From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.mesos import java.util.concurrent.CountDownLatch import org.apache.spark.deploy.mesos.ui.MesosClusterUI import org.apache.spark.deploy.rest.mesos.MesosRestServer import org.apache.spark.scheduler.cluster.mesos._ import org.apache.spark.util.SignalLogger import org.apache.spark.{Logging, SecurityManager, SparkConf} private[mesos] class MesosClusterDispatcher( args: MesosClusterDispatcherArguments, conf: SparkConf) extends Logging { //Spark master和workers使用的公共DNS(默认空) private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host) private val recoveryMode = conf.get("spark.mesos.deploy.recoveryMode", "NONE").toUpperCase() logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode) private val engineFactory = recoveryMode match { case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf) case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode) } private val scheduler = new MesosClusterScheduler(engineFactory, conf) private val server = new MesosRestServer(args.host, args.port, conf, scheduler) private val webUi = new MesosClusterUI( new SecurityManager(conf), args.webUiPort, conf, publicAddress, scheduler) private val shutdownLatch = new CountDownLatch(1) def start(): Unit = { webUi.bind() scheduler.frameworkUrl = webUi.activeWebUiUrl scheduler.start() server.start() } def awaitShutdown(): Unit = { shutdownLatch.await() } def stop(): Unit = { webUi.stop() server.stop() scheduler.stop() shutdownLatch.countDown() } } private[mesos] object MesosClusterDispatcher extends Logging { def main(args: Array[String]) { SignalLogger.register(log) val conf = new SparkConf val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf) conf.setMaster(dispatcherArgs.masterUrl) conf.setAppName(dispatcherArgs.name) dispatcherArgs.zookeeperUrl.foreach { z => conf.set("spark.mesos.deploy.recoveryMode", "ZOOKEEPER") conf.set("spark.mesos.deploy.zookeeper.url", z) } val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf) dispatcher.start() val shutdownHook = new Thread() { override def run() { logInfo("Shutdown hook is shutting down dispatcher") dispatcher.stop() dispatcher.awaitShutdown() } } Runtime.getRuntime.addShutdownHook(shutdownHook) dispatcher.awaitShutdown() } }
Example 34
Source File: SparkCuratorUtil.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy

import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.{Logging, SparkConf}
import org.apache.zookeeper.KeeperException

import scala.collection.JavaConversions._

private[spark] object SparkCuratorUtil extends Logging {

  private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000
  private val ZK_SESSION_TIMEOUT_MILLIS = 60000
  private val RETRY_WAIT_MILLIS = 5000
  private val MAX_RECONNECT_ATTEMPTS = 3

  def newClient(
      conf: SparkConf,
      // ZooKeeper cluster URL
      zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = {
    val ZK_URL = conf.get(zkUrlConf)
    val zk = CuratorFrameworkFactory.newClient(ZK_URL,
      ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS,
      new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS))
    zk.start()
    zk
  }

  def mkdir(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) == null) {
      try {
        zk.create().creatingParentsIfNeeded().forPath(path)
      } catch {
        case nodeExist: KeeperException.NodeExistsException =>
          // do nothing, ignore node existing exception.
        case e: Exception => throw e
      }
    }
  }

  def deleteRecursive(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) != null) {
      for (child <- zk.getChildren.forPath(path)) {
        zk.delete().forPath(path + "/" + child)
      }
      zk.delete().forPath(path)
    }
  }
}
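A minimal usage sketch of the helper above. It assumes a ZooKeeper ensemble reachable at the configured URL, and, since SparkCuratorUtil is private[spark], calling code compiled inside an org.apache.spark package:

import org.apache.curator.framework.CuratorFramework
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkCuratorUtil

// Hypothetical ZooKeeper address; replace with a real ensemble.
val conf = new SparkConf().set("spark.deploy.zookeeper.url", "localhost:2181")
val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)
SparkCuratorUtil.mkdir(zk, "/spark/example")           // create the znode if it does not exist
SparkCuratorUtil.deleteRecursive(zk, "/spark/example") // delete it and any children
zk.close()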
Example 35
Source File: TestClient.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.client

import org.apache.spark.rpc.RpcEnv
import org.apache.spark.{SecurityManager, SparkConf, Logging}
import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.util.Utils

private[spark] object TestClient {

  private class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = if (args.isEmpty) "127.0.0.1" else args(0)
    val conf = new SparkConf
    val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf))
    val executorClassnamea = TestExecutor.getClass.getCanonicalName
    println("====executorClassname======" + executorClassnamea)
    // stripSuffix returns this string with the given `suffix` stripped off;
    // if the string does not end with `suffix`, it is returned unchanged.
    val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$")
    println("====executorClassname======" + executorClassname)
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf)
    client.start()
    rpcEnv.awaitTermination()
  }
}
Example 36
Source File: FileSystemPersistenceEngine.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { new File(dir + File.separator + name).delete() } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
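A sketch of how the engine above might be exercised, assuming calling code in the org.apache.spark.deploy.master package (the class is private[master]) and Spark's JavaSerializer; the directory and names are illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

// Hypothetical recovery directory; the constructor creates it if missing.
val engine = new FileSystemPersistenceEngine(
  "/tmp/spark-recovery", new JavaSerializer(new SparkConf()))
engine.persist("app_demo", "some recovery state")       // one file per persisted name
val restored: Seq[String] = engine.read[String]("app_") // read everything with the prefix
engine.unpersist("app_demo")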
Example 37
Source File: RecoveryModeFactory.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  // Directory in which Spark stores its recovery state
  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
}
Example 38
Source File: MasterWebUI.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import org.apache.spark.Logging import org.apache.spark.deploy.master.Master import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationsListResource, ApplicationInfo, UIRoot} import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def detachSparkUI(ui: SparkUI) { assert(serverInfo.isDefined, "Master UI must be bound to a server before detaching SparkUIs") ui.getHandlers.foreach(detachHandler) } def getApplicationInfoList: Iterator[ApplicationInfo] = { val state = masterPage.getMasterState val activeApps = state.activeApps.sortBy(_.startTime).reverse val completedApps = state.completedApps.sortBy(_.endTime).reverse activeApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, false) } ++ completedApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, true) } } def getSparkUI(appId: String): Option[SparkUI] = { val state = masterPage.getMasterState val activeApps = state.activeApps.sortBy(_.startTime).reverse val completedApps = state.completedApps.sortBy(_.endTime).reverse (activeApps ++ completedApps).find { _.id == appId }.flatMap { master.rebuildSparkUI } } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 39
Source File: ZooKeeperLeaderElectionAgent.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.curator.framework.CuratorFramework
import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch}
import org.apache.spark.deploy.SparkCuratorUtil

private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable,
    conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging {

  // Directory in ZooKeeper where recovery state is stored; defaults to /spark
  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"

  private var zk: CuratorFramework = _
  private var leaderLatch: LeaderLatch = _
  private var status = LeadershipStatus.NOT_LEADER

  start()

  private def start() {
    logInfo("Starting ZooKeeper LeaderElection agent")
    zk = SparkCuratorUtil.newClient(conf)
    leaderLatch = new LeaderLatch(zk, WORKING_DIR)
    leaderLatch.addListener(this) // this agent implements LeaderLatchListener
    leaderLatch.start()           // start competing for leadership
  }

  override def stop() {
    leaderLatch.close()
    zk.close()
  }

  override def isLeader() {
    synchronized {
      // could have lost leadership by now.
      // The state may have changed again in the meantime, so confirm before acting.
      if (!leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have gained leadership") // this node has been elected leader
      updateLeadershipStatus(true)
    }
  }

  override def notLeader() {
    synchronized {
      // could have gained leadership by now.
      // The state may have changed again in the meantime, so confirm before acting.
      if (leaderLatch.hasLeadership) {
        return
      }

      // leadership has been revoked
      logInfo("We have lost leadership")
      updateLeadershipStatus(false)
    }
  }

  private def updateLeadershipStatus(isLeader: Boolean) {
    if (isLeader && status == LeadershipStatus.NOT_LEADER) {
      status = LeadershipStatus.LEADER
      masterInstance.electedLeader() // the Master has been elected leader
    } else if (!isLeader && status == LeadershipStatus.LEADER) {
      status = LeadershipStatus.NOT_LEADER
      masterInstance.revokedLeadership() // the Master's leadership has been revoked
    }
  }

  private object LeadershipStatus extends Enumeration {
    type LeadershipStatus = Value
    val LEADER, NOT_LEADER = Value
  }
}
Example 40
Source File: ZooKeeperPersistenceEngine.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.serializer.Serializer

private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine
  with Logging {

  // Directory in ZooKeeper where recovery state is stored; defaults to /spark
  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val file = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix))
    file.map(deserializeFromFile[T]).flatten
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception => {
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
      }
    }
  }
}
Example 41
Source File: WorkerWebUI.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.worker.Worker import org.apache.spark.deploy.worker.ui.WorkerWebUI._ import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils def initialize() { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr)) } } private[worker] object WorkerWebUI { val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR val DEFAULT_RETAINED_DRIVERS = 1000 val DEFAULT_RETAINED_EXECUTORS = 1000 }
Example 42
Source File: HistoryServerArguments.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.Utils

private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String])
  extends Logging {
  private var propertiesFile: String = null

  parse(args.toList)

  private def parse(args: List[String]): Unit = {
    args match {
      case ("--dir" | "-d") :: value :: tail =>
        logWarning("Setting log directory through the command line is deprecated as of " +
          "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.")
        conf.set("spark.history.fs.logDirectory", value)
        System.setProperty("spark.history.fs.logDirectory", value)
        parse(tail)

      case ("--help" | "-h") :: tail =>
        printUsageAndExit(0)

      case ("--properties-file") :: value :: tail =>
        propertiesFile = value
        parse(tail)

      // Nil is the empty List; :: prepends an element to the head of a list, creating a new list
      case Nil =>

      case _ =>
        printUsageAndExit(1)
    }
  }

  // This mutates the SparkConf, so all accesses to it must be made after this line
  Utils.loadDefaultSparkProperties(conf, propertiesFile)

  private def printUsageAndExit(exitCode: Int) {
    // scalastyle:off println
    System.err.println(
      """
      |Usage: HistoryServer [options]
      |
      |Options:
      |  --properties-file FILE      Path to a custom Spark properties file.
      |                              Default is conf/spark-defaults.conf.
      |
      |Configuration options can be set by setting the corresponding JVM system property.
      |History Server options are always available; additional options depend on the provider.
      |
      |History Server options:
      |
      |  spark.history.ui.port              Port where server will listen for connections
      |                                     (default 18080)
      |  spark.history.acls.enable          Whether to enable view acls for all applications
      |                                     (default false)
      |  spark.history.provider             Name of history provider class (defaults to
      |                                     file system-based provider)
      |  spark.history.retainedApplications Max number of application UIs to keep loaded in memory
      |                                     (default 50)
      |FsHistoryProvider options:
      |
      |  spark.history.fs.logDirectory      Directory where app logs are stored
      |                                     (default: file:/tmp/spark-events)
      |  spark.history.fs.updateInterval    How often to reload log data from storage
      |                                     (in seconds, default: 10)
      |""".stripMargin)
    // scalastyle:on println
    System.exit(exitCode)
  }

}
Example 43
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // hostname or IP address of the machine running the driver
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }

}
Example 44
Source File: MesosTaskLaunchData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

import org.apache.mesos.protobuf.ByteString

import org.apache.spark.Logging

private[spark] case class MesosTaskLaunchData(
  serializedTask: ByteBuffer,
  attemptNumber: Int) extends Logging {

  def toByteString: ByteString = {
    // ByteBuffer.allocate: a buffer must be allocated (via the static allocate() method)
    // before it can be read from or written to
    val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
    dataBuffer.putInt(attemptNumber)
    dataBuffer.put(serializedTask)
    dataBuffer.rewind
    logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
    ByteString.copyFrom(dataBuffer)
  }
}

private[spark] object MesosTaskLaunchData extends Logging {
  def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
    val byteBuffer = byteString.asReadOnlyByteBuffer()
    logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
    val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
    val serializedTask = byteBuffer.slice() // subsequence starting at the current position
    MesosTaskLaunchData(serializedTask, attemptNumber)
  }
}
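A round-trip sketch of the encoding above (the case class is private[spark], so this assumes test code compiled inside a Spark package; the task bytes and attempt number are made up):

import java.nio.ByteBuffer
import org.apache.spark.scheduler.cluster.mesos.MesosTaskLaunchData

val original = MesosTaskLaunchData(ByteBuffer.wrap(Array[Byte](1, 2, 3)), attemptNumber = 7)
val decoded = MesosTaskLaunchData.fromByteString(original.toByteString)
assert(decoded.attemptNumber == 7)              // the 4-byte int prefix
assert(decoded.serializedTask.remaining == 3)   // the task payload that follows it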
Example 45
Source File: ReplayListenerBus.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io.{InputStream, IOException}

import scala.io.Source

import com.fasterxml.jackson.core.JsonParseException
import org.json4s.jackson.JsonMethods._

import org.apache.spark.Logging
import org.apache.spark.util.JsonProtocol

  def replay(
      logData: InputStream,
      sourceName: String,
      maybeTruncated: Boolean = false): Unit = {
    var currentLine: String = null
    var lineNumber: Int = 1
    try {
      val lines = Source.fromInputStream(logData).getLines()
      while (lines.hasNext) {
        currentLine = lines.next()
        try {
          postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine)))
        } catch {
          case jpe: JsonParseException =>
            // We can only ignore exception from last line of the file that might be truncated
            if (!maybeTruncated || lines.hasNext) {
              throw jpe
            } else {
              logWarning(s"Got JsonParseException from log file $sourceName" +
                s" at line $lineNumber, the file might not have finished writing cleanly.")
            }
        }
        lineNumber += 1
      }
    } catch {
      case ioe: IOException =>
        throw ioe
      case e: Exception =>
        logError(s"Exception parsing Spark event log: $sourceName", e)
        logError(s"Malformed line #$lineNumber: $currentLine\n")
    }
  }

}
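The excerpt above omits the enclosing class declaration; in Spark 1.x, ReplayListenerBus extends SparkListenerBus (and is private[spark]), so listeners can be attached before replaying a recorded event log. A hedged usage sketch, with a placeholder log path:

import java.io.FileInputStream
import org.apache.spark.scheduler.{ReplayListenerBus, SparkListener, SparkListenerJobStart}

val bus = new ReplayListenerBus()
bus.addListener(new SparkListener {
  override def onJobStart(jobStart: SparkListenerJobStart): Unit =
    println(s"Replayed job start: ${jobStart.jobId}")
})

val in = new FileInputStream("/tmp/spark-events/local-1450000000000") // placeholder path
try {
  bus.replay(in, sourceName = "local-1450000000000", maybeTruncated = false)
} finally {
  in.close()
}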
Example 46
Source File: SparkUncaughtExceptionHandler.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import org.apache.spark.Logging

private[spark] object SparkUncaughtExceptionHandler
  extends Thread.UncaughtExceptionHandler with Logging {

  override def uncaughtException(thread: Thread, exception: Throwable) {
    try {
      logError("Uncaught exception in thread " + thread, exception)

      // We may have been called from a shutdown hook. If so, we must not call System.exit().
      // (If we do, we will deadlock.)
      if (!ShutdownHookManager.inShutdown()) {
        if (exception.isInstanceOf[OutOfMemoryError]) {
          System.exit(SparkExitCode.OOM)
        } else {
          System.exit(SparkExitCode.UNCAUGHT_EXCEPTION)
        }
      }
    } catch {
      case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM)
      case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE)
    }
  }

  def uncaughtException(exception: Throwable) {
    uncaughtException(Thread.currentThread(), exception)
  }
}
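A sketch of installing the handler above (it is private[spark], so this assumes code in a Spark package). Once installed, any uncaught exception is logged and the JVM exits with the corresponding Spark exit code:

import org.apache.spark.util.SparkUncaughtExceptionHandler

// Install for every thread that does not set its own handler...
Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)

// ...or just for a single worker thread.
val worker = new Thread(new Runnable {
  override def run(): Unit = throw new IllegalStateException("boom")
})
worker.setUncaughtExceptionHandler(SparkUncaughtExceptionHandler)
worker.start()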
Example 47
Source File: BlockManagerSlaveEndpoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import scala.concurrent.{ExecutionContext, Future}

import org.apache.spark.rpc.{RpcEnv, RpcCallContext, RpcEndpoint}
import org.apache.spark.util.ThreadUtils
import org.apache.spark.{Logging, MapOutputTracker, SparkEnv}
import org.apache.spark.storage.BlockManagerMessages._

private[storage]
class BlockManagerSlaveEndpoint(
    override val rpcEnv: RpcEnv,
    blockManager: BlockManager, // the executor's BlockManager, used to carry out the requested block operations
    mapOutputTracker: MapOutputTracker)
  extends RpcEndpoint with Logging {

  private val asyncThreadPool =
    ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool")
  private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool)

  // Operations that involve removing blocks may be slow and should be done asynchronously
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    // Remove the block identified by blockId from this executor
    case RemoveBlock(blockId) =>
      doAsync[Boolean]("removing block " + blockId, context) {
        blockManager.removeBlock(blockId)
        true
      }
    // On RemoveRdd from the BlockManagerMasterEndpoint, remove all blocks on this executor
    // that belong to the RDD with the given id
    case RemoveRdd(rddId) =>
      doAsync[Int]("removing RDD " + rddId, context) {
        blockManager.removeRdd(rddId)
      }
    // Remove all blocks on this executor related to the shuffle with the given id
    case RemoveShuffle(shuffleId) =>
      doAsync[Boolean]("removing shuffle " + shuffleId, context) {
        if (mapOutputTracker != null) {
          mapOutputTracker.unregisterShuffle(shuffleId)
        }
        SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
      }
    // Remove all blocks on this executor related to the given broadcast variable
    case RemoveBroadcast(broadcastId, _) =>
      doAsync[Int]("removing broadcast " + broadcastId, context) {
        // tellMaster: whether to report the resulting block status back to the master
        blockManager.removeBroadcast(broadcastId, tellMaster = true)
      }
    // Return the status of the block identified by blockId to the master
    case GetBlockStatus(blockId, _) =>
      context.reply(blockManager.getStatus(blockId))
    // Return the ids of the blocks on this executor that match the given filter
    case GetMatchingBlockIds(filter, _) =>
      context.reply(blockManager.getMatchingBlockIds(filter))
  }

  // Curried helper: runs the body asynchronously and replies to the caller when it completes
  private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) {
    val future = Future {
      logDebug(actionMessage)
      body
    }
    future.onSuccess { case response =>
      logDebug("Done " + actionMessage + ", response is " + response)
      context.reply(response)
      logDebug("Sent response: " + response + " to " + context.sender)
    }
    future.onFailure { case t: Throwable =>
      logError("Error in " + actionMessage, t)
      context.sendFailure(t)
    }
  }

  override def onStop(): Unit = {
    asyncThreadPool.shutdownNow()
  }
}
Example 48
Source File: LocalRDDCheckpointData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext}
import org.apache.spark.storage.{RDDBlockId, StorageLevel}
import org.apache.spark.util.Utils

  def transformStorageLevel(level: StorageLevel): StorageLevel = {
    // If this RDD is to be cached off-heap, fail fast since we cannot provide any
    // correctness guarantees about subsequent computations after the first one
    if (level.useOffHeap) {
      throw new SparkException("Local checkpointing is not compatible with off-heap caching.")
    }

    StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication)
  }
}
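LocalRDDCheckpointData backs the user-facing RDD.localCheckpoint() call, which truncates lineage using executor-local storage and rewrites the storage level to use disk as shown above. A short sketch of that public API (application name and data are made up):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(
  new SparkConf().setMaster("local[*]").setAppName("local-checkpoint-demo"))
val doubled = sc.parallelize(1 to 100).map(_ * 2)
doubled.localCheckpoint()      // lineage is truncated using executor-local block storage
doubled.count()                // first action materializes the checkpointed blocks
println(doubled.toDebugString) // shows the rewritten (disk-backed) storage level
sc.stop()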
Example 49
Source File: TestClient.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.client

import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SecurityManager, SparkConf}

private[spark] object TestClient {

  private class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = if (args.isEmpty) "127.0.0.1" else args(0)
    val conf = new SparkConf
    val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf))
    // stripSuffix removes the given suffix from the end of the string, if present
    val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$")
    println("====executorClassname======" + executorClassname)
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf)
    client.start()
    rpcEnv.awaitTermination()
  }
}
Example 50
Source File: HBasePartition.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.regionserver.RegionScanner import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range} import org.apache.spark.{Logging, Partition} private[hbase] class HBasePartition( val idx: Int, val mappedIndex: Int, start: Option[HBaseRawType] = None, end: Option[HBaseRawType] = None, val server: Option[String] = None, val filterPredicates: Option[Expression] = None, @transient relation: HBaseRelation = null, @transient val newScanner:RegionScanner = null) extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) with Partition with IndexMappable with Logging { override def index: Int = idx override def hashCode(): Int = idx @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) def computePredicate(relation: HBaseRelation): Option[Expression] = { val predicate = if (filterPredicates.isDefined && filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) { val oriPredicate = filterPredicates.get val predicateReferences = oriPredicate.references.toSeq val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) val row = new GenericMutableRow(predicateReferences.size) var rowIndex = 0 var i = 0 var range: Range[_] = null while (i < relation.keyColumns.size) { range = relation.generateRange(this, oriPredicate, i) if (range != null) { rowIndex = relation.rowIndex(predicateReferences, i) if (rowIndex >= 0) row.update(rowIndex, range) // if the non-last dimension range is not point, do not proceed to the next dims if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size else i = i + 1 } else i = relation.keyColumns.size } val pr = boundReference.partialReduce(row, predicateReferences) pr match { case (null, e: Expression) => Some(e) case (true, _) => None case (false, _) => Some(Literal(false)) } } else filterPredicates logInfo(predicate.toString) predicate } override def toString = { s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" } }
Example 51
Source File: BytesUtilsSuite.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Logging import org.apache.spark.sql.hbase.types.HBaseBytesType import org.apache.spark.sql.hbase.util.BinaryBytesUtils import org.apache.spark.sql.types._ import org.scalatest.{BeforeAndAfterAll, FunSuite} class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging { test("Bytes Ordering Test") { val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1, 0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257) val result = s.map(i => (i, BinaryBytesUtils.create(IntegerType).toBytes(i))) .sortWith((f, s) => HBaseBytesType.ordering.gt( f._2.asInstanceOf[HBaseBytesType.InternalType], s._2.asInstanceOf[HBaseBytesType.InternalType])) assert(result.map(a => a._1) == s.sorted.reverse) } def compare(a: Array[Byte], b: Array[Byte]): Int = { val length = Math.min(a.length, b.length) var result: Int = 0 for (i <- 0 to length - 1) { val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte] if (diff != 0) { result = diff } } result } test("Bytes Utility Test") { assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType) .toBytes(input = true), 0) === true) assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType) .toBytes(input = false), 0) === false) assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(12.34d), 0) === 12.34d) assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(-12.34d), 0) === -12.34d) assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(12.34f), 0) === 12.34f) assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(-12.34f), 0) === -12.34f) assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(12), 0) === 12) assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(-12), 0) === -12) assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(1234l), 0) === 1234l) assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(-1234l), 0) === -1234l) assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType) .toBytes(12.asInstanceOf[Short]), 0) === 12) assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType) .toBytes(-12.asInstanceOf[Short]), 0) === -12) assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes("abc"), 0, 3) === UTF8String("abc")) assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String("")) assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType) .toBytes(5.asInstanceOf[Byte]), 0) === 5) assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType) .toBytes(-5.asInstanceOf[Byte]), 0) === -5) assert(compare(BinaryBytesUtils.create(IntegerType).toBytes(128), BinaryBytesUtils.create(IntegerType).toBytes(-128)) > 0) } test("byte array plus one") { var byteArray = Array[Byte](0x01.toByte, 127.toByte) assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0) byteArray = Array[Byte](0xff.toByte, 0xff.toByte) assert(BinaryBytesUtils.addOne(byteArray) == null) byteArray = Array[Byte](0x02.toByte, 0xff.toByte) assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0) } test("float comparison") { val f1 = BinaryBytesUtils.create(FloatType).toBytes(-1.23f) val f2 = BinaryBytesUtils.create(FloatType).toBytes(100f) assert(Bytes.compareTo(f1, f2) < 0) } }
Example 52
Source File: MeetupReceiver.scala From meetup-stream with Apache License 2.0 | 5 votes |
package receiver import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.storage.StorageLevel import org.apache.spark.Logging import com.ning.http.client.AsyncHttpClientConfig import com.ning.http.client._ import scala.collection.mutable.ArrayBuffer import java.io.OutputStream import java.io.ByteArrayInputStream import java.io.InputStreamReader import java.io.BufferedReader import java.io.InputStream import java.io.PipedInputStream import java.io.PipedOutputStream class MeetupReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging { @transient var client: AsyncHttpClient = _ @transient var inputPipe: PipedInputStream = _ @transient var outputPipe: PipedOutputStream = _ def onStart() { val cf = new AsyncHttpClientConfig.Builder() cf.setRequestTimeout(Integer.MAX_VALUE) cf.setReadTimeout(Integer.MAX_VALUE) cf.setPooledConnectionIdleTimeout(Integer.MAX_VALUE) client= new AsyncHttpClient(cf.build()) inputPipe = new PipedInputStream(1024 * 1024) outputPipe = new PipedOutputStream(inputPipe) val producerThread = new Thread(new DataConsumer(inputPipe)) producerThread.start() client.prepareGet(url).execute(new AsyncHandler[Unit]{ def onBodyPartReceived(bodyPart: HttpResponseBodyPart) = { bodyPart.writeTo(outputPipe) AsyncHandler.STATE.CONTINUE } def onStatusReceived(status: HttpResponseStatus) = { AsyncHandler.STATE.CONTINUE } def onHeadersReceived(headers: HttpResponseHeaders) = { AsyncHandler.STATE.CONTINUE } def onCompleted = { println("completed") } def onThrowable(t: Throwable)={ t.printStackTrace() } }) } def onStop() { if (Option(client).isDefined) client.close() if (Option(outputPipe).isDefined) { outputPipe.flush() outputPipe.close() } if (Option(inputPipe).isDefined) { inputPipe.close() } } class DataConsumer(inputStream: InputStream) extends Runnable { override def run() { val bufferedReader = new BufferedReader( new InputStreamReader( inputStream )) var input=bufferedReader.readLine() while(input!=null){ store(input) input=bufferedReader.readLine() } } } }
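A sketch of wiring the receiver above into a streaming job via StreamingContext.receiverStream. The stream URL is a placeholder for whatever Meetup endpoint the application actually consumes:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import receiver.MeetupReceiver

val conf = new SparkConf().setMaster("local[2]").setAppName("meetup-receiver-demo")
val ssc = new StreamingContext(conf, Seconds(5))

// Placeholder endpoint; substitute the real stream URL.
val rsvps = ssc.receiverStream(new MeetupReceiver("http://stream.meetup.com/2/rsvps"))
rsvps.print()

ssc.start()
ssc.awaitTermination()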
Example 53
Source File: Loggable.scala From meetup-stream with Apache License 2.0 | 5 votes |
package core import org.apache.spark.Logging import org.apache.log4j.{Level, Logger} def setStreamingLogLevels() { val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements if (!log4jInitialized) { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [ERROR] for streaming example." + " To override add a custom log4j.properties to the classpath.") Logger.getRootLogger.setLevel(Level.WARN) Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("streaming").setLevel(Level.WARN) Logger.getLogger("spark").setLevel(Level.WARN) } } }
Example 54
Source File: DemoUtils.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector.demo import org.apache.spark.{ Logging, SparkContext, SparkConf } import com.metreta.spark.orientdb.connector.SparkContextFunctions trait DemoUtils extends Logging { val OrientDBNodesProperty = "spark.orientdb.connection.nodes" val DefaultOrientDBNodesProperty = "127.0.0.1" val OriendtDBProtocolProperty = "spark.orientdb.protocol" val DefaultOriendtDBProtocolProperty = "plocal" val OriendtDBDBNameProperty = "spark.orientdb.dbname" // val DefaultOriendtDBDBNameProperty = "testdb" val DefaultOriendtDBDBNameProperty = """/path/to/orient""" val OriendtDBPortProperty = "spark.orientdb.port" val DefaultOriendtDBPortProperty = "2424" val OriendtDBUserProperty = "spark.orientdb.user" val DefaultOriendtDBUser = "admin" val OriendtDBPasswordProperty = "spark.orientdb.password" val DefaultOriendtDBPassword = "admin" val OriendtDBClusterModeProperty = "spark.orientdb.clustermode" //remote-colocated val DefaultOriendtDBClusterMode = "colocated" implicit def toSparkContextFunctions(sc: SparkContext): SparkContextFunctions = new SparkContextFunctions(sc) val conf = new SparkConf() .setMaster("local[*]") .setAppName("demo") .set(OrientDBNodesProperty, DefaultOrientDBNodesProperty) .set(OriendtDBProtocolProperty, DefaultOriendtDBProtocolProperty) .set(OriendtDBDBNameProperty, DefaultOriendtDBDBNameProperty) .set(OriendtDBPortProperty, DefaultOriendtDBPortProperty) .set(OriendtDBUserProperty, DefaultOriendtDBUser) .set(OriendtDBPasswordProperty, DefaultOriendtDBPassword) .set(OriendtDBClusterModeProperty, DefaultOriendtDBClusterMode) lazy val sc = new SparkContext(conf) } object DemoUtils { def apply(): DemoUtils = new DemoUtils {} }
Example 55
Source File: ClassJsonRDDFunctions.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector import com.metreta.spark.orientdb.connector.api.OrientDBConnector import com.orientechnologies.orient.core.record.impl.ODocument import org.apache.spark.Logging import org.apache.spark.rdd.RDD class ClassJsonRDDFunctions(rdd: RDD[String]) extends Serializable with Logging { def saveJsonToOrient(myClass: String)(implicit connector: OrientDBConnector = OrientDBConnector(rdd.sparkContext.getConf)): Unit = { rdd.foreachPartition { partition ⇒ val db = connector.databaseDocumentTx() while (partition.hasNext) { val obj = partition.next() val doc = new ODocument(myClass); doc.fromJSON(obj) db.save(doc) } db.commit() db.close() } } }
Example 56
Source File: ClassRDDPartitioner.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector.rdd.partitioner import scala.collection.JavaConversions.iterableAsScalaIterable import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.Partition import com.metreta.spark.orientdb.connector.api.OrientDBConnector import com.orientechnologies.orient.core.metadata.schema.OClass import com.orientechnologies.orient.core.metadata.schema.OSchema import com.orientechnologies.orient.core.storage.OStorage import com.metreta.spark.orientdb.connector.SystemTables import scala.collection.JavaConversions.iterableAsScalaIterable def getPartitions(): Array[Partition] = { val db = connector.databaseDocumentTx() var partitions = new ArrayBuffer[OrientPartition] val schema: OSchema = connector.getSchema(db) var klass: OClass = schema.getClass(mClass) val storage: OStorage = connector.getStorage(db) klass.getClusterIds.zipWithIndex foreach { case (clusterId, index) => partitions = partitions.+=(OrientPartition( index, null, // <- Host Address ????? PartitionName(klass.getName, storage.getClusterById(clusterId).getName))) } partitions.toArray } }
Example 57
Source File: CustomReceiver.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{InputStreamReader, BufferedReader, InputStream} import java.net.Socket import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8")) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
Example 58
Source File: StreamingExamples.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.Logging import org.apache.log4j.{Level, Logger} def setStreamingLogLevels() { val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements if (!log4jInitialized) { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [WARN] for streaming example." + " To override add a custom log4j.properties to the classpath.") Logger.getRootLogger.setLevel(Level.WARN) } } }
Example 59
Source File: GraphLoader.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
Example 60
Source File: CachedRDDManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.recursion import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.{HashMap, HashSet, Set} class CachedRDDManager(defaultStorageLevel: StorageLevel) extends Logging with Serializable { val iterationToRDDMap = new HashMap[Int, HashSet[RDD[_]]] var currentIteration : Int = 0 def persist(rdd: RDD[_]): Unit = { persist(rdd, false) } def persist(rdd: RDD[_], doMemoryCheckpoint: Boolean): Unit = { iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[RDD[_]]).add(rdd) rdd.persist(defaultStorageLevel) if (doMemoryCheckpoint) rdd.memoryCheckpoint() } def cleanUpIteration(iterationsBackToRemove: Int = 2) = { val start = System.currentTimeMillis() if (currentIteration >= iterationsBackToRemove) { val iterationId = currentIteration - iterationsBackToRemove if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get if (rdds.nonEmpty) logInfo("Unpersisting "+rdds.size+" rdds for iteration " + iterationId) rdds.foreach(rdd => rdd.unpersist(false)) } } logInfo("CleanUpIteration took " + (System.currentTimeMillis() - start) + " ms") currentIteration += 1 } def cleanUpIterationById(iterationId: Int) = { if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get rdds.foreach(rdd => rdd.unpersist(false)) } } def incrementIteration() { currentIteration += 1} def clear() = { iterationToRDDMap.clear() } def clear(remainCached: Seq[RDD[_]]) = { iterationToRDDMap.keySet.foreach(key => logInfo("key: " + key + " value: " + iterationToRDDMap.get(key))) iterationToRDDMap.keySet .foreach(key => iterationToRDDMap.get(key) .foreach(value => value.foreach(item => {if (!remainCached.contains(item)) item.unpersist(false)}))) iterationToRDDMap.clear() } def unpersist(rdds: Set[RDD[_]]) = { for (rdd <- rdds) { iterationToRDDMap.synchronized { // rdd should only be in 1 iteration val iterations = iterationToRDDMap.filter(x => x._2.contains(rdd)) if (iterations.nonEmpty) { val iteration = iterations.head iteration._2.remove(rdd) rdd.unpersist(false) if (iteration._2.isEmpty) iterationToRDDMap.remove(iteration._1) } } } } override def toString = { val output = new StringBuilder iterationToRDDMap.keySet.toSeq.sorted .foreach(iteration => { val rdds = iterationToRDDMap.get(iteration) rdds.foreach(rdd => output.append(iteration + ":" + rdd + "\n")) }) output.toString() } }
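A hedged sketch of how a driver might use the manager above during an iterative computation. Names and data are illustrative, and only the plain persist path is shown (the doMemoryCheckpoint flag relies on a BigDatalog-specific RDD extension):

import edu.ucla.cs.wis.bigdatalog.spark.execution.recursion.CachedRDDManager
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

val sc = new SparkContext(
  new SparkConf().setMaster("local[*]").setAppName("cached-rdd-demo"))
val manager = new CachedRDDManager(StorageLevel.MEMORY_ONLY)

var delta = sc.parallelize(1 to 10)
for (_ <- 1 to 3) {
  delta = delta.map(_ + 1)
  manager.persist(delta)       // cache and register under the current iteration
  manager.cleanUpIteration()   // unpersist RDDs registered two iterations back
}
manager.clear()
sc.stop()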
Example 61
Source File: QuerySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import org.scalatest.FunSuite import scala.collection.mutable.ArrayBuffer abstract class QuerySuite extends FunSuite with Logging { case class TestCase(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String], answersSize: Int) { def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) = this(program, query, data, null, answersSize) def this(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String]) = this(program, query, data, answers, answers.size) } def runTest(testCase: TestCase): Unit = runTests(Seq(testCase)) def runTests(testCases: Seq[TestCase]): Unit = { val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf() .set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "../logs") .set("spark.ui.enabled", "false") .set("spark.sql.shuffle.partitions", "5") .setAll(Map.empty[String, String]) ) val bigDatalogCtx = new BigDatalogContext(sparkCtx) var count: Int = 1 for (testCase <- testCases) { bigDatalogCtx.loadProgram(testCase.program) for ((relationName, data) <- testCase.data) { val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName) if (relationInfo == null) throw new SparkException("You are attempting to load an unknown relation.") bigDatalogCtx.registerAndLoadTable(relationName, data, bigDatalogCtx.conf.numShufflePartitions) } val query = testCase.query val answers = testCase.answers logInfo("========== START BigDatalog Query " + count + " START ==========") val program = bigDatalogCtx.query(query) val results = program.execute().collect() // for some test cases we will only know the size of the answer set, not the actual answers if (answers == null) { assert(results.size == testCase.answersSize) } else { if (results.size != answers.size) { displayDifferences(results.map(_.toString), answers) // yes this will fail assert(results.size == answers.size) } else { for (result <- results) assert(answers.contains(result.toString())) } val resultStrings = results.map(_.toString).toSet for (answer <- answers) assert(resultStrings.contains(answer.toString())) } logInfo("========== END BigDatalog Query " + count + " END ==========\n") count += 1 bigDatalogCtx.reset() } sparkCtx.stop() } private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = { val missingAnswers = new ArrayBuffer[String] val missingResults = new ArrayBuffer[String] for (result <- results) if (!answers.contains(result)) missingAnswers += result for (answer <- answers) if (!results.contains(answer)) missingResults += answer if (missingAnswers.nonEmpty) logInfo("Results not in Answers: " + missingAnswers.mkString(", ")) if (missingResults.nonEmpty) logInfo("Answers not in Results: " + missingResults.mkString(", ")) } }
Example 62
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import com.google.common.base.Objects import org.apache.spark.Logging import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @Since("1.6.0") class DefaultSource extends RelationProvider with DataSourceRegister { @Since("1.6.0") override def shortName(): String = "libsvm" @Since("1.6.0") override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) : BaseRelation = { val path = parameters.getOrElse("path", throw new IllegalArgumentException("'path' must be specified")) val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt val vectorType = parameters.getOrElse("vectorType", "sparse") new LibSVMRelation(path, numFeatures, vectorType)(sqlContext) } }
Example 63
Source File: Transformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import scala.annotation.varargs import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ protected def validateInputType(inputType: DataType): Unit = {} override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false) StructType(outputFields) } override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) dataset.withColumn($(outputCol), callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol)))) } override def copy(extra: ParamMap): T = defaultCopy(extra) }
Example 64
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering import scala.util.Random import org.apache.spark.Logging import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} def kMeansPlusPlus( seed: Int, points: Array[VectorWithNorm], weights: Array[Double], k: Int, maxIterations: Int ): Array[VectorWithNorm] = { val rand = new Random(seed) val dimensions = points(0).vector.size val centers = new Array[VectorWithNorm](k) // Initialize centers by sampling using the k-means++ procedure. centers(0) = pickWeighted(rand, points, weights).toDense for (i <- 1 until k) { // Pick the next center with a probability proportional to cost under current centers val curCenters = centers.view.take(i) val sum = points.view.zip(weights).map { case (p, w) => w * KMeans.pointCost(curCenters, p) }.sum val r = rand.nextDouble() * sum var cumulativeScore = 0.0 var j = 0 while (j < points.length && cumulativeScore < r) { cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j)) j += 1 } if (j == 0) { logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." + s" Using duplicate point for center k = $i.") centers(i) = points(0).toDense } else { centers(i) = points(j - 1).toDense } } // Run up to maxIterations iterations of Lloyd's algorithm val oldClosest = Array.fill(points.length)(-1) var iteration = 0 var moved = true while (moved && iteration < maxIterations) { moved = false val counts = Array.fill(k)(0.0) val sums = Array.fill(k)(Vectors.zeros(dimensions)) var i = 0 while (i < points.length) { val p = points(i) val index = KMeans.findClosest(centers, p)._1 axpy(weights(i), p.vector, sums(index)) counts(index) += weights(i) if (index != oldClosest(i)) { moved = true oldClosest(i) = index } i += 1 } // Update centers var j = 0 while (j < k) { if (counts(j) == 0.0) { // Assign center to a random point centers(j) = points(rand.nextInt(points.length)).toDense } else { scal(1.0 / counts(j), sums(j)) centers(j) = new VectorWithNorm(sums(j)) } j += 1 } iteration += 1 } if (iteration == maxIterations) { logInfo(s"Local KMeans++ reached the max number of iterations: $maxIterations.") } else { logInfo(s"Local KMeans++ converged in $iteration iterations.") } centers } private def pickWeighted[T](rand: Random, data: Array[T], weights: Array[Double]): T = { val r = rand.nextDouble() * weights.sum var i = 0 var curWeight = 0.0 while (i < data.length && curWeight < r) { curWeight += weights(i) i += 1 } data(i - 1) } }
Example 65
Source File: PearsonCorrelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 66
Source File: SpearmanCorrelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 67
Source File: DataValidators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.Logging import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
Example 68
Source File: TwitterInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import twitter4j._ import twitter4j.auth.Authorization import twitter4j.conf.ConfigurationBuilder import twitter4j.auth.OAuthAuthorization import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ import org.apache.spark.storage.StorageLevel import org.apache.spark.Logging import org.apache.spark.streaming.receiver.Receiver private[streaming] class TwitterInputDStream( ssc_ : StreamingContext, twitterAuth: Option[Authorization], filters: Seq[String], storageLevel: StorageLevel ) extends ReceiverInputDStream[Status](ssc_) { private def createOAuthAuthorization(): Authorization = { new OAuthAuthorization(new ConfigurationBuilder().build()) } private val authorization = twitterAuth.getOrElse(createOAuthAuthorization()) override def getReceiver(): Receiver[Status] = { new TwitterReceiver(authorization, filters, storageLevel) } } private[streaming] class TwitterReceiver( twitterAuth: Authorization, filters: Seq[String], storageLevel: StorageLevel ) extends Receiver[Status](storageLevel) with Logging { @volatile private var twitterStream: TwitterStream = _ @volatile private var stopped = false def onStart() { try { val newTwitterStream = new TwitterStreamFactory().getInstance(twitterAuth) newTwitterStream.addListener(new StatusListener { def onStatus(status: Status): Unit = { store(status) } // Unimplemented def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) {} def onTrackLimitationNotice(i: Int) {} def onScrubGeo(l: Long, l1: Long) {} def onStallWarning(stallWarning: StallWarning) {} def onException(e: Exception) { if (!stopped) { restart("Error receiving tweets", e) } } }) val query = new FilterQuery if (filters.size > 0) { query.track(filters.mkString(",")) newTwitterStream.filter(query) } else { newTwitterStream.sample() } setTwitterStream(newTwitterStream) logInfo("Twitter receiver started") stopped = false } catch { case e: Exception => restart("Error starting Twitter stream", e) } } def onStop() { stopped = true setTwitterStream(null) logInfo("Twitter receiver stopped") } private def setTwitterStream(newTwitterStream: TwitterStream) = synchronized { if (twitterStream != null) { twitterStream.shutdown() } twitterStream = newTwitterStream } }
Example 69
Source File: TwitterStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import org.scalatest.BeforeAndAfter import twitter4j.Status import twitter4j.auth.{NullAuthorization, Authorization} import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging { val batchDuration = Seconds(1) private val master: String = "local[2]" private val framework: String = this.getClass.getSimpleName test("twitter input stream") { val ssc = new StreamingContext(master, framework, batchDuration) val filters = Seq("filter1", "filter2") val authorization: Authorization = NullAuthorization.getInstance() // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters) val test3: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization)) val test5: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization), filters) val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream( ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2) // Note that actually testing the data receiving is hard as authentication keys are // necessary for accessing Twitter live stream ssc.stop() } }
Example 70
Source File: EventTransformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.io.{ObjectOutput, ObjectInput} import scala.collection.JavaConverters._ import org.apache.spark.util.Utils import org.apache.spark.Logging private[streaming] object EventTransformer extends Logging { def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence], Array[Byte]) = { val bodyLength = in.readInt() val bodyBuff = new Array[Byte](bodyLength) in.readFully(bodyBuff) val numHeaders = in.readInt() val headers = new java.util.HashMap[CharSequence, CharSequence] for (i <- 0 until numHeaders) { val keyLength = in.readInt() val keyBuff = new Array[Byte](keyLength) in.readFully(keyBuff) val key: String = Utils.deserialize(keyBuff) val valLength = in.readInt() val valBuff = new Array[Byte](valLength) in.readFully(valBuff) val value: String = Utils.deserialize(valBuff) headers.put(key, value) } (headers, bodyBuff) } def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence], body: Array[Byte]) { out.writeInt(body.length) out.write(body) val numHeaders = headers.size() out.writeInt(numHeaders) for ((k, v) <- headers.asScala) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) val valBuff = Utils.serialize(v.toString) out.writeInt(valBuff.length) out.write(valBuff) } } }
Example 71
Source File: FlumeStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.concurrent.duration._ import scala.language.postfixOps import com.google.common.base.Charsets import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{Logging, SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 72
Source File: MQTTTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt import java.net.{ServerSocket, URI} import scala.language.postfixOps import com.google.common.base.Charsets.UTF_8 import org.apache.activemq.broker.{BrokerService, TransportConnector} import org.apache.commons.lang3.RandomUtils import org.eclipse.paho.client.mqttv3._ import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf} private[mqtt] class MQTTTestUtils extends Logging { private val persistenceDir = Utils.createTempDir() private val brokerHost = "localhost" private val brokerPort = findFreePort() private var broker: BrokerService = _ private var connector: TransportConnector = _ def brokerUri: String = { s"$brokerHost:$brokerPort" } def setup(): Unit = { broker = new BrokerService() broker.setDataDirectoryFile(Utils.createTempDir()) connector = new TransportConnector() connector.setName("mqtt") connector.setUri(new URI("mqtt://" + brokerUri)) broker.addConnector(connector) broker.start() } def teardown(): Unit = { if (broker != null) { broker.stop() broker = null } if (connector != null) { connector.stop() connector = null } Utils.deleteRecursively(persistenceDir) } private def findFreePort(): Int = { val candidatePort = RandomUtils.nextInt(1024, 65536) Utils.startServiceOnPort(candidatePort, (trialPort: Int) => { val socket = new ServerSocket(trialPort) socket.close() (null, trialPort) }, new SparkConf())._2 } def publishData(topic: String, data: String): Unit = { var client: MqttClient = null try { val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) client = new MqttClient("tcp://" + brokerUri, MqttClient.generateClientId(), persistence) client.connect() if (client.isConnected) { val msgTopic = client.getTopic(topic) val message = new MqttMessage(data.getBytes(UTF_8)) message.setQos(1) message.setRetained(true) for (i <- 0 to 10) { try { msgTopic.publish(message) } catch { case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT => // wait for Spark streaming to consume something from the message queue Thread.sleep(50) } } } } finally { if (client != null) { client.disconnect() client.close() client = null } } } }
Example 73
Source File: OrcFileOperator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 74
Source File: FiltersSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ class FiltersSuite extends SparkFunSuite with Logging { private val shim = new Shim_v0_13 private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test") private val varCharCol = new FieldSchema() varCharCol.setName("varchar") varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME) testTable.setPartCols(Collections.singletonList(varCharCol)) filterTest("string filter", (a("stringcol", StringType) > Literal("test")) :: Nil, "stringcol > \"test\"") filterTest("string filter backwards", (Literal("test") > a("stringcol", StringType)) :: Nil, "\"test\" > stringcol") filterTest("int filter", (a("intcol", IntegerType) === Literal(1)) :: Nil, "intcol = 1") filterTest("int filter backwards", (Literal(1) === a("intcol", IntegerType)) :: Nil, "1 = intcol") filterTest("int and string filter", (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil, "1 = intcol and \"a\" = strcol") filterTest("skip varchar", (Literal("") === a("varchar", StringType)) :: Nil, "") private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name){ val converted = shim.convertFilters(testTable, filters) if (converted != result) { fail( s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'") } } } private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() }
Example 75
Source File: SparkSQLDriver.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{Arrays, ArrayList => JArrayList, List => JList} import org.apache.log4j.LogManager import org.apache.spark.sql.AnalysisException import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.Logging import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} private[hive] class SparkSQLDriver( val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.stringResult() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 76
Source File: SparkSQLOperationManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server import java.util.{Map => JMap} import scala.collection.mutable.Map import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils} private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = Map[SessionHandle, String]() val sessionToContexts = Map[SessionHandle, HiveContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { val hiveContext = sessionToContexts(parentSession.getSessionHandle) val runInBackground = async && hiveContext.hiveThriftServerAsync val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(hiveContext, sessionToActivePool) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") operation } }
Example 77
Source File: ThriftServerTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 78
Source File: SparkSQLEnv.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

// Enclosing object declaration and the two mutable fields referenced by stop() are restored so
// the snippet reads as valid Scala; the init() method of the full source is elided in this
// listing.
private[hive] object SparkSQLEnv extends Logging {

  var hiveContext: HiveContext = _
  var sparkContext: SparkContext = _

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 79
Source File: BoundAttribute.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression with NamedExpression { override def toString: String = s"input[$ordinal, $dataType]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def name: String = s"i[$ordinal]" override def toAttribute: Attribute = throw new UnsupportedOperationException override def qualifiers: Seq[String] = throw new UnsupportedOperationException override def exprId: ExprId = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value); """ } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: Seq[Attribute], allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexWhere(_.exprId == a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, a.nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
Example 80
Source File: RuleExecutor.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.rules

import scala.collection.JavaConverters._

import com.google.common.util.concurrent.AtomicLongMap

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.util.sideBySide

object RuleExecutor {
  protected val timeMap = AtomicLongMap.create[String]()
}

// The abstract class declaration and the Strategy/Batch members referenced by execute() were
// dropped from this listing; they are restored here in abbreviated form so the snippet is
// self-contained.
abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging {

  abstract class Strategy { def maxIterations: Int }
  case object Once extends Strategy { val maxIterations = 1 }
  case class FixedPoint(maxIterations: Int) extends Strategy
  protected case class Batch(name: String, strategy: Strategy, rules: Rule[TreeType]*)

  // Sequence of rule batches, to be overridden by concrete implementations.
  protected val batches: Seq[Batch]

  def execute(plan: TreeType): TreeType = {
    var curPlan = plan

    batches.foreach { batch =>
      val batchStartPlan = curPlan
      var iteration = 1
      var lastPlan = curPlan
      var continue = true

      // Run until fix point (or the max number of iterations as specified in the strategy).
      while (continue) {
        curPlan = batch.rules.foldLeft(curPlan) {
          case (plan, rule) =>
            val startTime = System.nanoTime()
            val result = rule(plan)
            val runTime = System.nanoTime() - startTime
            RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime)

            if (!result.fastEquals(plan)) {
              logTrace(
                s"""
                  |=== Applying Rule ${rule.ruleName} ===
                  |${sideBySide(plan.treeString, result.treeString).mkString("\n")}
                """.stripMargin)
            }

            result
        }
        iteration += 1
        if (iteration > batch.strategy.maxIterations) {
          // Only log if this is a rule that is supposed to run more than once.
          if (iteration != 2) {
            logInfo(s"Max iterations (${iteration - 1}) reached for batch ${batch.name}")
          }
          continue = false
        }

        if (curPlan.fastEquals(lastPlan)) {
          logTrace(
            s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.")
          continue = false
        }
        lastPlan = curPlan
      }

      if (!batchStartPlan.fastEquals(curPlan)) {
        logDebug(
          s"""
            |=== Result of Batch ${batch.name} ===
            |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")}
          """.stripMargin)
      } else {
        logTrace(s"Batch ${batch.name} has no effect.")
      }
    }

    curPlan
  }
}
Example 81
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 82
Source File: DriverRegistry.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.sql.{Driver, DriverManager} import scala.collection.mutable import org.apache.spark.Logging import org.apache.spark.util.Utils object DriverRegistry extends Logging { private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty def register(className: String): Unit = { val cls = Utils.getContextOrSparkClassLoader.loadClass(className) if (cls.getClassLoader == null) { logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") } else if (wrapperMap.get(className).isDefined) { logTrace(s"Wrapper for $className already exists") } else { synchronized { if (wrapperMap.get(className).isEmpty) { val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) DriverManager.registerDriver(wrapper) wrapperMap(className) = wrapper logTrace(s"Wrapper for $className registered") } } } } }
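For context, a caller registers a JDBC driver class once before opening connections; a hedged sketch of that call (the PostgreSQL class name is only an illustration, and the snippet assumes a spark-shell style session):

import org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry

// The first call wraps the driver and registers it with java.sql.DriverManager;
// a repeated call is a no-op that is only logged at TRACE level.
DriverRegistry.register("org.postgresql.Driver")
DriverRegistry.register("org.postgresql.Driver")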
Example 83
Source File: FrequentItems.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, Column, DataFrame} private[sql] object FrequentItems extends Logging { private[sql] def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4, s"support ($support) must be greater than 1e-4.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes new DataFrame(df.sqlContext, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
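The public entry point for this helper is DataFrame.stat.freqItems, which delegates to singlePassFreqItems above. A brief usage sketch, assuming a spark-shell session with an existing sqlContext (column names and data are illustrative):

import sqlContext.implicits._

val df = Seq((1, "a"), (1, "b"), (2, "a"), (1, "a")).toDF("num", "letter")

// Items that appear in at least 40% of the rows of each column.
val freq = df.stat.freqItems(Seq("num", "letter"), 0.4)
freq.show()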
Example 84
Source File: CompressibleColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } protected def isWorthCompressing(encoder: Encoder[T]) = { encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { var i = 0 while (i < compressionEncoders.length) { compressionEncoders(i).gatherCompressibilityStats(row, ordinal) i += 1 } } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } }
Example 85
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 86
Source File: SocketInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator

import scala.reflect.ClassTag

import java.io._
import java.net.{UnknownHostException, Socket}

import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection.
    // receive(), which opens the socket and stores the converted objects, is elided in this
    // listing.
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
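A short usage sketch of the public API layered on top of this receiver, assuming a local build and a text server on port 9999 (host and port are placeholders); socketTextStream wires SocketInputDStream together with a line-based converter like bytesToLines above:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketWordCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SocketWordCountSketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Creates a ReceiverInputDStream[String] backed by the SocketReceiver shown above.
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }
}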
Example 87
Source File: StreamingTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 88
Source File: StreamingListenerBus.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.Logging import org.apache.spark.util.AsynchronousListenerBus private[spark] class StreamingListenerBus extends AsynchronousListenerBus[StreamingListener, StreamingListenerEvent]("StreamingListenerBus") with Logging { private val logDroppedEvent = new AtomicBoolean(false) override def onPostEvent(listener: StreamingListener, event: StreamingListenerEvent): Unit = { event match { case receiverStarted: StreamingListenerReceiverStarted => listener.onReceiverStarted(receiverStarted) case receiverError: StreamingListenerReceiverError => listener.onReceiverError(receiverError) case receiverStopped: StreamingListenerReceiverStopped => listener.onReceiverStopped(receiverStopped) case batchSubmitted: StreamingListenerBatchSubmitted => listener.onBatchSubmitted(batchSubmitted) case batchStarted: StreamingListenerBatchStarted => listener.onBatchStarted(batchStarted) case batchCompleted: StreamingListenerBatchCompleted => listener.onBatchCompleted(batchCompleted) case outputOperationStarted: StreamingListenerOutputOperationStarted => listener.onOutputOperationStarted(outputOperationStarted) case outputOperationCompleted: StreamingListenerOutputOperationCompleted => listener.onOutputOperationCompleted(outputOperationCompleted) case _ => } } override def onDropEvent(event: StreamingListenerEvent): Unit = { if (logDroppedEvent.compareAndSet(false, true)) { // Only log the following message once to avoid duplicated annoying logs. logError("Dropping StreamingListenerEvent because no remaining room in event queue. " + "This likely means one of the StreamingListeners is too slow and cannot keep up with the " + "rate at which events are being started by the scheduler.") } } }
Example 89
Source File: RecurringTimer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util

import org.apache.spark.Logging
import org.apache.spark.util.{Clock, SystemClock}

private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }

  @volatile private var prevTime = -1L
  @volatile private var nextTime = -1L
  @volatile private var stopped = false

  // The members below (getStartTime, start, stop, triggerActionForNextInterval) are called by
  // loop() and by main() further down but were dropped from this listing; they are restored
  // here in minimal form so the snippet is self-contained.
  def getStartTime(): Long = {
    (math.floor(clock.getTimeMillis().toDouble / period) + 1).toLong * period
  }

  def start(startTime: Long): Long = synchronized {
    nextTime = startTime
    thread.start()
    logInfo("Started timer for " + name + " at time " + nextTime)
    nextTime
  }

  def start(): Long = start(getStartTime())

  def stop(interruptTimer: Boolean): Long = synchronized {
    if (!stopped) {
      stopped = true
      if (interruptTimer) {
        thread.interrupt()
      }
      thread.join()
      logInfo("Stopped timer for " + name + " after time " + prevTime)
    }
    prevTime
  }

  // Waits until the next interval boundary, invokes the callback and advances the schedule.
  private def triggerActionForNextInterval(): Unit = {
    clock.waitTillTime(nextTime)
    callback(nextTime)
    prevTime = nextTime
    nextTime += period
    logDebug("Callback for " + name + " called at time " + prevTime)
  }

  private def loop() {
    try {
      while (!stopped) {
        triggerActionForNextInterval()
      }
      triggerActionForNextInterval()
    } catch {
      case e: InterruptedException =>
    }
  }
}

private[streaming]
object RecurringTimer extends Logging {

  def main(args: Array[String]) {
    var lastRecurTime = 0L
    val period = 1000

    def onRecur(time: Long) {
      val currentTime = System.currentTimeMillis()
      logInfo("" + currentTime + ": " + (currentTime - lastRecurTime))
      lastRecurTime = currentTime
    }

    val timer = new RecurringTimer(new SystemClock(), period, onRecur, "Test")
    timer.start()
    Thread.sleep(30 * 1000)
    timer.stop(true)
  }
}
Example 90
Source File: RawTextSender.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.{SparkConf, Logging} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 91
Source File: FileBasedWriteAheadLogReader.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{IOException, Closeable, EOFException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 92
Source File: RateLimitedOutputStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import scala.annotation.tailrec import java.io.OutputStream import java.util.concurrent.TimeUnit._ import org.apache.spark.Logging private[streaming] class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) extends OutputStream with Logging { require(desiredBytesPerSec > 0) private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS) private val CHUNK_SIZE = 8192 private var lastSyncTime = System.nanoTime private var bytesWrittenSinceSync = 0L override def write(b: Int) { waitToWrite(1) out.write(b) } override def write(bytes: Array[Byte]) { write(bytes, 0, bytes.length) } @tailrec override final def write(bytes: Array[Byte], offset: Int, length: Int) { val writeSize = math.min(length - offset, CHUNK_SIZE) if (writeSize > 0) { waitToWrite(writeSize) out.write(bytes, offset, writeSize) write(bytes, offset + writeSize, length) } } override def flush() { out.flush() } override def close() { out.close() } @tailrec private def waitToWrite(numBytes: Int) { val now = System.nanoTime val elapsedNanosecs = math.max(now - lastSyncTime, 1) val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs if (rate < desiredBytesPerSec) { // It's okay to write; just update some variables and return bytesWrittenSinceSync += numBytes if (now > lastSyncTime + SYNC_INTERVAL) { // Sync interval has passed; let's resync lastSyncTime = now bytesWrittenSinceSync = numBytes } } else { // Calculate how much time we should sleep to bring ourselves to the desired rate. val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec val elapsedTimeInMillis = elapsedNanosecs / 1000000 val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis if (sleepTimeInMillis > 0) { logTrace("Natural rate is " + rate + " per second but desired rate is " + desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.") Thread.sleep(sleepTimeInMillis) } waitToWrite(numBytes) } } }
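The throttling arithmetic above is easy to verify by hand: with 8192 bytes written since the last sync and a 4096 bytes/s target, targetTimeInMillis is 8192 * 1000 / 4096 = 2000 ms, so after 500 ms of elapsed time the stream sleeps roughly 1500 ms. A standalone sketch of just that calculation (not Spark API; RateLimitedOutputStream itself is private[streaming]):

object RateLimitSketch {
  // Mirrors the sleep computation in waitToWrite above.
  def sleepMillis(bytesSinceSync: Long, elapsedNanos: Long, desiredBytesPerSec: Int): Long = {
    val targetTimeInMillis = bytesSinceSync * 1000 / desiredBytesPerSec
    val elapsedTimeInMillis = elapsedNanos / 1000000
    math.max(0L, targetTimeInMillis - elapsedTimeInMillis)
  }

  def main(args: Array[String]): Unit = {
    // 8 KB written in 0.5 s against a 4 KB/s cap => sleep about 1.5 s more.
    println(sleepMillis(8192, 500000000L, 4096)) // prints 1500
  }
}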
Example 93
Source File: FailureSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.File import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkFunSuite, Logging} import org.apache.spark.util.Utils class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { private val batchDuration: Duration = Milliseconds(1000) private val numBatches = 30 private var directory: File = null before { directory = Utils.createTempDir() } after { if (directory != null) { Utils.deleteRecursively(directory) } StreamingContext.getActive().foreach { _.stop() } } test("multiple failures with map") { MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration) } test("multiple failures with updateStateByKey") { MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration) } }
Example 94
Source File: EventLogDownloadResource.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 95
Source File: NettyRpcCallContext.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import scala.concurrent.Promise import org.apache.spark.Logging import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc.{RpcAddress, RpcCallContext} private[netty] abstract class NettyRpcCallContext(override val senderAddress: RpcAddress) extends RpcCallContext with Logging { protected def send(message: Any): Unit override def reply(response: Any): Unit = { send(response) } override def sendFailure(e: Throwable): Unit = { send(RpcFailure(e)) } } private[netty] class RemoteNettyRpcCallContext( nettyEnv: NettyRpcEnv, callback: RpcResponseCallback, senderAddress: RpcAddress) extends NettyRpcCallContext(senderAddress) { override protected def send(message: Any): Unit = { val reply = nettyEnv.serialize(message) callback.onSuccess(reply) } }
Example 96
Source File: BlockTransferService.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
Example 97
Source File: NettyBlockRpcServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 98
Source File: MetricsConfig.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.metrics import java.io.{FileInputStream, InputStream} import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf} private[spark] class MetricsConfig(conf: SparkConf) extends Logging { private val DEFAULT_PREFIX = "*" private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties" private[metrics] val properties = new Properties() private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null private def setDefaultProperties(prop: Properties) { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") prop.setProperty("*.sink.servlet.path", "/metrics/json") prop.setProperty("master.sink.servlet.path", "/metrics/master/json") prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json") } def initialize() { // Add default properties in case there's no properties file setDefaultProperties(properties) loadPropertiesFromFile(conf.getOption("spark.metrics.conf")) // Also look for the properties in provided Spark configuration val prefix = "spark.metrics.conf." conf.getAll.foreach { case (k, v) if k.startsWith(prefix) => properties.setProperty(k.substring(prefix.length()), v) case _ => } propertyCategories = subProperties(properties, INSTANCE_REGEX) if (propertyCategories.contains(DEFAULT_PREFIX)) { val defaultProperty = propertyCategories(DEFAULT_PREFIX).asScala for((inst, prop) <- propertyCategories if (inst != DEFAULT_PREFIX); (k, v) <- defaultProperty if (prop.get(k) == null)) { prop.put(k, v) } } } def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] prop.asScala.foreach { kv => if (regex.findPrefixOf(kv._1.toString).isDefined) { val regex(prefix, suffix) = kv._1.toString subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2.toString) } } subProperties } def getInstance(inst: String): Properties = { propertyCategories.get(inst) match { case Some(s) => s case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties) } } private[this] def loadPropertiesFromFile(path: Option[String]): Unit = { var is: InputStream = null try { is = path match { case Some(f) => new FileInputStream(f) case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME) } if (is != null) { properties.load(is) } } catch { case e: Exception => val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME) logError(s"Error loading configuration file $file", e) } finally { if (is != null) { is.close() } } } }
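For reference, the key layout that INSTANCE_REGEX splits is <instance>.<rest>, where the instance is * or a component name such as master or worker. A small hedged sketch of how such keys group, using java.util.Properties directly (the sink and source class names come from the defaults above and from Spark's bundled metrics classes; MetricsConfig itself is private[spark]):

import java.util.Properties

object MetricsKeyLayoutSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet")
    props.setProperty("master.sink.servlet.path", "/metrics/master/json")
    props.setProperty("worker.source.jvm.class", "org.apache.spark.metrics.source.JvmSource")

    // subProperties(props, INSTANCE_REGEX) would group these keys into three instances:
    //   "*"      -> { sink.servlet.class = ...MetricsServlet }
    //   "master" -> { sink.servlet.path  = /metrics/master/json }
    //   "worker" -> { source.jvm.class   = ...JvmSource }
    val it = props.stringPropertyNames().iterator()
    while (it.hasNext) println(it.next())
  }
}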
Example 99
Source File: PythonGatewayServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import java.io.DataOutputStream import java.net.Socket import py4j.GatewayServer import org.apache.spark.Logging import org.apache.spark.util.Utils private[spark] object PythonGatewayServer extends Logging { def main(args: Array[String]): Unit = Utils.tryOrExit { // Start a GatewayServer on an ephemeral port val gatewayServer: GatewayServer = new GatewayServer(null, 0) gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { logError("GatewayServer failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") } // Communicate the bound port back to the caller via the caller-specified callback port val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") val callbackSocket = new Socket(callbackHost, callbackPort) val dos = new DataOutputStream(callbackSocket.getOutputStream) dos.writeInt(boundPort) dos.close() callbackSocket.close() // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { // Do nothing } logDebug("Exiting due to broken pipe from Python driver") System.exit(0) } }
Example 100
Source File: MesosExternalShuffleService.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.mesos import java.net.SocketAddress import java.nio.ByteBuffer import scala.collection.mutable import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.ExternalShuffleService import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler import org.apache.spark.network.shuffle.protocol.BlockTransferMessage import org.apache.spark.network.shuffle.protocol.mesos.RegisterDriver import org.apache.spark.network.util.TransportConf private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager) extends ExternalShuffleService(conf, securityManager) { protected override def newShuffleBlockHandler( conf: TransportConf): ExternalShuffleBlockHandler = { new MesosExternalShuffleBlockHandler(conf) } } private[spark] object MesosExternalShuffleService extends Logging { def main(args: Array[String]): Unit = { ExternalShuffleService.main(args, (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm)) } }
Example 101
Source File: MesosClusterDispatcher.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.mesos import java.util.concurrent.CountDownLatch import org.apache.spark.deploy.mesos.ui.MesosClusterUI import org.apache.spark.deploy.rest.mesos.MesosRestServer import org.apache.spark.scheduler.cluster.mesos._ import org.apache.spark.util.SignalLogger import org.apache.spark.{Logging, SecurityManager, SparkConf} private[mesos] class MesosClusterDispatcher( args: MesosClusterDispatcherArguments, conf: SparkConf) extends Logging { private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host) private val recoveryMode = conf.get("spark.mesos.deploy.recoveryMode", "NONE").toUpperCase() logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode) private val engineFactory = recoveryMode match { case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf) case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode) } private val scheduler = new MesosClusterScheduler(engineFactory, conf) private val server = new MesosRestServer(args.host, args.port, conf, scheduler) private val webUi = new MesosClusterUI( new SecurityManager(conf), args.webUiPort, conf, publicAddress, scheduler) private val shutdownLatch = new CountDownLatch(1) def start(): Unit = { webUi.bind() scheduler.frameworkUrl = webUi.activeWebUiUrl scheduler.start() server.start() } def awaitShutdown(): Unit = { shutdownLatch.await() } def stop(): Unit = { webUi.stop() server.stop() scheduler.stop() shutdownLatch.countDown() } } private[mesos] object MesosClusterDispatcher extends Logging { def main(args: Array[String]) { SignalLogger.register(log) val conf = new SparkConf val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf) conf.setMaster(dispatcherArgs.masterUrl) conf.setAppName(dispatcherArgs.name) dispatcherArgs.zookeeperUrl.foreach { z => conf.set("spark.mesos.deploy.recoveryMode", "ZOOKEEPER") conf.set("spark.mesos.deploy.zookeeper.url", z) } val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf) dispatcher.start() val shutdownHook = new Thread() { override def run() { logInfo("Shutdown hook is shutting down dispatcher") dispatcher.stop() dispatcher.awaitShutdown() } } Runtime.getRuntime.addShutdownHook(shutdownHook) dispatcher.awaitShutdown() } }
Example 102
Source File: SparkCuratorUtil.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.JavaConverters._ import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory} import org.apache.curator.retry.ExponentialBackoffRetry import org.apache.zookeeper.KeeperException import org.apache.spark.{Logging, SparkConf} private[spark] object SparkCuratorUtil extends Logging { private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000 private val ZK_SESSION_TIMEOUT_MILLIS = 60000 private val RETRY_WAIT_MILLIS = 5000 private val MAX_RECONNECT_ATTEMPTS = 3 def newClient( conf: SparkConf, zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = { val ZK_URL = conf.get(zkUrlConf) val zk = CuratorFrameworkFactory.newClient(ZK_URL, ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS, new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS)) zk.start() zk } def mkdir(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) == null) { try { zk.create().creatingParentsIfNeeded().forPath(path) } catch { case nodeExist: KeeperException.NodeExistsException => // do nothing, ignore node existing exception. case e: Exception => throw e } } } def deleteRecursive(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) != null) { for (child <- zk.getChildren.forPath(path).asScala) { zk.delete().forPath(path + "/" + child) } zk.delete().forPath(path) } } }
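A hedged usage sketch for the helper above. The ZooKeeper address and znode path are placeholders, and because SparkCuratorUtil is private[spark] the sketch assumes the calling code lives under the org.apache.spark package:

package org.apache.spark.example

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkCuratorUtil

object CuratorDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "localhost:2181")
    val zk = SparkCuratorUtil.newClient(conf)
    try {
      SparkCuratorUtil.mkdir(zk, "/spark/demo")            // create the znode if it is missing
      SparkCuratorUtil.deleteRecursive(zk, "/spark/demo")  // then remove it and its children
    } finally {
      zk.close()
    }
  }
}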
Example 103
Source File: TestClient.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.client import org.apache.spark.rpc.RpcEnv import org.apache.spark.{SecurityManager, SparkConf, Logging} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.util.Utils private[spark] object TestClient { private class TestListener extends AppClientListener with Logging { def connected(id: String) { logInfo("Connected to master, got app ID " + id) } def disconnected() { logInfo("Disconnected from master") System.exit(0) } def dead(reason: String) { logInfo("Application died with error: " + reason) System.exit(0) } def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {} def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {} } def main(args: Array[String]) { val url = args(0) val conf = new SparkConf val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf)) val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$") val desc = new ApplicationDescription("TestClient", Some(1), 512, Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored") val listener = new TestListener val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf) client.start() rpcEnv.awaitTermination() } }
Example 104
Source File: FileSystemPersistenceEngine.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 105
Source File: RecoveryModeFactory.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.{Logging, SparkConf} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 106
Source File: MasterWebUI.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import org.apache.spark.Logging import org.apache.spark.deploy.master.Master import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationsListResource, ApplicationInfo, UIRoot} import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def detachSparkUI(ui: SparkUI) { assert(serverInfo.isDefined, "Master UI must be bound to a server before detaching SparkUIs") ui.getHandlers.foreach(detachHandler) } def getApplicationInfoList: Iterator[ApplicationInfo] = { val state = masterPage.getMasterState val activeApps = state.activeApps.sortBy(_.startTime).reverse val completedApps = state.completedApps.sortBy(_.endTime).reverse activeApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, false) } ++ completedApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, true) } } def getSparkUI(appId: String): Option[SparkUI] = { val state = masterPage.getMasterState val activeApps = state.activeApps.sortBy(_.startTime).reverse val completedApps = state.completedApps.sortBy(_.endTime).reverse (activeApps ++ completedApps).find { _.id == appId }.flatMap { master.rebuildSparkUI } } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 107
Source File: ZooKeeperLeaderElectionAgent.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.{Logging, SparkConf} import org.apache.curator.framework.CuratorFramework import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch} import org.apache.spark.deploy.SparkCuratorUtil private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable, conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election" private var zk: CuratorFramework = _ private var leaderLatch: LeaderLatch = _ private var status = LeadershipStatus.NOT_LEADER start() private def start() { logInfo("Starting ZooKeeper LeaderElection agent") zk = SparkCuratorUtil.newClient(conf) leaderLatch = new LeaderLatch(zk, WORKING_DIR) leaderLatch.addListener(this) leaderLatch.start() } override def stop() { leaderLatch.close() zk.close() } override def isLeader() { synchronized { // could have lost leadership by now. if (!leaderLatch.hasLeadership) { return } logInfo("We have gained leadership") updateLeadershipStatus(true) } } override def notLeader() { synchronized { // could have gained leadership by now. if (leaderLatch.hasLeadership) { return } logInfo("We have lost leadership") updateLeadershipStatus(false) } } private def updateLeadershipStatus(isLeader: Boolean) { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER masterInstance.electedLeader() } else if (!isLeader && status == LeadershipStatus.LEADER) { status = LeadershipStatus.NOT_LEADER masterInstance.revokedLeadership() } } private object LeadershipStatus extends Enumeration { type LeadershipStatus = Value val LEADER, NOT_LEADER = Value } }
Example 108
Source File: ZooKeeperPersistenceEngine.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).map(deserializeFromFile[T]).flatten } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => { logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } } }
Example 109
Source File: CommandUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy] object CommandUtils extends Logging {

  // Continuously copy `in` into `file` on a background thread, appending to the file.
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 110
Source File: WorkerWebUI.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest import org.apache.spark.Logging import org.apache.spark.deploy.worker.Worker import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils def initialize() { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr, worker.conf)) } } private[worker] object WorkerWebUI { val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR val DEFAULT_RETAINED_DRIVERS = 1000 val DEFAULT_RETAINED_EXECUTORS = 1000 }
Example 111
Source File: WorkerWatcher.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import org.apache.spark.Logging import org.apache.spark.rpc._ private[spark] class WorkerWatcher( override val rpcEnv: RpcEnv, workerUrl: String, isTesting: Boolean = false) extends RpcEndpoint with Logging { logInfo(s"Connecting to worker $workerUrl") if (!isTesting) { rpcEnv.asyncSetupEndpointRefByURI(workerUrl) } // Used to avoid shutting down JVM during tests // In the normal case, exitNonZero will call `System.exit(-1)` to shutdown the JVM. In the unit // test, the user should call `setTesting(true)` so that `exitNonZero` will set `isShutDown` to // true rather than calling `System.exit`. The user can check `isShutDown` to know if // `exitNonZero` is called. private[deploy] var isShutDown = false // Lets filter events only from the worker's rpc system private val expectedAddress = RpcAddress.fromURIString(workerUrl) private def isWorker(address: RpcAddress) = expectedAddress == address private def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1) override def receive: PartialFunction[Any, Unit] = { case e => logWarning(s"Received unexpected message: $e") } override def onConnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { logInfo(s"Successfully connected to $workerUrl") } } override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // This log message will never be seen logError(s"Lost connection to worker rpc endpoint $workerUrl. Exiting.") exitNonZero() } } override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // These logs may not be seen if the worker (and associated pipe) has died logError(s"Could not initialize connection to worker $workerUrl. Exiting.") logError(s"Error was: $cause") exitNonZero() } } }
Example 112
Source File: HistoryServerArguments.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.Utils private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging { private var propertiesFile: String = null parse(args.toList) private def parse(args: List[String]): Unit = { if (args.length == 1) { setLogDirectory(args.head) } else { args match { case ("--dir" | "-d") :: value :: tail => setLogDirectory(value) parse(tail) case ("--help" | "-h") :: tail => printUsageAndExit(0) case ("--properties-file") :: value :: tail => propertiesFile = value parse(tail) case Nil => case _ => printUsageAndExit(1) } } } private def setLogDirectory(value: String): Unit = { logWarning("Setting log directory through the command line is deprecated as of " + "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") conf.set("spark.history.fs.logDirectory", value) } // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) private def printUsageAndExit(exitCode: Int) { // scalastyle:off println System.err.println( """ |Usage: HistoryServer [options] | |Options: | DIR Deprecated; set spark.history.fs.logDirectory directly | --dir DIR (-d DIR) Deprecated; set spark.history.fs.logDirectory directly | --properties-file FILE Path to a custom Spark properties file. | Default is conf/spark-defaults.conf. | |Configuration options can be set by setting the corresponding JVM system property. |History Server options are always available; additional options depend on the provider. | |History Server options: | | spark.history.ui.port Port where server will listen for connections | (default 18080) | spark.history.acls.enable Whether to enable view acls for all applications | (default false) | spark.history.provider Name of history provider class (defaults to | file system-based provider) | spark.history.retainedApplications Max number of application UIs to keep loaded in memory | (default 50) |FsHistoryProvider options: | | spark.history.fs.logDirectory Directory where app logs are stored | (default: file:/tmp/spark-events) | spark.history.fs.updateInterval How often to reload log data from storage | (in seconds, default: 10) |""".stripMargin) // scalastyle:on println System.exit(exitCode) } }
Example 113
Source File: LocalSparkCluster.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.rpc.RpcEnv import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.worker.Worker import org.apache.spark.deploy.master.Master import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 114
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 115
Source File: MesosClusterPersistenceEngine.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import scala.collection.JavaConverters._ import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.zookeeper.KeeperException.NoNodeException import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.util.Utils private[spark] class ZookeeperMesosClusterPersistenceEngine( baseDir: String, zk: CuratorFramework, conf: SparkConf) extends MesosClusterPersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir SparkCuratorUtil.mkdir(zk, WORKING_DIR) def path(name: String): String = { WORKING_DIR + "/" + name } override def expunge(name: String): Unit = { zk.delete().forPath(path(name)) } override def persist(name: String, obj: Object): Unit = { val serialized = Utils.serialize(obj) val zkPath = path(name) zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized) } override def fetch[T](name: String): Option[T] = { val zkPath = path(name) try { val fileData = zk.getData().forPath(zkPath) Some(Utils.deserialize[T](fileData)) } catch { case e: NoNodeException => None case e: Exception => { logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(zkPath) None } } } override def fetchAll[T](): Iterable[T] = { zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T]) } }
Example 116
Source File: MesosTaskLaunchData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import java.nio.ByteBuffer import org.apache.mesos.protobuf.ByteString import org.apache.spark.Logging private[spark] case class MesosTaskLaunchData( serializedTask: ByteBuffer, attemptNumber: Int) extends Logging { def toByteString: ByteString = { val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit) dataBuffer.putInt(attemptNumber) dataBuffer.put(serializedTask) dataBuffer.rewind logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]") ByteString.copyFrom(dataBuffer) } } private[spark] object MesosTaskLaunchData extends Logging { def fromByteString(byteString: ByteString): MesosTaskLaunchData = { val byteBuffer = byteString.asReadOnlyByteBuffer() logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]") val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes val serializedTask = byteBuffer.slice() // subsequence starting at the current position MesosTaskLaunchData(serializedTask, attemptNumber) } }
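A small round-trip sketch for the case class above. It assumes the code is compiled inside the same package (the class is private[spark]) and uses a placeholder byte array as the task payload:

package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

object MesosTaskLaunchDataRoundTrip {
  def main(args: Array[String]): Unit = {
    val payload = ByteBuffer.wrap("serialized-task".getBytes("UTF-8"))
    val data = MesosTaskLaunchData(payload, attemptNumber = 3)
    val wire = data.toByteString                      // 4-byte attempt number + task bytes
    val restored = MesosTaskLaunchData.fromByteString(wire)
    assert(restored.attemptNumber == 3)
    println(s"restored ${restored.serializedTask.remaining()} task bytes")
  }
}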
Example 117
Source File: ReplayListenerBus.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io.{InputStream, IOException} import scala.io.Source import com.fasterxml.jackson.core.JsonParseException import org.json4s.jackson.JsonMethods._ import org.apache.spark.Logging import org.apache.spark.util.JsonProtocol def replay( logData: InputStream, sourceName: String, maybeTruncated: Boolean = false): Unit = { var currentLine: String = null var lineNumber: Int = 1 try { val lines = Source.fromInputStream(logData).getLines() while (lines.hasNext) { currentLine = lines.next() try { postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated if (!maybeTruncated || lines.hasNext) { throw jpe } else { logWarning(s"Got JsonParseException from log file $sourceName" + s" at line $lineNumber, the file might not have finished writing cleanly.") } } lineNumber += 1 } } catch { case ioe: IOException => throw ioe case e: Exception => logError(s"Exception parsing Spark event log: $sourceName", e) logError(s"Malformed line #$lineNumber: $currentLine\n") } } }
Example 118
Source File: SparkUncaughtExceptionHandler.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.Logging private[spark] object SparkUncaughtExceptionHandler extends Thread.UncaughtExceptionHandler with Logging { override def uncaughtException(thread: Thread, exception: Throwable) { try { // Make it explicit that uncaught exceptions are thrown when container is shutting down. // It will help users when they analyze the executor logs val inShutdownMsg = if (ShutdownHookManager.inShutdown()) "[Container in shutdown] " else "" val errMsg = "Uncaught exception in thread " logError(inShutdownMsg + errMsg + thread, exception) // We may have been called from a shutdown hook. If so, we must not call System.exit(). // (If we do, we will deadlock.) if (!ShutdownHookManager.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(SparkExitCode.OOM) } else { System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) } } } catch { case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) } } def uncaughtException(exception: Throwable) { uncaughtException(Thread.currentThread(), exception) } }
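A sketch of installing the handler above as the JVM-wide default, which is roughly what Spark executors do at startup. Since the object is private[spark], the sketch assumes the code lives under the org.apache.spark package:

package org.apache.spark.example

import org.apache.spark.util.SparkUncaughtExceptionHandler

object InstallHandlerDemo {
  def main(args: Array[String]): Unit = {
    Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)
    // Any exception escaping a thread is now logged and turned into a JVM exit code.
    new Thread(new Runnable {
      override def run(): Unit = throw new RuntimeException("boom")
    }).start()
  }
}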
Example 119
Source File: BlockManagerSlaveEndpoint.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 120
Source File: OrderedRDDFunctions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Returns an RDD containing only the elements whose keys fall in the inclusive range
  // [lower, upper]. If the RDD is range-partitioned, only the partitions that can contain
  // matching keys are scanned; otherwise every partition is filtered.
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndices = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndices.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }
}
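filterByRange only prunes partitions when the RDD carries a RangePartitioner, for example after sortByKey. A small, hypothetical driver program showing that path:

import org.apache.spark.{SparkConf, SparkContext}

object FilterByRangeDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("filterByRange"))
    val pairs = sc.parallelize(1 to 100).map(i => (i, s"value$i")).sortByKey()
    val slice = pairs.filterByRange(20, 30)   // only partitions overlapping [20, 30] are scanned
    println(slice.count())                    // 11
    sc.stop()
  }
}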
Example 121
Source File: MemoryRDDCheckpointData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.Logging

class MemoryRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
  extends RDDCheckpointData[T](rdd) with Logging {

  protected override def doCheckpoint(): CheckpointRDD[T] = {
    val level = rdd.getStorageLevel
    // Callers must persist the RDD with a memory-based storage level before this method runs:
    // by the time we get here the RDD should already be cached, because this checkpoint only
    // truncates the lineage. We deliberately do not set the storage level ourselves, so a caller
    // that forgot to cache hits the assertion below. Local checkpointing is not sufficient here,
    // since it requires executing a new job; if checkpointing were integrated into the block
    // manager, this class would be unnecessary.
    assume(level.useMemory, s"Storage level $level is not appropriate for memory checkpointing")
    new MemoryCheckpointRDD[T](rdd)
  }
}
Example 122
Source File: SparkFunSuite.scala From yggdrasil with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.scalatest.{FunSuite, Outcome}

abstract class SparkFunSuite extends FunSuite with Logging {

  // Log the suite name and the test name before and after each test run.
  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }
}
Example 123
Source File: StoryJSONExtractor.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import java.io._ import java.util.Date import io.gzet.story.util.Tokenizer import org.apache.spark.{Logging, SparkConf, SparkContext} import org.elasticsearch.spark._ import org.json4s.DefaultFormats import org.json4s.native.JsonMethods._ import scala.util.Try object StoryJSONExtractor extends SimpleConfig with Logging { def main(args: Array[String]): Unit = { val sparkConf = new SparkConf().setAppName("Story Extractor") val sc = new SparkContext(sparkConf) val outputDir = args.head val minWeight = Try(args.last.toInt).getOrElse(0) val nodes = sc.esJsonRDD(esNodesResource).map({ case (_, strJson) => implicit val format = DefaultFormats val json = parse(strJson) val title = (json \ "title").extractOrElse[String]("") val gid = (json \ "gid").extractOrElse[Int](-1) val articles = (json \ "articles").extractOrElse[Int](-1) val cid = (json \ "cid").extractOrElse[Int](-1) val date = (json \ "date").extractOrElse[Long](0L) Array(cid, gid, new Date(date).toString, articles, Tokenizer.lucene(title.replaceAll("\\n", "").replaceAll("\\r", "")).mkString(" ")).mkString(",") }).collect() val nodesMap = sc.broadcast(sc.esJsonRDD(esNodesResource).map({ case (_, strJson) => implicit val format = DefaultFormats val json = parse(strJson) val gid = (json \ "gid").extractOrElse[Int](-1) val cid = (json \ "cid").extractOrElse[Int](-1) (cid, gid) }).collectAsMap()) val edges = sc.esJsonRDD(esEdgesResource).map({ case (_, strJson) => implicit val format = DefaultFormats val json = parse(strJson) val source = (json \ "source").extractOrElse[Int](-1) val target = (json \ "target").extractOrElse[Int](-1) val weight = (json \ "weight").extractOrElse[Int](-1) (source, target, weight) }).filter(_._3 > minWeight).map({ case (source, target, weight) => val mutation = nodesMap.value.getOrElse(source, -1) != nodesMap.value.getOrElse(target, -1) Array(source, target, weight, mutation).mkString(",") }).collect() printToFile(new File(s"$outputDir/nodes")) { p => p.println("id,story,date,articles,label") nodes.foreach(p.println) } printToFile(new File(s"$outputDir/edges")) { p => p.println("source,target,weight,mutation") edges.foreach(p.println) } } def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit) { val p = new java.io.PrintWriter(f) try { op(p) } finally { p.close() } } }
Example 124
Source File: SimhashIndexing.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import java.net.URL import com.datastax.spark.connector._ import io.gzet.story.model.Article import io.gzet.story.util.SimhashUtils._ import io.gzet.story.util.{HtmlFetcher, Tokenizer} import io.gzet.utils.spark.gdelt.GKGParser import org.apache.lucene.analysis.en.EnglishAnalyzer import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import scala.util.Try object SimhashIndexing extends SimpleConfig with Logging { def main(args: Array[String]) = { val sc = new SparkContext(new SparkConf().setAppName("GDELT Indexing")) if (args.isEmpty) throw new SparkException("usage: <gdeltInputDir>") val gdeltInputDir = args.head val gkgRDD = sc.textFile(gdeltInputDir) .map(GKGParser.toJsonGKGV2) .map(GKGParser.toCaseClass2) val urlRDD = gkgRDD.map(g => g.documentId.getOrElse("NA")) .filter(url => Try(new URL(url)).isSuccess) .distinct() .repartition(partitions) val contentRDD = urlRDD.mapPartitions({ it => val html = new HtmlFetcher(gooseConnectionTimeout, gooseSocketTimeout) it map html.fetch }) val corpusRDD = contentRDD.mapPartitions({ it => val analyzer = new EnglishAnalyzer() it.map(content => (content, Tokenizer.lucene(content.body, analyzer))) }).filter({ case (content, corpus) => corpus.length > minWords }) //CREATE TABLE gzet.articles ( hash int PRIMARY KEY, url text, title text, body text ); corpusRDD.mapValues(_.mkString(" ").simhash).map({ case (content, simhash) => Article(simhash, content.body, content.title, content.url) }).saveToCassandra(cassandraKeyspace, cassandraTable) } }
Example 125
Source File: StoryBatchDedup.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import io.gzet.story.model.{Content, Article} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.{Logging, SparkConf, SparkContext} import io.gzet.story.util.SimhashUtils._ import com.datastax.spark.connector._ object StoryBatchDedup extends SimpleConfig with Logging { def main(args: Array[String]): Unit = { val sparkConf = new SparkConf().setAppName("Story Extractor") val sc = new SparkContext(sparkConf) val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) => ((id, Content(a.url, a.title, a.body)), a.hash) }) Set(0) val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) => searchmasks.map({ mask => (simhash ^ mask, id) }) }).groupByKey() val edgeRDD = duplicateTupleRDD.values.flatMap({ it => val list = it.toList for (x <- list; y <- list) yield (x, y) }).filter({ case (x, y) => x != y }).distinct().map({case (x, y) => Edge(x, y, 0) }) val duplicateRDD = Graph.fromEdges(edgeRDD, 0L) .connectedComponents() .vertices .join(simhashRDD.keys) .values duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) => println(story + "\t" + content.title) }) } }
Example 126
Source File: TwitterStream.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries import com.google.gson.GsonBuilder import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Minutes, StreamingContext} import org.apache.spark.{Logging, SparkConf, SparkContext} import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder import scala.util.Try object TwitterStream extends SimpleConfig with Logging { def getTwitterStream(ssc: StreamingContext, filters: Seq[String] = Nil) = { val builder = new ConfigurationBuilder() builder.setOAuthConsumerKey(twitterApiKey) builder.setOAuthConsumerSecret(twitterApiSecret) builder.setOAuthAccessToken(twitterTokenKey) builder.setOAuthAccessTokenSecret(twitterTokenSecret) val configuration = builder.build() TwitterUtils.createStream( ssc, Some(new OAuthAuthorization(configuration)), filters, StorageLevel.MEMORY_ONLY ) } def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("Twitter Extractor") val sc = new SparkContext(sparkConf) val ssc = new StreamingContext(sc, Minutes(5)) val twitterStream = getTwitterStream(ssc, args).mapPartitions({ it => val gson = new GsonBuilder().create() it map { s => Try(gson.toJson(s)) } }) twitterStream .filter(_.isSuccess) .map(_.get) .saveAsTextFiles("twitter") // Start streaming context ssc.start() ssc.awaitTermination() } }
Example 127
Source File: MetricImplicits.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.timely import java.io.PrintStream import java.net.Socket import java.nio.charset.StandardCharsets import io.gzet.timeseries.SimpleConfig import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{Logging, Partitioner} object MetricImplicits extends Logging with SimpleConfig { def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } class MetricPartitioner(partitions: Int) extends Partitioner { require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") override def numPartitions: Int = partitions override def getPartition(key: Any): Int = { val k = key.asInstanceOf[MetricKey] nonNegativeMod(k.metricName.hashCode, partitions) } } implicit class Metrics(rdd: RDD[Metric]) { val partitions = rdd.partitions.length val partitioner = new MetricPartitioner(partitions) def publish() = { val sSortedMetricRDD = rdd filter { metric => metric.tags.nonEmpty } map { metric => (MetricKey(metric.name, metric.time), metric) } repartitionAndSortWithinPartitions partitioner sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] => val sock = new Socket(timelyHost, timelyPort) val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name) it foreach { metric => writer.println(metric.toPut) } writer.flush() } } } implicit class MetricStream(stream: DStream[Metric]) { def publish() = { stream foreachRDD { rdd => rdd.publish() } } } } case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) { def toPut = { val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]() val strTags = vizMap.union(tags.toList).map({ case (k, v) => s"$k=$v" }).mkString(" ") s"put $name $time $value $strTags" } } case class MetricKey(metricName: String, metricTime: Long) object MetricKey { implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = { Ordering.by(fk => (fk.metricName, fk.metricTime)) } }
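A quick sketch of what a single Metric from the example above serializes to; the metric name, tags, and timestamp are made up:

import io.gzet.timeseries.timely.Metric

object MetricToPutDemo {
  def main(args: Array[String]): Unit = {
    val m = Metric("requests.count", 1467000000000L, 42.0,
      Map("host" -> "web01"), viz = Some("histogram"))
    // Prints: put requests.count 1467000000000 42.0 viz=histogram host=web01
    println(m.toPut)
  }
}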
Example 128
Source File: GDBIndex.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.io.{DataInput, File} import java.nio.{ByteBuffer, ByteOrder} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.apache.spark.Logging object GDBIndex { def apply(path: String, name: String, conf: Configuration = new Configuration()) = { val filename = StringBuilder.newBuilder.append(path).append(File.separator).append(name).append(".gdbtablx").toString() val hdfsPath = new Path(filename) val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath) val bytes = new Array[Byte](16) dataInput.readFully(bytes) val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN) val signature = byteBuffer.getInt val n1024Blocks = byteBuffer.getInt val numRows = byteBuffer.getInt val indexSize = byteBuffer.getInt new GDBIndex(dataInput, numRows, indexSize) } } private[gdb] class GDBIndex(dataInput: FSDataInputStream, val numRows: Int, indexSize: Int ) extends Logging with AutoCloseable with Serializable { def readSeekForRowNum(rowNum: Int) = { val bytes = new Array[Byte](indexSize) dataInput.seek(16 + rowNum * indexSize) dataInput.readFully(bytes) ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt } def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = { dataInput.seek(16 + startAtRow * indexSize) val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows") new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable) } def close() { dataInput.close() } } private[gdb] class GDBIndexIterator(dataInput: DataInput, startID: Int, maxRows: Int, indexSize: Int ) extends Iterator[IndexInfo] with Logging with Serializable { private val indexInfo = IndexInfo(0, 0) private val bytes = new Array[Byte](indexSize) private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN) private var objectID = startID private var nextRow = 0 def hasNext() = nextRow < maxRows def next() = { // log.info(s"next::nextRow=$nextRow maxRows=$maxRows") nextRow += 1 objectID += 1 indexInfo.objectID = objectID byteBuffer.clear dataInput.readFully(bytes) indexInfo.seek = byteBuffer.getInt indexInfo } }
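A hedged sketch of reading such an index directly. The geodatabase path and the internal hex table name are placeholders, and the sketch sits in the com.esri.gdb package because the index class is package-private:

package com.esri.gdb

import org.apache.hadoop.conf.Configuration

object GDBIndexDemo {
  def main(args: Array[String]): Unit = {
    val index = GDBIndex("/data/sample.gdb", "a00000009", new Configuration())
    try {
      println(s"rows in table: ${index.numRows}")
      index.iterator().take(5).foreach(info =>
        println(s"objectID=${info.objectID} seek=${info.seek}"))
    } finally {
      index.close()
    }
  }
}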
Example 129
Source File: GDBRDD.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import org.apache.hadoop.conf.Configuration import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.{Logging, Partition, SparkContext, TaskContext} case class GDBRDD(@transient sc: SparkContext, gdbPath: String, gdbName: String, numPartitions: Int) extends RDD[Row](sc, Nil) with Logging { @DeveloperApi override def compute(partition: Partition, context: TaskContext): Iterator[Row] = { val part = partition.asInstanceOf[GDBPartition] val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration val index = GDBIndex(gdbPath, part.hexName, hadoopConf) val table = GDBTable(gdbPath, part.hexName, hadoopConf) context.addTaskCompletionListener(context => { table.close() index.close() }) table.rowIterator(index, part.startAtRow, part.numRowsToRead) } override protected def getPartitions: Array[Partition] = { val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration GDBTable.findTable(gdbPath, gdbName, hadoopConf) match { case Some(catTab) => { val index = GDBIndex(gdbPath, catTab.hexName, hadoopConf) try { val numRows = index.numRows val numRowsPerPartition = (numRows.toDouble / numPartitions).ceil.toInt var startAtRow = 0 (0 until numPartitions).map(i => { val endAtRow = startAtRow + numRowsPerPartition val numRowsToRead = if (endAtRow <= numRows) numRowsPerPartition else numRows - startAtRow val gdbPartition = GDBPartition(i, catTab.hexName, startAtRow, numRowsToRead) startAtRow += numRowsToRead gdbPartition }).toArray } finally { index.close() } } case _ => { log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty array of Partitions !") Array.empty[Partition] } } } } private[this] case class GDBPartition(m_index: Int, val hexName: String, val startAtRow: Int, val numRowsToRead: Int ) extends Partition { override def index = m_index }
Example 130
Source File: GDBRelation.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} case class GDBRelation(gdbPath: String, gdbName: String, numPartition: Int) (@transient val sqlContext: SQLContext) extends BaseRelation with Logging with TableScan { override val schema = inferSchema() private def inferSchema() = { val sc = sqlContext.sparkContext GDBTable.findTable(gdbPath, gdbName, sc.hadoopConfiguration) match { case Some(catTab) => { val table = GDBTable(gdbPath, catTab.hexName, sc.hadoopConfiguration) try { table.schema() } finally { table.close() } } case _ => { log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty schema !") StructType(Seq.empty[StructField]) } } } override def buildScan(): RDD[Row] = { GDBRDD(sqlContext.sparkContext, gdbPath, gdbName, numPartition) } }
Example 131
Source File: Main.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.app import com.esri.core.geometry.Polyline import com.esri.udt.{PointType, PolylineType} import org.apache.spark.sql.{SQLContext, SaveMode} import org.apache.spark.{Logging, SparkConf, SparkContext} val sqlContext = new SQLContext(sc) val df = sqlContext.read.format("com.esri.gdb") .option("path", path) .option("name", name) .option("numPartitions", "1") .load() df.printSchema() df.registerTempTable(name) sqlContext.udf.register("getX", (point: PointType) => point.x) sqlContext.udf.register("getY", (point: PointType) => point.y) sqlContext.udf.register("line", (point: PointType) => PolylineType({ val polyline = new Polyline() polyline.startPath(point.x - 2, point.y - 2) polyline.lineTo(point.x + 2, point.y + 2) polyline } )) sqlContext.sql(s"select line(Shape),getX(Shape)-2 as x from $name") .write .mode(SaveMode.Overwrite) .format("json") .save(s"/tmp/$name.json") } finally { sc.stop() } }
Example 132
Source File: HBaseSource.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase.examples import org.apache.spark.{SparkConf, Logging, SparkContext} import org.apache.spark.sql.execution.datasources.hbase._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql._ case class HBaseRecord( col0: String, col1: Boolean, col2: Double, col3: Float, col4: Int, col5: Long, col6: Short, col7: String, col8: Byte) object HBaseRecord { def apply(i: Int): HBaseRecord = { val s = s"""row${"%03d".format(i)}""" HBaseRecord(s, i % 2 == 0, i.toDouble, i.toFloat, i, i.toLong, i.toShort, s"String$i extra", i.toByte) } } object HBaseSource { val cat = s"""{ |"table":{"namespace":"default", "name":"table1"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("HBaseTest") val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext._ import sqlContext.implicits._ def withCatalog(cat: String): DataFrame = { sqlContext .read .options(Map(HBaseTableCatalog.tableCatalog->cat)) .format("org.apache.spark.sql.execution.datasources.hbase") .load() } val data = (0 to 255).map { i => HBaseRecord(i) } sc.parallelize(data).toDF.write.options( Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5")) .format("org.apache.spark.sql.execution.datasources.hbase") .save() val df = withCatalog(cat) df.show df.filter($"col0" <= "row005") .select($"col0", $"col1").show df.filter($"col0" === "row005" || $"col0" <= "row005") .select($"col0", $"col1").show df.filter($"col0" > "row250") .select($"col0", $"col1").show df.registerTempTable("table1") val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'") c.show() } }
Example 133
Source File: SHC.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import com.google.common.io.Files import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility} import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf import org.apache.spark.sql.types.UTF8String import org.apache.spark.{SparkContext, SparkConf, Logging} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.JavaConverters._ class SHC extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { implicit class StringToColumn(val sc: StringContext) { def $(args: Any*): ColumnName = { new ColumnName(sc.s(args: _*)) } } private[spark] var htu = HBaseTestingUtility.createLocalHTU() private[spark] def tableName = "table1" private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"} var table: Table = null val conf = new SparkConf conf.set(SparkHBaseConf.testConf, "true") SparkHBaseConf.conf = htu.getConfiguration // private[spark] var columnFamilyStr = Bytes.toString(columnFamily) def catalog = s"""{ |"table":{"namespace":"default", "name":"table1"}, |"rowkey":"key", |"columns":{ |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} |} |}""".stripMargin override def beforeAll() { val tempDir: File = Files.createTempDir tempDir.deleteOnExit htu.cleanupTestDir htu.startMiniZKCluster htu.startMiniHBaseCluster(1, 4) logInfo(" - minicluster started") println(" - minicluster started") } override def afterAll() { try { table.close() println("shutdown") htu.deleteTable(TableName.valueOf(tableName)) logInfo("shuting down minicluster") htu.shutdownMiniHBaseCluster htu.shutdownMiniZKCluster logInfo(" - minicluster shut down") htu.cleanupTestDir } catch { case _ => logError("teardown error") } } def createTable(name: String, cfs: Array[String]) { val tName = Bytes.toBytes(name) val bcfs = cfs.map(Bytes.toBytes(_)) try { htu.deleteTable(TableName.valueOf(tName)) } catch { case _ => logInfo(" - no table " + name + " found") } htu.createMultiRegionTable(TableName.valueOf(tName), bcfs) } def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) { try { htu.deleteTable(TableName.valueOf(name)) } catch { case _ => logInfo(" - no table " + Bytes.toString(name) + " found") } htu.createMultiRegionTable(TableName.valueOf(name), cfs) } }
Example 134
Source File: CounterEtlFunctions.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.loader.core import org.apache.s2graph.core.{S2Edge, S2Graph, GraphUtil} import org.apache.s2graph.counter.loader.config.StreamingConfig import org.apache.s2graph.counter.models.CounterModel import org.apache.s2graph.spark.config.S2ConfigFactory import org.apache.spark.Logging import play.api.libs.json._ import scala.collection.mutable.{HashMap => MutableHashMap} object CounterEtlFunctions extends Logging { lazy val filterOps = Seq("insert", "insertBulk", "update", "increment").map(op => GraphUtil.operations(op)) lazy val preFetchSize = StreamingConfig.PROFILE_PREFETCH_SIZE lazy val config = S2ConfigFactory.config lazy val counterModel = new CounterModel(config) lazy val graph = new S2Graph(config)(scala.concurrent.ExecutionContext.Implicits.global) def logToEdge(line: String): Option[S2Edge] = { for { elem <- graph.toGraphElement(line) if elem.isInstanceOf[S2Edge] edge <- Some(elem.asInstanceOf[S2Edge]).filter { x => filterOps.contains(x.op) } } yield { edge } } def parseEdgeFormat(line: String): Option[CounterEtlItem] = { for { elem <- graph.toGraphElement(line) if elem.isInstanceOf[S2Edge] edge <- Some(elem.asInstanceOf[S2Edge]).filter { x => filterOps.contains(x.op) } } yield { val label = edge.innerLabel val labelName = label.label val tgtService = label.tgtColumn.service.serviceName val tgtId = edge.tgtVertex.innerId.toString() val srcId = edge.srcVertex.innerId.toString() // make empty property if no exist edge property val dimension = Json.parse(Some(GraphUtil.split(line)).filter(_.length >= 7).map(_(6)).getOrElse("{}")) val bucketKeys = Seq("_from") val bucketKeyValues = { for { variable <- bucketKeys } yield { val jsValue = variable match { case "_from" => JsString(srcId) case s => (dimension \ s).get } s"[[$variable]]" -> jsValue } } val property = Json.toJson(bucketKeyValues :+ ("value" -> JsString("1")) toMap) // val property = Json.toJson(Map("_from" -> srcId, "_to" -> tgtId, "value" -> "1")) CounterEtlItem(edge.ts, tgtService, labelName, tgtId, dimension, property) } } def parseEdgeFormat(lines: List[String]): List[CounterEtlItem] = { for { line <- lines item <- parseEdgeFormat(line) } yield { item } } def checkPolicyAndMergeDimension(service: String, action: String, items: List[CounterEtlItem]): List[CounterEtlItem] = { counterModel.findByServiceAction(service, action).map { policy => if (policy.useProfile) { policy.bucketImpId match { case Some(_) => DimensionProps.mergeDimension(policy, items) case None => Nil } } else { items } }.getOrElse(Nil) } }
Example 135
Source File: SubscriberListener.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark.spark import org.apache.spark.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverError, StreamingListenerReceiverStarted, StreamingListenerReceiverStopped} class SubscriberListener(ssc: StreamingContext) extends StreamingListener with Logging { override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = { logInfo("onReceiverError") } override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { logInfo("onReceiverStarted") } override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = { logInfo("onReceiverStopped") ssc.stop() } }
Example 136
Source File: StreamHelper.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import kafka.KafkaHelper import kafka.common.TopicAndPartition import kafka.consumer.PartitionTopicInfo import kafka.message.MessageAndMetadata import kafka.serializer.Decoder import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.{Logging, SparkException} import scala.reflect.ClassTag case class StreamHelper(kafkaParams: Map[String, String]) extends Logging { // helper for kafka zookeeper lazy val kafkaHelper = KafkaHelper(kafkaParams) lazy val kc = new KafkaCluster(kafkaParams) // 1. get leader's earliest and latest offset // 2. get consumer offset // 3-1. if (2) is bounded in (1) use (2) for stream // 3-2. else use (1) by "auto.offset.reset" private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = { lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq) { for { topicPartitions <- kc.getPartitions(topics).right smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right } yield { { for { tp <- topicPartitions } yield { val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset) val so = smallOffsets.get(tp).map(_.offset).get val lo = largeOffsets.get(tp).map(_.offset).get logWarning(s"$tp: $co $so $lo") if (co >= so && co <= lo) { (tp, co) } else { (tp, reset match { case Some("smallest") => so case _ => lo }) } } }.toMap } }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok) } def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = { type R = (K, V) val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message()) kafkaHelper.registerConsumerInZK(topics) new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler) } def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = { val offsetsMap = { for { range <- offsets.offsetRanges if range.fromOffset < range.untilOffset } yield { logDebug(range.toString()) TopicAndPartition(range.topic, range.partition) -> range.untilOffset } }.toMap kafkaHelper.commitConsumerOffsets(offsetsMap) } def commitConsumerOffset(range: OffsetRange): Unit = { if (range.fromOffset < range.untilOffset) { try { val tp = TopicAndPartition(range.topic, range.partition) logDebug("Committed offset " + range.untilOffset + " for topic " + tp) kafkaHelper.commitConsumerOffset(tp, range.untilOffset) } catch { case t: Throwable => // log it and let it go logWarning("exception during commitOffsets", t) throw t } } } def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = { stream.foreachRDD { rdd => commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges]) } } }
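A hedged driver sketch for the helper above. The broker list, group id, and topic name are placeholders; offsets are committed back through the helper once each batch has been processed:

import kafka.serializer.StringDecoder

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, StreamHelper}

object StreamHelperDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("stream-helper")
    val ssc = new StreamingContext(conf, Seconds(10))
    val helper = StreamHelper(Map(
      "metadata.broker.list" -> "localhost:9092",
      "group.id"             -> "demo-group",
      "auto.offset.reset"    -> "largest"))

    val stream = helper.createStream[String, String, StringDecoder, StringDecoder](ssc, Set("events"))
    stream.foreachRDD { rdd =>
      println(s"batch size: ${rdd.count()}")
      helper.commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges])
    }

    ssc.start()
    ssc.awaitTermination()
  }
}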
Example 137
Source File: KafkaRDDFunctions.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import org.apache.spark.Logging import org.apache.spark.rdd.RDD import scala.language.implicitConversions import scala.reflect.ClassTag class KafkaRDDFunctions[T: ClassTag](self: RDD[T]) extends Logging with Serializable { def foreachPartitionWithOffsetRange(f: (OffsetRange, Iterator[T]) => Unit): Unit = { val offsets = self.asInstanceOf[HasOffsetRanges].offsetRanges foreachPartitionWithIndex { (i, part) => val osr: OffsetRange = offsets(i) f(osr, part) } } def foreachPartitionWithIndex(f: (Int, Iterator[T]) => Unit): Unit = { self.mapPartitionsWithIndex[Nothing] { (i, part) => f(i, part) Iterator.empty }.foreach { (_: Nothing) => () } } } object KafkaRDDFunctions { implicit def rddToKafkaRDDFunctions[T: ClassTag](rdd: RDD[T]): KafkaRDDFunctions[T] = { new KafkaRDDFunctions(rdd) } }
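A short sketch of the implicit conversion in use (it assumes stream is a direct Kafka DStream, so each of its RDDs implements HasOffsetRanges):

import org.apache.spark.streaming.kafka.KafkaRDDFunctions._

stream.foreachRDD { rdd =>
  rdd.foreachPartitionWithOffsetRange { (range, records) =>
    // one OffsetRange per partition, handed in together with that partition's records
    records.foreach(r => println(s"${range.topic}/${range.partition} from ${range.fromOffset}: $r"))
  }
}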
Example 138
Source File: RiakRelation.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.riak import com.basho.riak.spark._ import scala.reflect._ import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector} import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD} import com.basho.riak.spark.util.TSConversionUtil import com.basho.riak.spark.writer.WriteConf import com.basho.riak.spark.writer.mapper.SqlDataMapper import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan} import org.apache.spark.sql.types._ import org.apache.spark.sql._ import scala.collection.convert.decorateAsScala._ import com.basho.riak.spark.query.QueryBucketDef object RiakRelation { def apply(bucket: String, sqlContext: SQLContext, schema: Option[StructType] = None, connector: Option[RiakConnector] = None, readConf: ReadConf, writeConf: WriteConf): RiakRelation = { new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)), readConf, writeConf, sqlContext, schema) } def apply(sqlContext: SQLContext, parameters: Map[String, String], schema: Option[StructType]): RiakRelation = { val existingConf = sqlContext.sparkContext.getConf val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None) val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters)) val readConf = ReadConf(existingConf, parameters) val writeConf = WriteConf(existingConf, parameters) RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf) } }
Example 139
Source File: TSDataQueryingIterator.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query

import com.basho.riak.client.core.query.timeseries.Row
import org.apache.spark.Logging
import com.basho.riak.client.core.query.timeseries.ColumnDescription

class TSDataQueryingIterator(query: QueryTS) extends Iterator[Row] with Logging {
  private var _iterator: Option[Iterator[Row]] = None
  private val subqueries = query.queryData.iterator
  private var columns: Option[Seq[ColumnDescription]] = None

  prefetch()

  // Signature reconstructed from the listing's orphaned match block; the original accessor name may differ.
  def columnDefs: Seq[ColumnDescription] = columns match {
    case None => Seq()
    case Some(cds) => cds
  }

  protected[this] def prefetch() = {
    while (subqueries.hasNext && !isPrefetchedDataAvailable) {
      val nextSubQuery = subqueries.next
      logTrace(s"Prefetching chunk of data: ts-query(token=$nextSubQuery)")
      val r = query.nextChunk(nextSubQuery)
      r match {
        case (cds, rows) =>
          if (isTraceEnabled()) {
            logTrace(s"ts-query($nextSubQuery) returns:\n columns: ${r._1}\n data:\n\t ${r._2}")
          } else {
            logDebug(s"ts-query($nextSubQuery) returns:\n data.size: ${r._2.size}")
          }
          if (cds != null && cds.nonEmpty) {
            columns = Some(cds)
          } else if (columns.isEmpty) {
            // We have to initialize columns here, to make a difference and use it as an indicator
            columns = Some(Seq())
          }
          _iterator = Some(rows.iterator)
        case _ =>
          _iterator = None
          logWarning(s"ts-query(token=$nextSubQuery) returns: NOTHING")
      }
    }
  }

  private def isPrefetchedDataAvailable: Boolean =
    !(_iterator.isEmpty || (_iterator.isDefined && !_iterator.get.hasNext))

  override def hasNext: Boolean = {
    if (!isPrefetchedDataAvailable) {
      prefetch()
    }
    _iterator match {
      case Some(it) => it.hasNext
      case None => false
    }
  }

  override def next(): Row = {
    if (!hasNext) {
      throw new NoSuchElementException("next on empty iterator")
    }
    _iterator.get.next
  }
}

object TSDataQueryingIterator {
  def apply[R](query: QueryTS): TSDataQueryingIterator = new TSDataQueryingIterator(query)
}
Example 140
Source File: KVDataQueryingIterator.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.query import com.basho.riak.client.core.query.{Location, RiakObject} import org.apache.spark.Logging class KVDataQueryingIterator[T](query: Query[T]) extends Iterator[(Location, RiakObject)] with Logging { type ResultT = (Location, RiakObject) private var isThereNextValue: Option[Boolean] = None private var nextToken: Option[T] = None private var _iterator: Option[Iterator[ResultT]] = None protected[this] def prefetch(): Boolean = { logTrace(s"Prefetching chunk of data: query(token=$nextToken)") val r = query.nextChunk(nextToken) if( isTraceEnabled() ) { logTrace(s"query(token=$nextToken) returns:\n token: ${r._1}\n data:\n\t ${r._2}") } else { logDebug(s"query(token=$nextToken) returns:\n token: ${r._1}\n data.size: ${r._2.size}") } nextToken = r._1 r match { case (_, Nil) => logDebug("prefetch returned Nothing, all data was already processed (empty chunk was returned)") _iterator = KVDataQueryingIterator.OPTION_EMPTY_ITERATOR case (_, data: Iterable[(Location,RiakObject)]) => if(nextToken.isEmpty){ logDebug("prefetch returned the last chunk, all data was processed") } _iterator = Some(data.iterator) } _iterator.get.hasNext } override def hasNext: Boolean = { isThereNextValue match { case Some(b: Boolean) => // cached value will be returned case None if _iterator.isDefined && _iterator.get.hasNext => logTrace(s"prefetch is not required, at least one pre-fetched value available") isThereNextValue = KVDataQueryingIterator.OPTION_TRUE case None if _iterator.isDefined && _iterator.get.isEmpty && nextToken.isEmpty => logTrace("prefetch is not required, all data was already processed") isThereNextValue = KVDataQueryingIterator.OPTION_FALSE case None => isThereNextValue = Some(prefetch()) } isThereNextValue.get } override def next(): (Location, RiakObject) = { if( !hasNext ){ throw new NoSuchElementException("next on iterator") } isThereNextValue = None _iterator.get.next() } } object KVDataQueryingIterator { private val OPTION_EMPTY_ITERATOR = Some(Iterator.empty) private val OPTION_TRUE = Some(true) private val OPTION_FALSE = Some(false) def apply[T](query: Query[T]): KVDataQueryingIterator[T] = new KVDataQueryingIterator[T](query) }
Example 141
Source File: DataMapper.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.util import com.basho.riak.client.api.convert.JSONConverter import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.spark.Logging trait DataMapper extends Serializable { DataMapper.ensureInitialized() } object DataMapper extends Logging { private var isInitialized = false def ensureInitialized(): Boolean = { if (!isInitialized) { // Register Scala module to serialize/deserialize Scala stuff smoothly JSONConverter.registerJacksonModule(DefaultScalaModule) logDebug("Jackson DefaultScalaModule has been registered") isInitialized = true } else { logTrace("Jackson DefaultScalaModule initialization was skipped since module has been registered.") } isInitialized } }
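A tiny sketch of the intended pattern (the class name here is made up for illustration): mixing in DataMapper guarantees the Jackson Scala module is registered before any value is (de)serialized, and repeated calls are no-ops.

class RsvpWriter extends DataMapper {
  // DataMapper.ensureInitialized() has already run via the trait's constructor,
  // so Jackson can handle Scala case classes from here on.
}

DataMapper.ensureInitialized()  // an explicit call elsewhere is harmless; the module only registers once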
Example 142
Source File: SparkStreamingFixture.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import org.apache.spark.{Logging, SparkContext} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.{After, Before} trait SparkStreamingFixture extends Logging { protected var sc: SparkContext protected var ssc: StreamingContext = _ protected val batchDuration = Seconds(1) @Before def startStreamingContext(): Unit = { ssc = new StreamingContext(sc, batchDuration) logInfo("Streaming context created") } @After def stopStreamingContext(): Unit = { Option(ssc).foreach(_.stop()) logInfo("Streaming context stopped") } }
Example 143
Source File: SocketStreamingDataSource.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import java.net.InetSocketAddress import java.nio.channels.{AsynchronousCloseException, AsynchronousServerSocketChannel, AsynchronousSocketChannel, CompletionHandler} import com.basho.riak.stub.SocketUtils import org.apache.spark.Logging class SocketStreamingDataSource extends Logging { private var serverChannel: AsynchronousServerSocketChannel = _ private var clientChannel: AsynchronousSocketChannel = _ def start(writeToSocket: AsynchronousSocketChannel => Unit): Int = { serverChannel = AsynchronousServerSocketChannel.open() require(serverChannel.isOpen) serverChannel.bind(new InetSocketAddress(0)) serverChannel.accept(serverChannel, new CompletionHandler[AsynchronousSocketChannel, AsynchronousServerSocketChannel]() { override def completed(client: AsynchronousSocketChannel, server: AsynchronousServerSocketChannel): Unit = { logInfo(s"Incoming connection: ${SocketUtils.serverConnectionAsStr(client)}") clientChannel = client writeToSocket(client) client.isOpen match { case true => val connectionString = SocketUtils.serverConnectionAsStr(client) client.shutdownInput() client.shutdownOutput() client.close() logInfo(s"Client $connectionString was gracefully disconnected") case false => // client is already closed - do nothing } } override def failed(exc: Throwable, serverChannel: AsynchronousServerSocketChannel): Unit = exc match { case _: AsynchronousCloseException => case _ => logError(s"Something went wrong: ${serverChannel.toString}", exc); } }) serverChannel.getLocalAddress.asInstanceOf[InetSocketAddress].getPort } def stop(): Unit = { Option(clientChannel).foreach(_.close()) Option(serverChannel).foreach(_.close()) } }
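A usage sketch for the stub above (the message content is arbitrary test data): start it with a callback that writes to the accepted client, hand the returned port to whatever needs to connect, then stop it.

import java.nio.ByteBuffer
import java.nio.channels.AsynchronousSocketChannel

val source = new SocketStreamingDataSource
val port = source.start { client: AsynchronousSocketChannel =>
  client.write(ByteBuffer.wrap("test message\n".getBytes("UTF-8")))
}
// e.g. a streaming test could now read from localhost via ssc.socketTextStream("localhost", port)
source.stop()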
Example 144
Source File: AbstractFailoverOfflineTest.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.rdd.failover import com.basho.riak.client.core.query.Namespace import com.basho.riak.client.core.util.HostAndPort import com.basho.riak.stub.{RiakMessageHandler, RiakNodeStub} import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.spark.{Logging, SparkConf, SparkContext} import org.hamcrest.{Description, Matchers} import org.junit.internal.matchers.ThrowableCauseMatcher import org.junit.{After, Before} import scala.collection.JavaConversions._ abstract class AbstractFailoverOfflineTest extends Logging { protected final val NAMESPACE = new Namespace("default", "test-bucket") protected final val COVERAGE_ENTRIES_COUNT = 64 protected var sc: SparkContext = _ protected var riakNodes: Seq[(HostAndPort, RiakNodeStub)] = _ // tuple HostAndPort -> stub val riakHosts: Int = 1 val riakMessageHandler: Option[RiakMessageHandler] = None def sparkConf: SparkConf = new SparkConf(false) .setMaster("local") .setAppName(getClass.getSimpleName) .set("spark.riak.connection.host", riakNodes.map{case (hp, _) => s"${hp.getHost}:${hp.getPort}"}.mkString(",")) .set("spark.riak.output.wquorum", "1") .set("spark.riak.input.fetch-size", "2") def initRiakNodes(): Seq[(HostAndPort, RiakNodeStub)] = { require(riakMessageHandler.isDefined) // start riak stubs on localhost and free random port (1 to riakHosts).map { _ => val riakNode = RiakNodeStub(riakMessageHandler.get) riakNode.start() -> riakNode } } @Before def setUp(): Unit = { riakNodes = initRiakNodes() sc = new SparkContext(sparkConf) } @After def tearDown(): Unit = { Option(riakNodes).foreach(_.foreach(n => n._2.stop())) Option(sc).foreach(_.stop()) } def distributeEvenly(size: Int, splitCount: Int): Seq[Int] = { val (base, rem) = (size / splitCount, size % splitCount) (0 until splitCount).map(i => if (i < rem) base + 1 else base) } } class RootCauseMatcher[T <: Throwable](val excClass: Class[T]) extends ThrowableCauseMatcher[T](Matchers.isA(excClass)) { private def getOneBeforeRootCause(item: T): Throwable = { val throwables = ExceptionUtils.getThrowableList(item) if (throwables.length > 1) { throwables.reverse.tail.head } else { throwables.head } } override def matchesSafely(item: T): Boolean = super.matchesSafely(getOneBeforeRootCause(item).asInstanceOf[T]) override def describeMismatchSafely(item: T, description: Description): Unit = super.describeMismatchSafely(getOneBeforeRootCause(item).asInstanceOf[T], description) }
Example 145
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 146
Source File: SparkCassSSTableLoaderClientManager.scala From Spark2Cassandra with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.cassandra.client import java.net.InetAddress import com.datastax.spark.connector.cql.{ AuthConf, CassandraConnector } import com.github.jparkie.spark.cassandra.conf.SparkCassServerConf import org.apache.spark.Logging import scala.collection.mutable private[cassandra] trait SparkCassSSTableLoaderClientManager extends Serializable with Logging { case class SessionKey( hosts: Set[InetAddress], port: Int, authConf: AuthConf, sparkCassServerConf: SparkCassServerConf ) extends Serializable @transient private[client] val internalClients = mutable.HashMap.empty[SessionKey, SparkCassSSTableLoaderClient] private[client] def buildSessionKey( cassandraConnector: CassandraConnector, sparkCassServerConf: SparkCassServerConf ): SessionKey = { SessionKey(cassandraConnector.hosts, cassandraConnector.port, cassandraConnector.authConf, sparkCassServerConf) } private[client] def buildClient( cassandraConnector: CassandraConnector, sparkCassServerConf: SparkCassServerConf ): SparkCassSSTableLoaderClient = { val newSession = cassandraConnector.openSession() logInfo(s"Created SSTableLoaderClient to the following Cassandra nodes: ${cassandraConnector.hosts}") val sparkCassSSTableLoaderClient = new SparkCassSSTableLoaderClient(newSession, sparkCassServerConf) sys.addShutdownHook { logInfo("Closed Cassandra Session for SSTableLoaderClient.") sparkCassSSTableLoaderClient.stop() } sparkCassSSTableLoaderClient } private[cassandra] def evictAll(): Unit = synchronized { internalClients.values.foreach(_.stop()) internalClients.clear() } } object SparkCassSSTableLoaderClientManager extends SparkCassSSTableLoaderClientManager
Example 147
Source File: HBasePartition.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.regionserver.RegionScanner import org.apache.spark.{Logging, Partition} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range} private[hbase] class HBasePartition( val idx: Int, val mappedIndex: Int, start: Option[HBaseRawType] = None, end: Option[HBaseRawType] = None, val server: Option[String] = None, val filterPredicates: Option[Expression] = None, @transient relation: HBaseRelation = null, @transient val newScanner:RegionScanner = null) extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) with Partition with IndexMappable with Logging { override def index: Int = idx override def hashCode(): Int = idx @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) def computePredicate(relation: HBaseRelation): Option[Expression] = { val predicate = if (filterPredicates.isDefined && filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) { val oriPredicate = filterPredicates.get val predicateReferences = oriPredicate.references.toSeq val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) val row = new GenericMutableRow(predicateReferences.size) var rowIndex = 0 var i = 0 var range: Range[_] = null while (i < relation.keyColumns.size) { range = relation.generateRange(this, oriPredicate, i) if (range != null) { rowIndex = relation.rowIndex(predicateReferences, i) if (rowIndex >= 0) row.update(rowIndex, range) // if the non-last dimension range is not point, do not proceed to the next dims if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size else i = i + 1 } else i = relation.keyColumns.size } val pr = boundReference.partialReduce(row, predicateReferences) pr match { case (null, e: Expression) => Some(e) case (true, _) => None case (false, _) => Some(Literal(false)) } } else filterPredicates logInfo(predicate.toString) predicate } override def toString = { s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" } }
Example 148
Source File: BytesUtilsSuite.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.spark.Logging import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.sql.types._ import org.apache.spark.sql.hbase.types.HBaseBytesType import org.apache.spark.sql.hbase.util.BytesUtils import org.scalatest.{BeforeAndAfterAll, FunSuite} class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging { test("Bytes Ordering Test") { val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1, 0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257) val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i))) .sortWith((f, s) => HBaseBytesType.ordering.gt( f._2.asInstanceOf[HBaseBytesType.InternalType], s._2.asInstanceOf[HBaseBytesType.InternalType])) assert(result.map(a => a._1) == s.sorted.reverse) } def compare(a: Array[Byte], b: Array[Byte]): Int = { val length = Math.min(a.length, b.length) var result: Int = 0 for (i <- 0 to length - 1) { val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte] if (diff != 0) { result = diff } } result } test("Bytes Utility Test") { assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) .toBytes(input = true), 0) === true) assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) .toBytes(input = false), 0) === false) assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0) === 12.34d) assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0) === -12.34d) assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0) === 12.34f) assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0) === -12.34f) assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0) === 12) assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0) === -12) assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0) === 1234l) assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0) === -1234l) assert(BytesUtils.toShort(BytesUtils.create(ShortType) .toBytes(12.asInstanceOf[Short]), 0) === 12) assert(BytesUtils.toShort(BytesUtils.create(ShortType) .toBytes(-12.asInstanceOf[Short]), 0) === -12) assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes("abc"), 0, 3) === UTF8String("abc")) assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String("")) assert(BytesUtils.toByte(BytesUtils.create(ByteType) .toBytes(5.asInstanceOf[Byte]), 0) === 5) assert(BytesUtils.toByte(BytesUtils.create(ByteType) .toBytes(-5.asInstanceOf[Byte]), 0) === -5) assert(compare(BytesUtils.create(IntegerType).toBytes(128), BytesUtils.create(IntegerType).toBytes(-128)) > 0) } test("byte array plus one") { var byteArray = Array[Byte](0x01.toByte, 127.toByte) assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0) byteArray = Array[Byte](0xff.toByte, 0xff.toByte) assert(BytesUtils.addOne(byteArray) == null) byteArray = Array[Byte](0x02.toByte, 0xff.toByte) assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0) } test("float comparison") { val f1 = BytesUtils.create(FloatType).toBytes(-1.23f) val f2 = BytesUtils.create(FloatType).toBytes(100f) assert(Bytes.compareTo(f1, f2) < 0) } }
Example 149
Source File: WebSocketReceiver.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup.websocket import com.datastax.examples.meetup.model._ import org.apache.spark.storage.StorageLevel import scalawebsocket.WebSocket import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.Logging import org.json4s._ import org.json4s.jackson.JsonMethods._ class WebSocketReceiver(url: String, storageLevel: StorageLevel) extends Receiver[MeetupRsvp](storageLevel) with Logging { @volatile private var webSocket: WebSocket = _ def onStart() { try{ logInfo("Connecting to WebSocket: " + url) val newWebSocket = WebSocket().open(url).onTextMessage({ msg: String => parseJson(msg) }) setWebSocket(newWebSocket) logInfo("Connected to: WebSocket" + url) } catch { case e: Exception => restart("Error starting WebSocket stream", e) } } def onStop() { setWebSocket(null) logInfo("WebSocket receiver stopped") } private def setWebSocket(newWebSocket: WebSocket) = synchronized { if (webSocket != null) { webSocket.shutdown() } webSocket = newWebSocket } private def parseJson(jsonStr: String): Unit = { implicit lazy val formats = DefaultFormats try { val json = parse(jsonStr) val rsvp = json.extract[MeetupRsvp] store(rsvp) } catch { case e: MappingException => logError("Unable to map JSON message to MeetupRsvp object:" + e.msg) case e: Exception => logError("Unable to map JSON message to MeetupRsvp object") } } }
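A minimal sketch that plugs the receiver into a streaming job (ssc is an existing StreamingContext; the endpoint URL is a placeholder):

import org.apache.spark.storage.StorageLevel

val rsvps = ssc.receiverStream(
  new WebSocketReceiver("ws://stream.meetup.com/2/rsvps", StorageLevel.MEMORY_AND_DISK_SER_2))
rsvps.print()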
Example 150
Source File: GraphLoader.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { logWarning("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
Example 151
Source File: GraphLoaderPlus.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } if (lineArray.length == 2) { val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } else { val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong val weight = lineArray(2).toInt if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, weight) } else { builder.add(srcId, dstId, weight) } } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoaderPlus.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
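A hedged usage sketch (it assumes the elided wrapper is object GraphLoaderPlus, as the file name suggests, that sc is an existing SparkContext, and that the input file holds "src dst [weight]" lines; the path is a placeholder):

val graph = GraphLoaderPlus.edgeListFile(sc, "hdfs:///data/weighted-edges.txt",
  canonicalOrientation = true, numEdgePartitions = 8)
println(s"loaded ${graph.vertices.count()} vertices and ${graph.edges.count()} edges")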
Example 152
Source File: SparkFunSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark // scalastyle:off import org.apache.log4j.{Level, Logger} import org.scalatest.{FunSuite, Outcome} import org.apache.spark.Logging final protected override def withFixture(test: NoArgTest): Outcome = { val testName = test.text val suiteName = this.getClass.getName val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") try { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") test() } finally { logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") } } }
Example 153
Source File: ExtendedPlanner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extension import org.apache.spark.Logging import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{SparkPlan, SparkPlanner} def planLaterExt(p: LogicalPlan): SparkPlan = planLater(p) def optimizedPlan(p: LogicalPlan): LogicalPlan = sqlContext.executePlan(p).optimizedPlan def optimizedRelationLookup(u: UnresolvedRelation): Option[LogicalPlan] = { if (sqlContext.catalog.tableExists(u.tableIdentifier)) { Some(optimizedPlan(u)) } else { None } } // TODO (AC) Remove this once table-valued function are rebased on top. def analyze(p: LogicalPlan): LogicalPlan = sqlContext.analyzer.execute(p) override def plan(p: LogicalPlan): Iterator[SparkPlan] = { val iter = strategies.view.flatMap({ strategy => val plans = strategy(p) if (plans.isEmpty) { logTrace(s"Strategy $strategy did not produce plans for $p") } else { logDebug(s"Strategy $strategy produced a plan for $p: ${plans.head}") } plans }).toIterator assert(iter.hasNext, s"No plan for $p") iter } }
Example 154
Source File: DropRunnableCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.Logging import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources.DropRelation import org.apache.spark.sql.{Row, SQLContext} import scala.util.Try private[sql] case class DropRunnableCommand(toDrop: Map[String, Option[DropRelation]]) extends RunnableCommand with Logging { override def run(sqlContext: SQLContext): Seq[Row] = { toDrop.foreach { case (name, dropOption) => sqlContext.dropTempTable(name) dropOption.foreach { dropRelation => Try { dropRelation.dropTable() }.recover { // When the provider indicates an exception while dropping, we still have to continue // dropping all the referencing tables, otherwise there could be integrity issues case ex => logWarning( s"""Error occurred when dropping table '$name':${ex.getMessage}, however |table '$name' will still be dropped from Spark catalog. """.stripMargin) }.get } } Seq.empty } }
Example 155
Source File: SQLRunner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{Logging, SparkContext} import scala.annotation.tailrec protected[cli] case class CLIOptions( sqlFiles: List[String] = Nil, output: Option[String] = None) def main(args: Array[String]): Unit = { def fail(msg: String = USAGE): Unit = { logError(msg) System.exit(1) } val opts = parseOpts(args.toList) val outputStream: OutputStream = opts.output match { case Some(filename) => new FileOutputStream(new File(filename)) case None => System.out } opts.sqlFiles .map((string: String) => new FileInputStream(new File(string))) .foreach(sql(_, outputStream)) } }
Example 156
Source File: NodeTests.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.Logging import org.apache.spark.sql.types.{Node, NodeHelpers, StringType} import scala.collection.mutable.ArrayBuffer // scalastyle:off magic.number // scalastyle:off file.size.limit class NodeTests extends NodeUnitTestSpec with Logging { var nodes = ArrayBuffer[Node]() nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 2L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 3L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 2L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 3L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 1L)) nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 2L)) log.info("Running unit tests for sorting class Node\n") nodes.toArray should equal { // deterministic generator: val myRand = new scala.util.Random(42) // take copy of array-buffer, shuffle it val shuffled_nodes = myRand.shuffle(nodes.toSeq) // shuffled?: shuffled_nodes should not equal nodes.toArray shuffled_nodes.sorted(NodeHelpers.OrderedNode) } log.info("Testing function compareToRecursive\n") val x = Node(null, null) 0 should equal {x.compareToRecursive(Seq(), Seq())} 0 should be > {x.compareToRecursive(Seq(), Seq(1))} 0 should be < {x.compareToRecursive(Seq(1), Seq())} 0 should equal {x.compareToRecursive(Seq(1,2), Seq(1,2))} 0 should be < {x.compareToRecursive(Seq(1,2), Seq(1))} 0 should be > {x.compareToRecursive(Seq(1), Seq(1,2))} } // scalastyle:on magic.number // scalastyle:on file.size.limit
Example 157
Source File: HierarchyJoinBuilderUnitTests.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.sql.types.{IntegerType, Node} import org.apache.spark.Logging import org.apache.spark.sql.Row // scalastyle:off magic.number class HierarchyJoinBuilderUnitTests extends NodeUnitTestSpec with Logging { var jb = new HierarchyJoinBuilder[Row, Row, Long](null, null, null, null, null, null) log.info("Testing function 'extractNodeFromRow'\n") val x = Node(List(1,2,3), IntegerType, List(1L,1L,2L)) Some(x) should equal { jb.extractNodeFromRow(Row.fromSeq(Seq(1,2,3, x))) } None should equal { jb.extractNodeFromRow(Row.fromSeq(Seq(1,2,3))) } None should equal { jb.extractNodeFromRow(Row.fromSeq(Seq())) } log.info("Testing function 'getOrd'\n") None should equal { jb.getOrd(Row.fromSeq(Seq(1,2,3))) } val testValues = List((42L, Some(42L)), (13, Some(13L)), ("hello", None), (1234.56, None)) testValues.foreach( testVal => { val jbWithOrd = new HierarchyJoinBuilder[Row, Row, Long](null, null, null, null, x => testVal._1 , null) testVal._2 should equal { jbWithOrd.getOrd(Row.fromSeq(Seq(x))) } } ) } // scalastyle:on magic.number
Example 158
Source File: ExtractSQLParserSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import com.sap.spark.PlanTest import org.apache.spark.Logging import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parser.SapParserDialect import org.scalatest.FunSuite class ExtractSQLParserSuite extends FunSuite with PlanTest with Logging { // scalastyle:off magic.number val t1 = UnresolvedRelation(TableIdentifier("T1")) val parser = new SapParserDialect test("Parse EXTRACT in SELECT") { val result = parser.parse("SELECT a, EXTRACT(YEAR FROM a) FROM T1") val expected = t1.select(AliasUnresolver('a, Year('a)): _*) comparePlans(expected, result) } test("Parse EXTRACT in WHERE") { val result = parser.parse("SELECT 1 FROM T1 WHERE EXTRACT(MONTH FROM a) = 2015") val expected = t1.where(Month('a) === 2015).select(AliasUnresolver(1): _*) comparePlans(expected, result) } test("Parse EXTRACT in GROUP BY") { val result = parser.parse("SELECT 1 FROM T1 GROUP BY EXTRACT(DAY FROM a)") val expected = t1.groupBy(DayOfMonth('a))(AliasUnresolver(1): _*) comparePlans(expected, result) } }
Example 159
Source File: MathsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.commons.math3.util.FastMath import org.apache.spark.Logging import org.apache.spark.sql.{Row, GlobalSapSQLContext} import org.scalatest.FunSuite class MathsSuite extends FunSuite with GlobalSapSQLContext with Logging { // scalastyle:off magic.number val rowA = DoubleRow("AAA", 1.0) val rowB = DoubleRow("BBB", 2.0) val rowC = DoubleRow("CCC", 0.6) val rowD = DoubleRow("DDD", -1.1) val rowE = DoubleRow("DDD", -1.1) val data = Seq(rowA, rowB) test("ln, log, pow") { val rdd = sc.parallelize(data) val dSrc = sqlContext.createDataFrame(rdd).cache() dSrc.registerTempTable("src") val result1 = sqlContext.sql("SELECT name,d,LN(d) FROM src").collect() assertResult(Row(rowA.name, rowA.d, 0.0) :: Row(rowB.name, rowB.d, scala.math.log(2.0)) :: Nil)(result1) val result2 = sqlContext.sql("SELECT name,d,LOG(10, d) FROM src").collect() assertResult(Row(rowA.name, rowA.d, 0.0) :: Row(rowB.name, rowB.d, scala.math.log(rowB.d) / scala.math.log(10)) :: Nil)(result2) val result3 = sqlContext.sql("SELECT name,d,POWER(d,2) FROM src").collect() assertResult(Row(rowA.name, rowA.d, 1.0) :: Row(rowB.name, rowB.d, 4.0) :: Nil)(result3) } val data2 = Seq(rowC, rowD) test("ceil, floor, round, sign, mod") { val rdd = sc.parallelize(data2) val dSrc = sqlContext.createDataFrame(rdd).cache() dSrc.registerTempTable("src") val result1 = sqlContext.sql("SELECT name, d, CEIL(d), FLOOR(d)," + "ROUND(d,0), SIGN(d), MOD(d,3) FROM src").collect() assertResult(Row(rowC.name, rowC.d, 1.0, 0.0, 1.0, 1.0, 0.6) :: Row(rowD.name, rowD.d, -1.0, -2.0, -1.0, -1.0, -1.1) :: Nil)(result1) } test("cos, SIN, TAN, ACOS, ASIN, ATAN") { val rdd = sc.parallelize(data2) val dSrc = sqlContext.createDataFrame(rdd).cache() dSrc.registerTempTable("src") val result1 = sqlContext.sql("SELECT name, d, COS(d), SIN(d), TAN(d)," + " ACOS(COS(d)), ASIN(SIN(d)), ATAN(TAN(d)) FROM src").collect() assertResult(Row(rowC.name, rowC.d, FastMath.cos(rowC.d), FastMath.sin(rowC.d), FastMath.tan(rowC.d), 0.6, 0.6, 0.6) :: Row(rowD.name, rowD.d, FastMath.cos(rowD.d), FastMath.sin(rowD.d), FastMath.tan(rowD.d), 1.1, -1.1, -1.1) :: Nil)(result1) } }
Example 160
Source File: StringsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.Logging import org.apache.spark.sql.{Row, GlobalSapSQLContext} import org.scalatest.FunSuite class StringsSuite extends FunSuite with GlobalSapSQLContext with Logging { // scalastyle:off magic.number val rowA = StringRow(" AAA") val rowB = StringRow("BBB ") val rowC = StringRow(" CCC ") val rowD = StringRow("DDDDDDD") val rowE = StringRow(null) val dataWithDates = Seq(rowA, rowB, rowC, rowD, rowE) test("string manipulations") { val rdd = sc.parallelize(dataWithDates) val dSrc = sqlContext.createDataFrame(rdd).cache() dSrc.registerTempTable("src") val result1 = sqlContext.sql("SELECT name,TRIM(name),RTRIM(name),LTRIM(name) FROM src").collect() assertResult(Row(rowA.name, "AAA", " AAA", "AAA") :: Row(rowB.name, "BBB", "BBB", "BBB ") :: Row(rowC.name, "CCC", " CCC", "CCC ") :: Row(rowD.name, "DDDDDDD", "DDDDDDD", "DDDDDDD") :: Row(rowE.name, null, null, null) :: Nil)(result1) val result2 = sqlContext.sql("SELECT name,LPAD(name,6,'x'),RPAD(name,6,'xyz') FROM src").collect() assertResult(Row(rowA.name, "xx AAA", " AAAxy") :: Row(rowB.name, "xxBBB ", "BBB xy") :: Row(rowC.name, "x CCC ", " CCC x") :: Row(rowD.name, "DDDDDD", "DDDDDD") :: Row(rowE.name, null, null) :: Nil)(result2) val result3 = sqlContext.sql("SELECT name, LENGTH(name), LOCATE('B', name) FROM src").collect() assertResult(Row(rowA.name, 4, 0) :: Row(rowB.name, 4, 1) :: Row(rowC.name, 5, 0) :: Row(rowD.name, 7, 0) :: Row(rowE.name, null, null) :: Nil)(result3) val result4 = sqlContext.sql("SELECT name, CONCAT(name,'aa') FROM src").collect() assertResult(Row(rowA.name, " AAAaa") :: Row(rowB.name, "BBB aa") :: Row(rowC.name, " CCC aa") :: Row(rowD.name, "DDDDDDDaa") :: Row(rowE.name, null) ::Nil)(result4) val result5 = sqlContext.sql("SELECT name,REPLACE(name,'DD','de'),REVERSE(name) FROM src").collect() assertResult(Row(rowA.name, " AAA", "AAA ") :: Row(rowB.name, "BBB ", " BBB") :: Row(rowC.name, " CCC ", " CCC ") :: Row(rowD.name, "dededeD", "DDDDDDD") :: Row(rowE.name, null, null) :: Nil)(result5) } }
Example 161
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 162
Source File: SapThriftServer.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.sap.thriftserver.SapSQLEnv import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2._ import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab import org.apache.hive.service.server.HiveServerServerOptionsProcessor object SapThriftServer extends Logging { var LOG = LogFactory.getLog(classOf[SapThriftServer]) def main(args: Array[String]) { val optionsProcessor = new HiveServerServerOptionsProcessor("SapThriftServer") if (!optionsProcessor.process(args)) { System.exit(-1) } logInfo("Starting SparkContext") SapSQLEnv.init() org.apache.spark.util.ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() uiTab.foreach(_.detach()) } try { val server = new HiveThriftServer2(SparkSQLEnv.hiveContext) server.init(SparkSQLEnv.hiveContext.hiveconf) server.start() logInfo("SapThriftServer started") listener = new HiveThriftServer2Listener(server, SparkSQLEnv.hiveContext.conf) SparkSQLEnv.sparkContext.addSparkListener(listener) uiTab = if (SparkSQLEnv.sparkContext.getConf.getBoolean("spark.ui.enabled", true)) { Some(new ThriftServerTab(SparkSQLEnv.sparkContext)) } else { None } } catch { case e: Exception => logError("Error starting SapThriftServer", e) System.exit(-1) } } } private[hive] class SapThriftServer(val hiveContext: HiveContext) extends Logging{ def start: Unit = { logInfo("ThriftServer with SapSQLContext") logInfo("Starting SparkContext") HiveThriftServer2.startWithContext(hiveContext) } }
Example 163
Source File: OAuthToken.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi import java.util.concurrent.{ExecutionException, TimeUnit, Executors} import com.microsoft.aad.adal4j.{AuthenticationResult, AuthenticationCallback, AuthenticationContext} import dispatch._ import org.apache.spark.Logging import scala.concurrent.{Await, promise} import scala.util.{Try, Failure, Success} private class OAuthReq(token: OAuthTokenHandler) extends (Req => Req) { override def apply(req: Req): Req = { req <:< Map("Authorization" -> s"Bearer ${token()}") } } private class OAuthTokenHandler(authConf: ClientConf, initialToken: Option[String] = None) extends Logging { private var _token: Option[String] = initialToken def apply(refresh: Boolean = false): String = { _token match { case Some(s) if !refresh => s case _ => { refreshToken match { case Success(s) => { _token = Some(s) s } case Failure(e) => throw e } } } } private def refreshToken: Try[String] = { log.info("refreshing OAuth token") val service = Executors.newFixedThreadPool(1); val context = new AuthenticationContext(authConf.token_uri, true, service) val p = promise[AuthenticationResult] val future = p.future context.acquireToken(authConf.resource, authConf.clientid, authConf.username, authConf.password, new AuthenticationCallback { def onSuccess(result: AuthenticationResult): Unit = { p.success(result) } def onFailure(ex: Throwable): Unit = { p.failure(ex) } }) try { val result = Await.result(future, authConf.timeout) log.info("OAuth token refresh successful") Success(result.getAccessToken) } catch { case e: ExecutionException => Failure(e.getCause) case t: Throwable => Failure(t) } finally { service.shutdown() } } }
Example 164
Source File: TestUtils.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.test import scala.reflect.runtime.{universe => ru} import org.apache.spark.Logging import org.apache.spark.sql.DataFrame object TestUtils extends Logging { // Do benchmark if INFO-log enabled def benchmark(benchName: String)(testFunc: => Unit): Unit = { if (log.isDebugEnabled) { testFunc } } def expectResult(res: Boolean, errMsg: String) = if (res) { logWarning(errMsg) } def invokeFunc(cls: Any, func: String, args: Any*): DataFrame = try { // Invoke a function with the given name via reflection val im = scala.reflect.runtime.currentMirror.reflect(cls) val mSym = im.symbol.typeSignature.member(ru.newTermName(func)).asMethod im.reflectMethod(mSym).apply(args: _*) .asInstanceOf[DataFrame] } catch { case e: Exception => assert(false, s"Invoking ${func} failed because: ${e.getMessage}") null // Not executed } } // TODO: Any same function in o.a.spark.*? class TestDoubleWrapper(d: Double) { // Check an equality between Double values def ~==(d: Double): Boolean = Math.abs(this.d - d) < 0.001 } object TestDoubleWrapper { @inline implicit def toTestDoubleWrapper(d: Double) = new TestDoubleWrapper(d) }
Example 165
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import com.ning.http.client.AsyncCompletionHandler import com.ning.http.client.AsyncHttpClient import com.ning.http.client.Response class HttpInputDStreamAsync( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiverAsync(storageLevel, url) } } class HttpReceiverAsync( storageLevel: StorageLevel, url: String) extends Receiver[String](storageLevel) with Logging { var asyncHttpClient: AsyncHttpClient = _ def onStop() { asyncHttpClient.close() logInfo("Disconnected from Http Server") } def onStart() { asyncHttpClient = new AsyncHttpClient() asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { override def onCompleted(response: Response): Response = { store(response.getResponseBody) return response } override def onThrowable(t: Throwable) { restart("Error! Problems while connecting", t) } }); logInfo("Http Connection initiated") } } object HttpUtilsAsync { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String): DStream[String] = { new HttpInputDStreamAsync(ssc, storageLevel, url) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url) } }
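A short usage sketch (ssc is an existing StreamingContext and the URL is a placeholder): the async client pushes each response body into the stream as it arrives.

val feed = HttpUtilsAsync.createStream(ssc, url = "http://localhost:8080/feed")
feed.print()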
Example 166
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
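A short usage sketch for the polling variant (ssc is an existing StreamingContext; the URL and interval are placeholders): the receiver issues a GET every interval seconds and stores the response body.

val pages = HttpUtils.createStream(ssc, url = "http://localhost:8080/metrics", interval = 30L)
pages.print()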
Example 167
Source File: LibLinearTraining.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import scopt.OptionParser import org.apache.spark.SparkConf import org.apache.spark.SparkContext import se.uu.farmbio.cp.liblinear.LIBLINEAR import org.apache.spark.mllib.util.MLUtils import org.apache.spark.Logging object LibLinearTraining extends Logging { case class Params( trainInputPath: String = null, outputPath: String = null, calibrRatio: Double = 0.2, numberOfCPs: Int = 100, nofOutFiles: Int = 0, dfsBlockSize: String = "8M", master: String = null) def main(args: Array[String]) = { val defaultParams = Params() val parser = new OptionParser[Params]("PubChemTraining") { head("LibLinearTraining: LIBINEAR training procedure") opt[Double]("calibrRatio") .text("fraction of calibration examples") .action((x, c) => c.copy(calibrRatio = x)) opt[Int]("numberOfCPs") .text("number of CPs to train") .action((x, c) => c.copy(numberOfCPs = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) opt[Int]("nofOutFiles") .text("Number of output files. " + "It can be equal to the parallelism level at most " + "(defualt: as much as the parallelism level)") .action((x, c) => c.copy(nofOutFiles = x)) opt[String]("dfsBlockSize") .text("It tunes the Hadoop dfs.block.size property (default:8M)") .action((x, c) => c.copy(dfsBlockSize = x)) arg[String]("<input>") .required() .text("input path to training examples in LIBSVM format") .action((x, c) => c.copy(trainInputPath = x)) arg[String]("<output>") .required() .text("output path to save CPs") .action((x, c) => c.copy(outputPath = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("LibLinearTraining") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Set and log dfs.block.size sc.hadoopConfiguration.set("dfs.block.size", params.dfsBlockSize) val blockSize = sc.hadoopConfiguration.get("dfs.block.size") logInfo(s"dfs.block.size = $blockSize") //Load data //This example assumes the training set to be relatively small //the model data generated will be big instead. val input = MLUtils.loadLibSVMFile(sc, params.trainInputPath) val trainingData = input.collect //Train the CPs val modelData = LIBLINEAR.trainAggregatedICPClassifier( sc, trainingData, params.calibrRatio, params.numberOfCPs) //Save the model in a distributed fashion modelData.save(params.outputPath, params.nofOutFiles) //Stop Spark sc.stop } }
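Besides spark-submit, the job can be driven programmatically through its Params case class; a sketch with placeholder paths and only a few of the defaults overridden:

LibLinearTraining.run(LibLinearTraining.Params(
  trainInputPath = "data/train.libsvm",
  outputPath     = "models/aggregated-cps",
  numberOfCPs    = 50,
  master         = "local[4]"))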
Example 168
package se.uu.farmbio.cp import org.apache.spark.Logging import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object ICP extends Logging { private def simpleSplit( input: RDD[LabeledPoint], numOfCalibSamples: Int) = { //Computing the calibration fraction using binomial upper bound val n = input.count val fraction = numOfCalibSamples.toDouble / n val delta = 1e-4 val minSamplingRate = 1e-10 val gamma = -math.log(delta) / n val calibFraction = math.min(1, math.max(minSamplingRate, fraction + gamma + math.sqrt(gamma * gamma + 2 * gamma * fraction))) //calibFraction is enough most of the times, but not always val splits = input.randomSplit(Array(calibFraction, 1 - calibFraction)) var sample = splits(0).collect while (sample.length < numOfCalibSamples) { logWarning("Needed to re-sample calibration set due to insufficient sample size.") val split = input.randomSplit(Array(calibFraction, 1 - calibFraction)) sample = splits(0).collect } val calibration = sample.take(numOfCalibSamples) val additional = sample.takeRight(sample.length - numOfCalibSamples) val sc = input.context (calibration, splits(1) ++ sc.parallelize(additional)) } private def stratifiedSplit( input: RDD[LabeledPoint], numOfCalibSamples: Int) = { logWarning("Stratified sampling is supported only for binary classification.") //Calibration split, making sure there is some data for both classes val class0 = input.filter(_.label == 0.0) val class1 = input.filter(_.label == 1.0) val count0 = class0.count val count1 = class1.count val posRatio = count1.doubleValue / (count0 + count1) val posSize = if(numOfCalibSamples * posRatio < 19) { logWarning("Raising the number of positive samples to 19 (allows sig >= 0.5)") 19 } else { (numOfCalibSamples * posRatio).ceil.toInt } val negSize = numOfCalibSamples - posSize val (negSmpl, negTr) = ICP.simpleSplit(class0, negSize) val (posSmpl, posTr) = ICP.simpleSplit(class1, posSize) val properTraining = negTr ++ posTr val clalibration = negSmpl ++ posSmpl (clalibration, properTraining) } def calibrationSplit( input: RDD[LabeledPoint], numOfCalibSamples: Int, stratified: Boolean = false) = { if (stratified) { logWarning("Stratified sampling needs to count the dataset, you should use it wisely.") ICP.stratifiedSplit(input, numOfCalibSamples) } else { ICP.simpleSplit(input, numOfCalibSamples) } } def trainClassifier[A <: UnderlyingAlgorithm]( alg: A, numClasses: Int, calibSet: Array[LabeledPoint]): ICPClassifierModel[A] = { //Compute aphas for each class (mondrian approach) val alphas = (0 to numClasses - 1).map { i => calibSet.filter(_.label == i) //filter current label .map(newSmpl => alg.nonConformityMeasure(newSmpl)) //compute alpha } new ICPClassifierModelImpl(alg, alphas) } }
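A sketch of the calibration split in isolation (sc is an existing SparkContext and the path is a placeholder; stratified sampling is only meaningful for binary labels, as the code above warns):

import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/train.libsvm")
val (calibration, properTraining) =
  ICP.calibrationSplit(data, numOfCalibSamples = 100, stratified = true)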
Example 170
Source File: BlockTransferService.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
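uploadBlockSync above is simply a blocking wrapper around the asynchronous uploadBlock future; the same sync-over-async pattern in isolation looks roughly like this (method names are illustrative):

import scala.concurrent.{Await, Future}
import scala.concurrent.duration.Duration
import scala.concurrent.ExecutionContext.Implicits.global

def uploadAsync(data: Array[Byte]): Future[Unit] = Future {
  // ... asynchronous upload would happen here ...
}

def uploadSync(data: Array[Byte]): Unit = {
  // Block the caller until the asynchronous upload completes (or fails)
  Await.result(uploadAsync(data), Duration.Inf)
}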
Example 171
Source File: NettyBlockRpcServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 172
Source File: SortShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle} import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockManager: IndexShuffleBlockManager, handle: BaseShuffleHandle[K, V, C], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val blockManager = SparkEnv.get.blockManager private var sorter: ExternalSorter[K, V, _] = null // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private var mapStatus: MapStatus = null private val writeMetrics = new ShuffleWriteMetrics() context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics) override def stop(success: Boolean): Option[MapStatus] = { try { if (stopping) { return None } stopping = true if (success) { return Option(mapStatus) } else { // The map task failed, so delete our output data. shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId) return None } } finally { // Clean up our sorter, which may have its own intermediate files if (sorter != null) { sorter.stop() sorter = null } } } }
Example 173
Source File: MetricsConfig.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.metrics import java.io.{FileInputStream, InputStream} import java.util.Properties import scala.collection.mutable import scala.util.matching.Regex import org.apache.spark.Logging import org.apache.spark.util.Utils private[spark] class MetricsConfig(val configFile: Option[String]) extends Logging { private val DEFAULT_PREFIX = "*" private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties" private[metrics] val properties = new Properties() private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null private def setDefaultProperties(prop: Properties) { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") prop.setProperty("*.sink.servlet.path", "/metrics/json") prop.setProperty("master.sink.servlet.path", "/metrics/master/json") prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json") } def initialize() { // Add default properties in case there's no properties file setDefaultProperties(properties) // If spark.metrics.conf is not set, try to get file in class path val isOpt: Option[InputStream] = configFile.map(new FileInputStream(_)).orElse { try { Option(Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME)) } catch { case e: Exception => logError("Error loading default configuration file", e) None } } isOpt.foreach { is => try { properties.load(is) } finally { is.close() } } propertyCategories = subProperties(properties, INSTANCE_REGEX) if (propertyCategories.contains(DEFAULT_PREFIX)) { import scala.collection.JavaConversions._ val defaultProperty = propertyCategories(DEFAULT_PREFIX) for { (inst, prop) <- propertyCategories if (inst != DEFAULT_PREFIX) (k, v) <- defaultProperty if (prop.getProperty(k) == null) } { prop.setProperty(k, v) } } } def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] import scala.collection.JavaConversions._ prop.foreach { kv => if (regex.findPrefixOf(kv._1).isDefined) { val regex(prefix, suffix) = kv._1 subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2) } } subProperties } def getInstance(inst: String): Properties = { propertyCategories.get(inst) match { case Some(s) => s case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties) } } }
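The subProperties helper above groups flat metrics keys by their instance prefix; a standalone sketch of the same idea over plain Scala collections (the property names are made up):

import scala.util.matching.Regex

val InstanceRegex: Regex = "^(\\*|[a-zA-Z]+)\\.(.+)".r

val flat = Map(
  "*.sink.servlet.class" -> "org.apache.spark.metrics.sink.MetricsServlet",
  "master.sink.servlet.path" -> "/metrics/master/json")

// group values by the leading instance name ("*", "master", ...)
val grouped: Map[String, Map[String, String]] =
  flat.toSeq
    .collect { case (InstanceRegex(prefix, suffix), value) => (prefix, suffix -> value) }
    .groupBy(_._1)
    .mapValues(_.map(_._2).toMap)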
Example 174
Source File: PythonGatewayServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import java.io.DataOutputStream import java.net.Socket import py4j.GatewayServer import org.apache.spark.Logging import org.apache.spark.util.Utils private[spark] object PythonGatewayServer extends Logging { def main(args: Array[String]): Unit = Utils.tryOrExit { // Start a GatewayServer on an ephemeral port val gatewayServer: GatewayServer = new GatewayServer(null, 0) gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { logError("GatewayServer failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") } // Communicate the bound port back to the caller via the caller-specified callback port val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") val callbackSocket = new Socket(callbackHost, callbackPort) val dos = new DataOutputStream(callbackSocket.getOutputStream) dos.writeInt(boundPort) dos.close() callbackSocket.close() // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { // Do nothing } logDebug("Exiting due to broken pipe from Python driver") System.exit(0) } }
Example 175
Source File: TestClient.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.client import org.apache.spark.{SecurityManager, SparkConf, Logging} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.util.{AkkaUtils, Utils} private[spark] object TestClient { class TestListener extends AppClientListener with Logging { def connected(id: String) { logInfo("Connected to master, got app ID " + id) } def disconnected() { logInfo("Disconnected from master") System.exit(0) } def dead(reason: String) { logInfo("Application died with error: " + reason) System.exit(0) } def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {} def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {} } def main(args: Array[String]) { val url = args(0) val conf = new SparkConf val (actorSystem, _) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0, conf = conf, securityManager = new SecurityManager(conf)) val desc = new ApplicationDescription("TestClient", Some(1), 512, Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored") val listener = new TestListener val client = new AppClient(actorSystem, Array(url), desc, listener, new SparkConf) client.start() actorSystem.awaitTermination() } }
Example 176
Source File: FileSystemPersistenceEngine.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import akka.serialization.Serialization import org.apache.spark.Logging private[spark] class FileSystemPersistenceEngine( val dir: String, val serialization: Serialization) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { new File(dir + File.separator + name).delete() } override def read[T: ClassTag](prefix: String) = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val serializer = serialization.findSerializerFor(value) val serialized = serializer.toBinary(value) val out = new FileOutputStream(file) try { out.write(serialized) } finally { out.close() } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileData = new Array[Byte](file.length().asInstanceOf[Int]) val dis = new DataInputStream(new FileInputStream(file)) try { dis.readFully(fileData) } finally { dis.close() } val clazz = m.runtimeClass.asInstanceOf[Class[T]] val serializer = serialization.serializerFor(clazz) serializer.fromBinary(fileData).asInstanceOf[T] } }
Example 177
Source File: SparkCuratorUtil.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import scala.collection.JavaConversions._ import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory} import org.apache.curator.retry.ExponentialBackoffRetry import org.apache.zookeeper.KeeperException import org.apache.spark.{Logging, SparkConf} object SparkCuratorUtil extends Logging { val ZK_CONNECTION_TIMEOUT_MILLIS = 15000 val ZK_SESSION_TIMEOUT_MILLIS = 60000 val RETRY_WAIT_MILLIS = 5000 val MAX_RECONNECT_ATTEMPTS = 3 def newClient(conf: SparkConf): CuratorFramework = { val ZK_URL = conf.get("spark.deploy.zookeeper.url") val zk = CuratorFrameworkFactory.newClient(ZK_URL, ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS, new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS)) zk.start() zk } def mkdir(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) == null) { try { zk.create().creatingParentsIfNeeded().forPath(path) } catch { case nodeExist: KeeperException.NodeExistsException => // do nothing, ignore node existing exception. case e: Exception => throw e } } } def deleteRecursive(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) != null) { for (child <- zk.getChildren.forPath(path)) { zk.delete().forPath(path + "/" + child) } zk.delete().forPath(path) } } }
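A hedged sketch of how the helper above might be called from another component; the ZooKeeper hosts and path are illustrative, and spark.deploy.zookeeper.url must already point at a reachable ensemble:

import org.apache.spark.SparkConf

val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181")
val zk = SparkCuratorUtil.newClient(conf)
try {
  SparkCuratorUtil.mkdir(zk, "/spark/example_dir")            // create the node if missing
  SparkCuratorUtil.deleteRecursive(zk, "/spark/example_dir")  // remove it and any children
} finally {
  zk.close()
}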
Example 178
Source File: ZooKeeperLeaderElectionAgent.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import akka.actor.ActorRef import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.master.MasterMessages._ import org.apache.curator.framework.CuratorFramework import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch} private[spark] class ZooKeeperLeaderElectionAgent(val masterActor: LeaderElectable, conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election" private var zk: CuratorFramework = _ private var leaderLatch: LeaderLatch = _ private var status = LeadershipStatus.NOT_LEADER start() def start() { logInfo("Starting ZooKeeper LeaderElection agent") zk = SparkCuratorUtil.newClient(conf) leaderLatch = new LeaderLatch(zk, WORKING_DIR) leaderLatch.addListener(this) leaderLatch.start() } override def stop() { leaderLatch.close() zk.close() } override def isLeader() { synchronized { // could have lost leadership by now. if (!leaderLatch.hasLeadership) { return } logInfo("We have gained leadership") updateLeadershipStatus(true) } } override def notLeader() { synchronized { // could have gained leadership by now. if (leaderLatch.hasLeadership) { return } logInfo("We have lost leadership") updateLeadershipStatus(false) } } def updateLeadershipStatus(isLeader: Boolean) { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER masterActor.electedLeader() } else if (!isLeader && status == LeadershipStatus.LEADER) { status = LeadershipStatus.NOT_LEADER masterActor.revokedLeadership() } } private object LeadershipStatus extends Enumeration { type LeadershipStatus = Value val LEADER, NOT_LEADER = Value } }
Example 179
Source File: ZooKeeperPersistenceEngine.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import akka.serialization.Serialization import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.{Logging, SparkConf} private[spark] class ZooKeeperPersistenceEngine(conf: SparkConf, val serialization: Serialization) extends PersistenceEngine with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String) = { val file = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix)) file.map(deserializeFromFile[T]).flatten } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serializer = serialization.findSerializerFor(value) val serialized = serializer.toBinary(value) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, serialized) } def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) val clazz = m.runtimeClass.asInstanceOf[Class[T]] val serializer = serialization.serializerFor(clazz) try { Some(serializer.fromBinary(fileData).asInstanceOf[T]) } catch { case e: Exception => { logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } } }
Example 180
Source File: WorkerWebUI.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.worker.Worker import org.apache.spark.deploy.worker.ui.WorkerWebUI._ import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.AkkaUtils def initialize() { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr)) } } private[spark] object WorkerWebUI { val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR }
Example 181
Source File: WorkerWatcher.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import akka.actor.{Actor, Address, AddressFromURIString} import akka.remote.{AssociatedEvent, AssociationErrorEvent, AssociationEvent, DisassociatedEvent, RemotingLifecycleEvent} import org.apache.spark.Logging import org.apache.spark.deploy.DeployMessages.SendHeartbeat import org.apache.spark.util.ActorLogReceive private[spark] class WorkerWatcher(workerUrl: String) extends Actor with ActorLogReceive with Logging { override def preStart() { context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent]) logInfo(s"Connecting to worker $workerUrl") val worker = context.actorSelection(workerUrl) worker ! SendHeartbeat // need to send a message here to initiate connection } // Used to avoid shutting down JVM during tests private[deploy] var isShutDown = false private[deploy] def setTesting(testing: Boolean) = isTesting = testing private var isTesting = false // Lets us filter events only from the worker's actor system private val expectedHostPort = AddressFromURIString(workerUrl).hostPort private def isWorker(address: Address) = address.hostPort == expectedHostPort def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1) override def receiveWithLogging = { case AssociatedEvent(localAddress, remoteAddress, inbound) if isWorker(remoteAddress) => logInfo(s"Successfully connected to $workerUrl") case AssociationErrorEvent(cause, localAddress, remoteAddress, inbound, _) if isWorker(remoteAddress) => // These logs may not be seen if the worker (and associated pipe) has died logError(s"Could not initialize connection to worker $workerUrl. Exiting.") logError(s"Error was: $cause") exitNonZero() case DisassociatedEvent(localAddress, remoteAddress, inbound) if isWorker(remoteAddress) => // This log message will never be seen logError(s"Lost connection to worker actor $workerUrl. Exiting.") exitNonZero() case e: AssociationEvent => // pass through association events relating to other remote actor systems case e => logWarning(s"Received unexpected actor system event: $e") } }
Example 182
Source File: StandaloneWorkerShuffleService.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import org.apache.spark.{Logging, SparkConf, SecurityManager} import org.apache.spark.network.TransportContext import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.sasl.SaslRpcHandler import org.apache.spark.network.server.TransportServer import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler def startIfEnabled() { if (enabled) { require(server == null, "Shuffle server already started") logInfo(s"Starting shuffle service on port $port with useSasl = $useSasl") server = transportContext.createServer(port) } } def stop() { if (enabled && server != null) { server.close() server = null } } }
Example 183
Source File: HistoryServerArguments.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.Utils private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging { private var propertiesFile: String = null parse(args.toList) private def parse(args: List[String]): Unit = { args match { case ("--dir" | "-d") :: value :: tail => logWarning("Setting log directory through the command line is deprecated as of " + "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") conf.set("spark.history.fs.logDirectory", value) System.setProperty("spark.history.fs.logDirectory", value) parse(tail) case ("--help" | "-h") :: tail => printUsageAndExit(0) case ("--properties-file") :: value :: tail => propertiesFile = value parse(tail) case Nil => case _ => printUsageAndExit(1) } } // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) private def printUsageAndExit(exitCode: Int) { System.err.println( """ |Usage: HistoryServer [options] | |Options: | --properties-file FILE Path to a custom Spark properties file. | Default is conf/spark-defaults.conf. | |Configuration options can be set by setting the corresponding JVM system property. |History Server options are always available; additional options depend on the provider. | |History Server options: | | spark.history.ui.port Port where server will listen for connections | (default 18080) | spark.history.acls.enable Whether to enable view acls for all applications | (default false) | spark.history.provider Name of history provider class (defaults to | file system-based provider) | spark.history.retainedApplications Max number of application UIs to keep loaded in memory | (default 50) |FsHistoryProvider options: | | spark.history.fs.logDirectory Directory where app logs are stored | (default: file:/tmp/spark-events) | spark.history.fs.updateInterval How often to reload log data from storage | (in seconds, default: 10) |""".stripMargin) System.exit(exitCode) } }
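The parse method above is the classic recursive pattern match over List[String]; the same idiom in isolation, with made-up flags:

def parseArgs(args: List[String], acc: Map[String, String] = Map.empty): Map[String, String] =
  args match {
    case "--dir" :: value :: tail =>
      parseArgs(tail, acc + ("logDirectory" -> value))
    case "--properties-file" :: value :: tail =>
      parseArgs(tail, acc + ("propertiesFile" -> value))
    case Nil =>
      acc
    case unknown :: _ =>
      sys.error(s"Unrecognized option: $unknown")
  }

// parseArgs(List("--dir", "/tmp/spark-events")) == Map("logDirectory" -> "/tmp/spark-events")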
Example 184
Source File: LocalSparkCluster.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import akka.actor.ActorSystem import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.worker.Worker import org.apache.spark.deploy.master.Master import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val (workerSystem, _) = Worker.startSystemAndActor(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerActorSystems += workerSystem } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected // TODO: In Akka 2.1.x, ActorSystem.awaitTermination hangs when you have remote actors! // This is unfortunate, but for now we just comment it out. workerActorSystems.foreach(_.shutdown()) // workerActorSystems.foreach(_.awaitTermination()) masterActorSystems.foreach(_.shutdown()) // masterActorSystems.foreach(_.awaitTermination()) masterActorSystems.clear() workerActorSystems.clear() } }
Example 185
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.AkkaUtils private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), SparkEnv.driverActorSystemName, sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port"), CoarseGrainedSchedulerBackend.ACTOR_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 186
Source File: MesosTaskLaunchData.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import java.nio.ByteBuffer import org.apache.mesos.protobuf.ByteString import org.apache.spark.Logging private[spark] case class MesosTaskLaunchData( serializedTask: ByteBuffer, attemptNumber: Int) extends Logging { def toByteString: ByteString = { val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit) dataBuffer.putInt(attemptNumber) dataBuffer.put(serializedTask) dataBuffer.rewind logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]") ByteString.copyFrom(dataBuffer) } } private[spark] object MesosTaskLaunchData extends Logging { def fromByteString(byteString: ByteString): MesosTaskLaunchData = { val byteBuffer = byteString.asReadOnlyByteBuffer() logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]") val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes val serializedTask = byteBuffer.slice() // subsequence starting at the current position MesosTaskLaunchData(serializedTask, attemptNumber) } }
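The pair of methods above is a small ByteBuffer framing exercise (a 4-byte attempt number followed by the task payload); a self-contained round trip of the same layout, with a made-up payload:

import java.nio.ByteBuffer

val task = ByteBuffer.wrap("serialized-task-bytes".getBytes("UTF-8"))

// encode: [attemptNumber: Int][task bytes]
val buf = ByteBuffer.allocate(4 + task.limit)
buf.putInt(3)
buf.put(task)
buf.rewind()

// decode: reading the Int advances the position, slice() keeps the rest
val attemptNumber = buf.getInt
val payload = buf.slice()
assert(attemptNumber == 3 && payload.remaining == task.limit)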
Example 187
Source File: ReplayListenerBus.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io.{InputStream, IOException} import scala.io.Source import org.json4s.jackson.JsonMethods._ import org.apache.spark.Logging import org.apache.spark.util.JsonProtocol def replay(logData: InputStream, sourceName: String): Unit = { var currentLine: String = null var lineNumber: Int = 1 try { val lines = Source.fromInputStream(logData).getLines() lines.foreach { line => currentLine = line postToAll(JsonProtocol.sparkEventFromJson(parse(line))) lineNumber += 1 } } catch { case ioe: IOException => throw ioe case e: Exception => logError(s"Exception parsing Spark event log: $sourceName", e) logError(s"Malformed line #$lineNumber: $currentLine\n") } } }
Example 188
Source File: SparkUncaughtExceptionHandler.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.Logging private[spark] object SparkUncaughtExceptionHandler extends Thread.UncaughtExceptionHandler with Logging { override def uncaughtException(thread: Thread, exception: Throwable) { try { logError("Uncaught exception in thread " + thread, exception) // We may have been called from a shutdown hook. If so, we must not call System.exit(). // (If we do, we will deadlock.) if (!Utils.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(SparkExitCode.OOM) } else { System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) } } } catch { case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) } } def uncaughtException(exception: Throwable) { uncaughtException(Thread.currentThread(), exception) } }
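A sketch of how a handler like the one above is typically installed process-wide, using only the plain JDK hooks (the worker thread here is illustrative):

// Install as the default handler for any thread that dies with an uncaught exception
Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)

// Or attach it to a specific thread only
val worker = new Thread(new Runnable {
  override def run(): Unit = throw new RuntimeException("boom")
})
worker.setUncaughtExceptionHandler(SparkUncaughtExceptionHandler)
worker.start()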
Example 189
Source File: BlockManagerSlaveActor.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.Future import akka.actor.{ActorRef, Actor} import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.ActorLogReceive private[storage] class BlockManagerSlaveActor( blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends Actor with ActorLogReceive with Logging { import context.dispatcher // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveWithLogging = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, sender) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, sender) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, sender) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, sender) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => sender ! blockManager.getStatus(blockId) case GetMatchingBlockIds(filter, _) => sender ! blockManager.getMatchingBlockIds(filter) } private def doAsync[T](actionMessage: String, responseActor: ActorRef)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) responseActor ! response logDebug("Sent response: " + response + " to " + responseActor) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) responseActor ! null.asInstanceOf[T] } } }
Example 190
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.streaming.datasource.config.ParametersUtils import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator} import org.apache.spark.{Logging, Partition, TaskContext} private[datasource] class DatasourceRDD( @transient sqlContext: SQLContext, inputSentences: InputSentences, datasourceParams: Map[String, String] ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils { private var totalCalculated: Option[Long] = None private val InitTableName = "initTable" private val LimitedTableName = "limitedTable" private val TempInitQuery = s"select * from $InitTableName" val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset => val parsedQuery = parseInitialQuery val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery) val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty) val limitSentence = inputSentences.extractLimitSentence sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence) } private def parseInitialQuery: String = { if (inputSentences.query.toUpperCase.contains("WHERE") || inputSentences.query.toUpperCase.contains("ORDER") || inputSentences.query.toUpperCase.contains("LIMIT") ) { sqlContext.sql(inputSentences.query).registerTempTable(InitTableName) TempInitQuery } else inputSentences.query } def progressInputSentences: InputSentences = { if (!dataFrame.rdd.isEmpty()) { inputSentences.offsetConditions.fold(inputSentences) { case offset => val offsetValue = if (offset.limitRecords.isEmpty) dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) else { dataFrame.registerTempTable(LimitedTableName) val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " + s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1" sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) } inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy( value = Option(offsetValue), operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator))))) } } else inputSentences } override def isEmpty(): Boolean = { totalCalculated.fold { withScope { partitions.length == 0 || take(1).length == 0 } } { total => total == 0L } } override def getPartitions: Array[Partition] = dataFrame.rdd.partitions override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context) override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart) }
Example 191
Source File: SparkEsTransportClientManager.scala From Spark2Elasticsearch with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.elasticsearch.transport import com.github.jparkie.spark.elasticsearch.conf.SparkEsTransportClientConf import org.apache.spark.Logging import org.elasticsearch.client.Client import org.elasticsearch.client.transport.TransportClient import org.elasticsearch.common.settings.Settings import org.elasticsearch.common.transport.InetSocketTransportAddress import scala.collection.mutable private[elasticsearch] trait SparkEsTransportClientManager extends Serializable with Logging { @transient private[transport] val internalTransportClients = mutable.HashMap.empty[SparkEsTransportClientConf, TransportClient] private[transport] def buildTransportSettings(clientConf: SparkEsTransportClientConf): Settings = { val esSettingsBuilder = Settings.builder() clientConf.transportSettings foreach { currentSetting => esSettingsBuilder.put(currentSetting._1, currentSetting._2) } esSettingsBuilder.build() } private[transport] def buildTransportClient(clientConf: SparkEsTransportClientConf, esSettings: Settings): TransportClient = { import SparkEsTransportClientConf._ val esClient = TransportClient.builder() .settings(esSettings) .build() getTransportAddresses(clientConf.transportAddresses, clientConf.transportPort) foreach { inetSocketAddress => esClient.addTransportAddresses(new InetSocketTransportAddress(inetSocketAddress)) } sys.addShutdownHook { logInfo("Closed Elasticsearch Transport Client.") esClient.close() } logInfo(s"Connected to the following Elasticsearch nodes: ${esClient.connectedNodes()}.") esClient } def closeTransportClient(clientConf: SparkEsTransportClientConf): Unit = synchronized { internalTransportClients.remove(clientConf) match { case Some(transportClient) => transportClient.close() case None => logError(s"No TransportClient for $clientConf.") } } } object SparkEsTransportClientManager extends SparkEsTransportClientManager
Example 192
Source File: MessageDelimiter.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.Logging import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal} import org.apache.spark.sql.types.StructType class MessageDelimiter extends MessageToRowConverter with Logging { val delimiter = " " def toRow(msg: String, schema: StructType): InternalRow = { val splitted = msg.split(delimiter).map(Literal(_)) val casted = splitted.indices.map(i => Cast(splitted(i), schema(i).dataType).eval(EmptyRow)) InternalRow.fromSeq(casted) } def toMessage(row: Row): String = row.mkString(delimiter) } trait MessageToRowConverter extends Serializable { def toRow(message: String, schema: StructType): InternalRow def toMessage(row: Row): String }
Example 193
Source File: ApspResult.scala From spark-all-pairs-shortest-path with Apache License 2.0 | 5 votes |
import java.io.Serializable import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.distributed.BlockMatrix import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel class ApspResult ( var size: Long, var distMatrix: BlockMatrix) extends Serializable with Logging{ validateResult(distMatrix) private def validateResult(result: BlockMatrix): Unit = { require(result.numRows == result.numCols, "The shortest distance matrix is not square.") require(size == result.numRows, s"The size of the shortest distance matrix does not match $size.") if (result.blocks.getStorageLevel == StorageLevel.NONE) { logWarning("The APSP result is not cached. Lookup could be slow") } } def lookupDist(srcId: Long, dstId: Long): Double = { val sizePerBlock = distMatrix.rowsPerBlock val rowBlockId = (srcId/sizePerBlock).toInt val colBlockId = (dstId/sizePerBlock).toInt val block = distMatrix.blocks.filter{case ((i, j), _) => ( i == rowBlockId) & (j == colBlockId)} .first._2 block.toArray((dstId % sizePerBlock).toInt * block.numRows + (srcId % sizePerBlock).toInt) } def toLocal(): Matrix = { distMatrix.toLocalMatrix() } }
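Assuming an ApspResult instance apspResult produced elsewhere, a hedged usage sketch of the lookup above (vertex ids are illustrative):

// distance from vertex 0 to vertex 5, read out of the cached BlockMatrix
val d: Double = apspResult.lookupDist(0L, 5L)

// or materialize the full matrix locally when it is small enough to fit in memory
val local = apspResult.toLocal()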
Example 194
Source File: SessionStats.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark import com.giorgioinf.twtml.web.WebClient import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.viz.lightning.{Lightning,Visualization} import scala.util.Try class SessionStats(conf:ConfArguments) extends Logging { def lgn = Lightning(conf.lightning) def web = WebClient(conf.twtweb) var viz:Visualization = _ // blue val realColorDet = Array(173.0, 216.0, 230.0) val realColor = Array(30.0, 144.0, 255.0) // yellow val predColorDet = Array(238.0, 232.0, 170.0) val predColor = Array(255.0, 215.0, 0.0) def update(count:Long, batch:Long, mse:Double, realStdev:Double, predStdev:Double, real:Array[Double], pred:Array[Double]) { val realStdevArr = Array.fill(batch.toInt)(realStdev) val predStdevArr = Array.fill(batch.toInt)(predStdev) Try(web.stats(count, batch, mse.toLong, realStdev.toLong, predStdev.toLong)) Try(lgn.lineStreaming( series = Array(real, pred, realStdevArr, predStdevArr), viz = viz)) } def open():this.type = { log.info("Initializing plot on lightning server: {}", conf.lightning) // lgn.createSession(conf.appName) // if (lgn.session.nonEmpty) { // log.info("lightning server session: {}/sessions/{}{}", conf.lightning, lgn.session, "") // } else { // log.warn("lightning server session is empty") // } // plot new graph viz = lgn.lineStreaming( series = Array.fill(4)(Array(0.0)), size = Array(1.0, 1.0, 2.0, 2.0), color = Array(realColorDet, predColorDet, realColor, predColor)) log.info("lightning server session: \n {}/sessions/{}\n {}/visualizations/{}/pym", conf.lightning, viz.lgn.session, conf.lightning, viz.id) log.info("Initializing config on web server: {}", conf.twtweb) // send config to web server Try(web.config(viz.lgn.session, lgn.host, List(viz.id))) this } }
Example 195
Source File: LinearRegression.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark import org.apache.spark.{Logging, SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter.TwitterUtils object LinearRegression extends Logging { def main(args: Array[String]) { log.info("Parsing applications arguments") val conf = new ConfArguments() .setAppName("twitter-stream-ml-linear-regression") .parse(args.toList) log.info("Initializing session stats...") val session = new SessionStats(conf).open log.info("Initializing Spark Machine Learning Model...") MllibHelper.reset(conf) val model = new StreamingLinearRegressionWithSGD() .setNumIterations(conf.numIterations) .setStepSize(conf.stepSize) .setMiniBatchFraction(conf.miniBatchFraction) .setInitialWeights(Vectors.zeros(MllibHelper.numFeatures)) log.info("Initializing Spark Context...") val sc = new SparkContext(conf.sparkConf) log.info("Initializing Streaming Spark Context... {} sec/batch", conf.seconds) val ssc = new StreamingContext(sc, Seconds(conf.seconds)) log.info("Initializing Twitter stream...") val stream = TwitterUtils.createStream(ssc, None) .filter(MllibHelper.filtrate) .map(MllibHelper.featurize) .cache() log.info("Initializing prediction model...") val count = sc.accumulator(0L, "count") stream.foreachRDD({ rdd => if (rdd.isEmpty) log.debug("batch: 0") else { val realPred = rdd.map{ lb => (lb.label, Utils.round(model.latestModel.predict(lb.features))) } val batch = rdd.count count += batch val real = realPred.map(_._1) val pred = realPred.map(_._2) val realStdev = Utils.round(real.stdev) val predStdev = Utils.round(pred.stdev) val mse = Utils.round(realPred.map{case(v, p) => math.pow((v - p), 2)}.mean()) if (log.isDebugEnabled) { log.debug("count: {}", count) // batch, mse (training mean squared error) log.debug("batch: {}, mse: {}", batch, mse) log.debug("stdev (real, pred): ({}, {})", realStdev.toLong, predStdev.toLong) log.debug("value (real, pred): {} ...", realPred.take(10).toArray) } session.update(count.value, batch, mse, realStdev, predStdev, real.toArray, pred.toArray); } }) log.info("Initializing training model...") // training after prediction model.trainOn(stream) // Start the streaming computation ssc.start() log.info("Initialization complete.") ssc.awaitTermination() } }
Example 196
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

  def reset(conf: ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures
    // reassign the derived fields so later featurization picks up the new configuration
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray
    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase
    // Separate accents from base characters and then strip the combining marks
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")
    // hash character bigrams of the normalized text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)
    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint(
      statuses.getRetweetedStatus.getRetweetCount.toDouble,
      features
    )
  }

  def retweetInterval(statuses: Status, start: Long, end: Long): Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
}
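The text featurization above hashes character bigrams into a fixed-size sparse vector; a minimal standalone sketch of that single step (the bucket count is illustrative):

import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.SparseVector

val hashText = new HashingTF(1000)                 // 1000 hash buckets
val bigrams = "spark streaming".toLowerCase.sliding(2).toSeq
val features = hashText.transform(bigrams).asInstanceOf[SparseVector]
// features.indices and features.values are what featurize above feeds into Vectors.sparse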
Example 197
Source File: LogUtils.scala From Spark-MLlib-Twitter-Sentiment-Analysis with Apache License 2.0 | 5 votes |
package org.p7h.spark.sentiment.utils import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SparkContext} object LogUtils extends Logging { def setLogLevels(sparkContext: SparkContext) { sparkContext.setLogLevel(Level.WARN.toString) val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements if (!log4jInitialized) { logInfo( """Setting log level to [WARN] for streaming executions. |To override add a custom log4j.properties to the classpath.""".stripMargin) Logger.getRootLogger.setLevel(Level.WARN) } } }
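Usage is a one-liner once a SparkContext exists; a hedged sketch (app name and master are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("sentiment-demo").setMaster("local[2]"))
LogUtils.setLogLevels(sc) // quiets INFO noise before the streaming job starts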
Example 198
Source File: CustomReceiver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import java.io.{InputStreamReader, BufferedReader, InputStream} import java.net.Socket import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8")) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
Example 199
Source File: StreamingExamples.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.Logging import org.apache.log4j.{Level, Logger} def setStreamingLogLevels() { val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements if (!log4jInitialized) { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [WARN] for streaming example." + " To override add a custom log4j.properties to the classpath.") Logger.getRootLogger.setLevel(Level.WARN) } } }
Example 200
Source File: GraphLoader.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx import org.apache.spark.storage.StorageLevel import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, numEdgePartitions: Int = -1, edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) : Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions val lines = if (numEdgePartitions > 0) { sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) } else { sc.textFile(path) } val edges = lines.mapPartitionsWithIndex { (pid, iter) => val builder = new EdgePartitionBuilder[Int, Int] iter.foreach { line => if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { throw new IllegalArgumentException("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong if (canonicalOrientation && srcId > dstId) { builder.add(dstId, srcId, 1) } else { builder.add(srcId, dstId, 1) } } } Iterator((pid, builder.toEdgePartition)) }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) edges.count() logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel) } // end of edgeListFile }
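A hedged usage sketch of the loader above, assuming an existing SparkContext sc and an edge list where each non-comment line is "<srcId> <dstId>" (the path is illustrative):

import org.apache.spark.graphx.GraphLoader

val graph = GraphLoader.edgeListFile(sc, "data/followers.txt", canonicalOrientation = true)
println(s"vertices = ${graph.vertices.count()}, edges = ${graph.edges.count()}")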