org.apache.spark.Logging Scala Examples

The following examples show how to use org.apache.spark.Logging. The source project and license for each example are noted on the line above its code.
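
Before the project examples, here is a minimal sketch of the pattern they all share: mix the Logging trait into a class and call the log methods it provides (logInfo, logDebug, logWarning, logError). The class name IngestJob and its contents are hypothetical, and the sketch assumes a Spark 1.x dependency, where org.apache.spark.Logging is still a public trait.

import org.apache.spark.Logging

// Mixing in Logging gives this class a lazily created, class-named logger.
class IngestJob extends Logging {
  def run(records: Seq[String]): Unit = {
    logInfo(s"Starting ingest of ${records.size} records")
    records.foreach { r =>
      if (r.isEmpty) logWarning("Skipping empty record")
      else logDebug(s"Processing record: $r")
    }
    logInfo("Ingest finished")
  }
}
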
Example 1
Source File: CatalogSuite.scala    From shc with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.Logging
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

class CatalogSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging {
  def catalog = s"""{
            |"table":{"namespace":"default", "name":"table1"},
            |"rowkey":"key1:key2",
            |"columns":{
              |"col00":{"cf":"rowkey", "col":"key1", "type":"string", "length":"6"},
              |"col01":{"cf":"rowkey", "col":"key2", "type":"int"},
              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
              |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
              |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
              |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
              |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
              |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
              |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
            |}
          |}""".stripMargin

  test("Catalog meta data check") {
    val m = HBaseTableCatalog(Map(HBaseTableCatalog.tableCatalog -> catalog))
    assert(m.row.varLength == false)
    assert(m.row.length == 10)
  }

} 
Example 2
Source File: DataTypeConverter.scala    From shc with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.{SparkContext, Logging}

class DataTypeConverter extends SHC with Logging {
  ignore("Basic setup") {
    val sc = new SparkContext("local", "HBaseTest", conf)
    val sqlContext = new SQLContext(sc)

    val complex = s"""MAP<int, struct<varchar:string>>"""
    val schema =
      s"""{"namespace": "example.avro",
         |   "type": "record",      "name": "User",
         |    "fields": [      {"name": "name", "type": "string"},
         |      {"name": "favorite_number",  "type": ["int", "null"]},
         |        {"name": "favorite_color", "type": ["string", "null"]}      ]    }""".stripMargin

    val catalog = s"""{
            |"table":{"namespace":"default", "name":"htable"},
            |"rowkey":"key1:key2",
            |"columns":{
              |"col1":{"cf":"rowkey", "col":"key1", "type":"binary"},
              |"col2":{"cf":"rowkey", "col":"key2", "type":"double"},
              |"col3":{"cf":"cf1", "col":"col1", "avro":"schema1"},
              |"col4":{"cf":"cf1", "col":"col2", "type":"string"},
              |"col5":{"cf":"cf1", "col":"col3", "type":"double",
              |"sedes":"org.apache.spark.sql.execution.datasources.hbase.DoubleSedes"},
              |"col6":{"cf":"cf1", "col":"col4", "type":"$complex"}
            |}
          |}""".stripMargin
    val df =
      sqlContext.read.options(
        Map("schema1"->schema, HBaseTableCatalog.tableCatalog->catalog))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .load()
    df.write.options(
      Map("schema1"->schema, HBaseTableCatalog.tableCatalog->catalog))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()

    //val s = df.filter((($"col1" < Array(10.toByte)) and ($"col1" > Array(1.toByte))) or ($"col1" === Array(11.toByte))).select("col1")
    //val s = df.filter(Column("col1").<(Array(10.toByte)).and(Column("col1").>(Array(1.toByte))).or(Column("col1") === Array(11.toByte))).select("col1")
    // val s = df.filter((($"col1" < Array(10.toByte)) && ($"col1" > Array(1.toByte))) || ($"col1" === Array(11.toByte))).select("col1")
    //val s = df.filter(($"col1" < Array(10.toByte) && $"col1" > Array(1.toByte)) || $"col1" === Array(11.toByte) || $"col2" === 2.3).select("col1") // range should be (None, None)
    val s = df.filter(($"col1" < Array(10.toByte) &&
      $"col1" > Array(1.toByte)) ||
      $"col1" === Array(11.toByte) &&
        $"col2" === 2.3)
      .select("col1")
    s.count()
    df.registerTempTable("table")
    val c = sqlContext.sql("select count(col1) from table")
    // c.queryExecution
    c.show
    val se = df.filter($"col2" > 12).filter($"col4" < Array(10.toByte)).select("col1")

    val se1 = df.filter($"col2" > 12).filter($"col4" < Array(10.toByte)).select("col1")
    se.count()
    se1.collect.foreach(println(_))
    println(df)


  }
} 
Example 3
Source File: SignRandomProjectionLSH.scala    From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import scala.collection.immutable.BitSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.hashing.MurmurHash3

import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.Logging

class SignRandomProjectionLSH(poolSize: Int = 10000) extends Serializable with Logging {
  val pool = SignRandomProjectionLSH.generatePool(poolSize)
  
  def computeSignature(vector: SparseVector, length: Int): BitSet = {
    val buf = ArrayBuffer.empty[Int]
    
    val elements = vector.indices.zip(vector.values)
    for (bit <- 1 to length) {
      val components = elements.map(e => {
          val hash      = MurmurHash3.productHash((bit, e._1))
          val poolIndex = ((hash % poolSize) + poolSize) % poolSize
          val result    = e._2 * pool(poolIndex)
          result
      })

      val dotProduct = components.reduce(_ + _)
      if (dotProduct > 0) {
        buf += bit
      }
    }

    BitSet(buf.toArray:_*)
  }
  
}

object SignRandomProjectionLSH {
  def signatureSet(length: Int): Set[BitSet] = {
    BitSet(1 to length:_*).subsets.toSet
  }

  def estimateCosine(a: BitSet, b: BitSet, length: Int): Double = {
    val hammingDistance = (a^b).size
    math.cos(hammingDistance.toDouble/length.toDouble*math.Pi)
  }

  private def generatePool(size: Int): Array[Double] = {
    val rand = new Random()
    val buf  = ArrayBuffer.fill[Double](size)(rand.nextGaussian)
    buf.toArray
  }
} 
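
A hypothetical usage sketch for SignRandomProjectionLSH (the vectors and the 16-bit signature length are made up): hash two sparse vectors to bit signatures, then estimate their cosine similarity from the Hamming distance between the signatures.

import org.apache.spark.mllib.linalg.SparseVector

val lsh = new SignRandomProjectionLSH()
// Two toy 5-dimensional sparse vectors.
val a = new SparseVector(5, Array(0, 2, 4), Array(1.0, 0.5, 2.0))
val b = new SparseVector(5, Array(0, 2, 3), Array(0.9, 0.6, 1.0))
val sigA = lsh.computeSignature(a, 16)
val sigB = lsh.computeSignature(b, 16)
// The estimate maps the fraction of differing bits back to a cosine value.
println(SignRandomProjectionLSH.estimateCosine(sigA, sigB, 16))
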
Example 4
Source File: Driver.scala    From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import scala.io.Source

import org.apache.spark.{SparkContext, SparkConf, Logging}
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._

object Driver extends Logging {
  val serializerClasses: Array[Class[_]] = Array(
    classOf[Document], classOf[Sentence],
    classOf[SentenceTokens], classOf[SentenceFeatures],
    classOf[Featurizer], classOf[SignRandomProjectionLSH],
    classOf[LexRank]
  )

  private def selectExcerpts(sentences: RDD[Sentence], scores: VertexRDD[Double], length: Int) = {
    scores
      .join(sentences.map(s => (s.id, s)))
      .map { case (_, (score, sentence)) => (sentence.docId, (score, sentence.id, sentence.text)) }
      .groupByKey()
      .flatMap { case (docId, sentences) => sentences.toSeq.sortWith(_._1 > _._1).take(length).map(e => (docId, e._3)) }
  }

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("Summarizer")
    sparkConf.registerKryoClasses(serializerClasses)

    val sc        = new SparkContext(sparkConf)
    val config    = new Configuration(args)
    
    val documents = sc.textFile(config.inputPath, minPartitions = config.partitions).flatMap( 
      _.split('\t').toList match {
        case List(docId, text @ _*) => Some((docId.trim, text.mkString(" ")))
        case _                 => None
      }
    ).map(Document.tupled).filter(d => d.id.length > 0)

    val segmenter = new DocumentSegmenter
    val (sentences, tokenized) = segmenter(documents)

    val tokenizedFilteredByLength = tokenized.filter(t => t.tokens.size > 2)

    val featurizer = new Featurizer(config.numStopwords)
    val features = featurizer(tokenizedFilteredByLength)

    val model    = LexRank.build(features)
    val ranks    = model.score(config.cutoff, config.convergence)
    val excerpts = selectExcerpts(sentences, ranks, config.length)
 
    excerpts
      .map(_.productIterator.toList.mkString("\t"))
      .saveAsTextFile(config.outputPath)

    sc.stop()
  }
} 
Example 5
Source File: CustomLogger.scala    From hyperspark with Apache License 2.0
package util

import org.apache.spark.Logging

class CustomLogger extends Logging {
  protected var params: List[String] = List()
  protected def reformat(ps: List[String]) = {
    def produceBlanks(N: Int) = {
      if(N==0) ""
      else
        (for(i<-1 to N) yield " ").reduceLeft(_ concat _).concat("\t")
    }
    def fixsize(str: String) = {
      str.concat(produceBlanks(15-str.size))
    }
    ps.map { x => fixsize(x) }
  }
  def setFormat(parameters: List[String]) {
    params = parameters
    params = reformat(params)
  }
  def getFormatString(): String = {
    val toprint = params.reduceLeft(_ concat _).concat("\n")
    toprint
  }
  def printInfo(msg: String) = {
    print(msg)
    logInfo(msg)
  }
  def printFormat() = { printInfo(getFormatString()) }
  def getValuesString(values: List[Any]): String = {
    reformat(values.map { x => x.toString() }).reduceLeft(_ concat _).concat("\n")
  }
  def printValues(values: List[Any]) = { printInfo(getValuesString(values)) }
}
object CustomLogger {
  def apply() = new CustomLogger()
  def apply(parameters: List[String]): CustomLogger = {
    // setFormat returns Unit, so return the configured instance explicitly.
    val logger = new CustomLogger()
    logger.setFormat(parameters)
    logger
  }
} 
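
A hypothetical usage sketch for CustomLogger (the column names and values are made up): set a fixed-width format once, print the header row, then print aligned value rows; each call writes to stdout and mirrors the line to the Spark log via logInfo.

val logger = CustomLogger()
logger.setFormat(List("iteration", "makespan", "elapsed"))
logger.printFormat()
logger.printValues(List(1, 4231, 118))
logger.printValues(List(2, 4180, 236))
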
Example 6
Source File: EventTransformer.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume

import java.io.{ObjectOutput, ObjectInput}

import scala.collection.JavaConversions._

import org.apache.spark.util.Utils
import org.apache.spark.Logging


private[streaming] object EventTransformer extends Logging {
  def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence],
    Array[Byte]) = {
    val bodyLength = in.readInt()
    val bodyBuff = new Array[Byte](bodyLength)
    in.readFully(bodyBuff)

    val numHeaders = in.readInt()
    val headers = new java.util.HashMap[CharSequence, CharSequence]

    for (i <- 0 until numHeaders) {
      val keyLength = in.readInt()
      val keyBuff = new Array[Byte](keyLength)
      in.readFully(keyBuff)
      val key: String = Utils.deserialize(keyBuff)

      val valLength = in.readInt()
      val valBuff = new Array[Byte](valLength)
      in.readFully(valBuff)
      val value: String = Utils.deserialize(valBuff)

      headers.put(key, value)
    }
    (headers, bodyBuff)
  }

  def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence],
    body: Array[Byte]) {
    out.writeInt(body.length)
    out.write(body)
    val numHeaders = headers.size()
    out.writeInt(numHeaders)
    for ((k, v) <- headers) {
      val keyBuff = Utils.serialize(k.toString)
      out.writeInt(keyBuff.length)
      out.write(keyBuff)
      val valBuff = Utils.serialize(v.toString)
      out.writeInt(valBuff.length)
      out.write(valBuff)
    }
  }
} 
Example 7
Source File: FlumePollingStreamSuite.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume

import java.net.InetSocketAddress

import scala.collection.JavaConversions._
import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
import org.apache.spark.util.{ManualClock, Utils}

  private def testMultipleTimes(test: () => Unit): Unit = {
    var testPassed = false
    var attempt = 0
    while (!testPassed && attempt < maxAttempts) {
      try {
        test()
        testPassed = true
      } catch {
        case e: Exception if Utils.isBindCollision(e) =>
          logWarning("Exception when running flume polling test: " + e)
          attempt += 1
      }
    }
    assert(testPassed, s"Test failed after $attempt attempts!")
  }

  private def testFlumePolling(): Unit = {
    try {
      val port = utils.startSingleSink()

      writeAndVerify(Seq(port))
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  private def testFlumePollingMultipleHost(): Unit = {
    try {
      val ports = utils.startMultipleSinks()
      writeAndVerify(ports)
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
    // Set up the streaming context and input streams
    val ssc = new StreamingContext(conf, batchDuration)
    val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
        utils.eventsPerBatch, 5)
    val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]]
      with SynchronizedBuffer[Seq[SparkFlumeEvent]]
    val outputStream = new TestOutputStream(flumeStream, outputBuffer)
    outputStream.register()

    ssc.start()
    try {
      utils.sendDatAndEnsureAllDataHasBeenReceived()
      val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
      clock.advance(batchDuration.milliseconds)

      // The eventually is required to ensure that all data in the batch has been processed.
      eventually(timeout(10 seconds), interval(100 milliseconds)) {
        val flattenOutputBuffer = outputBuffer.flatten
        val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
          case kv => (kv._1.toString, kv._2.toString)
        }).map(mapAsJavaMap)
        val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8))
        utils.assertOutput(headers, bodies)
      }
    } finally {
      ssc.stop()
    }
  }

} 
Example 8
Source File: FlumeStreamSuite.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.flume

import scala.collection.JavaConversions._
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets
import org.jboss.netty.channel.ChannelPipeline
import org.jboss.netty.channel.socket.SocketChannel
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
} 
Example 9
Source File: MQTTTestUtils.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.mqtt

import java.net.{ServerSocket, URI}

import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.apache.activemq.broker.{BrokerService, TransportConnector}
import org.apache.commons.lang3.RandomUtils
import org.eclipse.paho.client.mqttv3._
import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence

import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SparkConf}


private[mqtt] class MQTTTestUtils extends Logging {

  private val persistenceDir = Utils.createTempDir()
  private val brokerHost = "localhost"
  private val brokerPort = findFreePort()

  private var broker: BrokerService = _
  private var connector: TransportConnector = _

  def brokerUri: String = {
    s"$brokerHost:$brokerPort"
  }

  def setup(): Unit = {
    broker = new BrokerService()
    broker.setDataDirectoryFile(Utils.createTempDir())
    connector = new TransportConnector()
    connector.setName("mqtt")
    connector.setUri(new URI("mqtt://" + brokerUri))
    broker.addConnector(connector)
    broker.start()
  }

  def teardown(): Unit = {
    if (broker != null) {
      broker.stop()
      broker = null
    }
    if (connector != null) {
      connector.stop()
      connector = null
    }
    Utils.deleteRecursively(persistenceDir)
  }

  private def findFreePort(): Int = {
    val candidatePort = RandomUtils.nextInt(1024, 65536)
    Utils.startServiceOnPort(candidatePort, (trialPort: Int) => {
      val socket = new ServerSocket(trialPort)
      socket.close()
      (null, trialPort)
    }, new SparkConf())._2
  }

  def publishData(topic: String, data: String): Unit = {
    var client: MqttClient = null
    try {
      val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath)
      client = new MqttClient("tcp://" + brokerUri, MqttClient.generateClientId(), persistence)
      client.connect()
      if (client.isConnected) {
        val msgTopic = client.getTopic(topic)
        val message = new MqttMessage(data.getBytes(UTF_8))
        message.setQos(1)
        message.setRetained(true)

        for (i <- 0 to 10) {
          try {
            msgTopic.publish(message)
          } catch {
            case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT =>
              // wait for Spark streaming to consume something from the message queue
              Thread.sleep(50)
          }
        }
      }
    } finally {
      if (client != null) {
        client.disconnect()
        client.close()
        client = null
      }
    }
  }

} 
Example 10
Source File: FiltersSuite.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.client

import scala.collection.JavaConversions._

import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.serde.serdeConstants

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._


class FiltersSuite extends SparkFunSuite with Logging {
  private val shim = new Shim_v0_13

  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
  private val varCharCol = new FieldSchema()
  varCharCol.setName("varchar")
  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
  testTable.setPartCols(varCharCol :: Nil)
  // String filter
  filterTest("string filter",
    (a("stringcol", StringType) > Literal("test")) :: Nil,
    "stringcol > \"test\"")
  // String filter with reversed operands
  filterTest("string filter backwards",
    (Literal("test") > a("stringcol", StringType)) :: Nil,
    "\"test\" > stringcol")
  // Int filter
  filterTest("int filter",
    (a("intcol", IntegerType) === Literal(1)) :: Nil,
    "intcol = 1")
  // Int filter with reversed operands
  filterTest("int filter backwards",
    (Literal(1) === a("intcol", IntegerType)) :: Nil,
    "1 = intcol")

  filterTest("int and string filter",
    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
    "1 = intcol and \"a\" = strcol")

  filterTest("skip varchar",
    (Literal("") === a("varchar", StringType)) :: Nil,
    "")

  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
    test(name){
      val converted = shim.convertFilters(testTable, filters)
      if (converted != result) {
        fail(
          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
      }
    }
  }

  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
} 
Example 11
Source File: SparkSQLDriver.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, List => JList}

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 12
Source File: SparkSQLOperationManager.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}
import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}


private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {

    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
} 
Example 13
Source File: ThriftServerTab.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver.ui

import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv}
import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._
import org.apache.spark.ui.{SparkUI, SparkUITab}
import org.apache.spark.{SparkContext, Logging, SparkException}


private[thriftserver] class ThriftServerTab(sparkContext: SparkContext)
  extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging {

  override val name = "JDBC/ODBC Server"

  val parent = getSparkUI(sparkContext)
  val listener = HiveThriftServer2.listener

  attachPage(new ThriftServerPage(this))
  attachPage(new ThriftServerSessionPage(this))
  parent.attachTab(this)

  def detach() {
    getSparkUI(sparkContext).detachTab(this)
  }
}

private[thriftserver] object ThriftServerTab {
  def getSparkUI(sparkContext: SparkContext): SparkUI = {
    sparkContext.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
} 
Example 14
Source File: SparkSQLEnv.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils


  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
} 
Example 15
Source File: BoundAttribute.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors.attachTree
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types._


case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
  extends LeafExpression with NamedExpression {

  override def toString: String = s"input[$ordinal, $dataType]"

  // Use special getter for primitive types (for UnsafeRow)
  override def eval(input: InternalRow): Any = {
    if (input.isNullAt(ordinal)) {
      null
    } else {
      dataType match {
        case BooleanType => input.getBoolean(ordinal)
        case ByteType => input.getByte(ordinal)
        case ShortType => input.getShort(ordinal)
        case IntegerType | DateType => input.getInt(ordinal)
        case LongType | TimestampType => input.getLong(ordinal)
        case FloatType => input.getFloat(ordinal)
        case DoubleType => input.getDouble(ordinal)
        case StringType => input.getUTF8String(ordinal)
        case BinaryType => input.getBinary(ordinal)
        case CalendarIntervalType => input.getInterval(ordinal)
        case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale)
        case t: StructType => input.getStruct(ordinal, t.size)
        case _: ArrayType => input.getArray(ordinal)
        case _: MapType => input.getMap(ordinal)
        case _ => input.get(ordinal, dataType)
      }
    }
  }

  override def name: String = s"i[$ordinal]"

  override def toAttribute: Attribute = throw new UnsupportedOperationException

  override def qualifiers: Seq[String] = throw new UnsupportedOperationException

  override def exprId: ExprId = throw new UnsupportedOperationException

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val javaType = ctx.javaType(dataType)
    val value = ctx.getValue("i", dataType, ordinal.toString)
    s"""
      boolean ${ev.isNull} = i.isNullAt($ordinal);
      $javaType ${ev.primitive} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);
    """
  }
}

object BindReferences extends Logging {

  def bindReference[A <: Expression](
      expression: A,
      input: Seq[Attribute],
      allowFailures: Boolean = false): A = {
    expression.transform { case a: AttributeReference =>
      attachTree(a, "Binding attribute") {
        val ordinal = input.indexWhere(_.exprId == a.exprId)
        if (ordinal == -1) {
          if (allowFailures) {
            a
          } else {
            sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}")
          }
        } else {
          BoundReference(ordinal, a.dataType, a.nullable)
        }
      }
    }.asInstanceOf[A] // Kind of a hack, but safe.  TODO: Tighten return type when possible.
  }
} 
Example 16
Source File: package.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution

import scala.collection.mutable.HashSet

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.{Accumulator, AccumulatorParam, Logging}


    case class ColumnMetrics(
        elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty))
    val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0)

    val numColumns: Int = child.output.size
    val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

    def dumpStats(): Unit = {
      logDebug(s"== ${child.simpleString} ==")
      logDebug(s"Tuples output: ${tupleCount.value}")
      child.output.zip(columnStats).foreach { case(attr, metric) =>
        val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}")
        logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
      }
    }

    protected override def doExecute(): RDD[InternalRow] = {
      child.execute().mapPartitions { iter =>
        new Iterator[InternalRow] {
          def hasNext: Boolean = iter.hasNext
          def next(): InternalRow = {
            val currentRow = iter.next()
            tupleCount += 1
            var i = 0
            while (i < numColumns) {
              val value = currentRow.get(i, output(i).dataType)
              if (value != null) {
                columnStats(i).elementTypes += HashSet(value.getClass.getName)
              }
              i += 1
            }
            currentRow
          }
        }
      }
    }
  }
} 
Example 17
Source File: DriverRegistry.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import java.sql.{Driver, DriverManager}

import scala.collection.mutable

import org.apache.spark.Logging
import org.apache.spark.util.Utils


object DriverRegistry extends Logging {

  private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty

  def register(className: String): Unit = {
    val cls = Utils.getContextOrSparkClassLoader.loadClass(className)
    if (cls.getClassLoader == null) {
      logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required")
    } else if (wrapperMap.get(className).isDefined) {
      logTrace(s"Wrapper for $className already exists")
    } else {
      synchronized {
        if (wrapperMap.get(className).isEmpty) {
          val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver])
          DriverManager.registerDriver(wrapper)
          wrapperMap(className) = wrapper
          logTrace(s"Wrapper for $className registered")
        }
      }
    }
  }

  def getDriverClassName(url: String): String = DriverManager.getDriver(url) match {
    case wrapper: DriverWrapper => wrapper.wrapped.getClass.getCanonicalName
    case driver => driver.getClass.getCanonicalName
  }
} 
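
A hypothetical usage sketch for DriverRegistry (the PostgreSQL driver class and JDBC URL are assumptions, not part of the example): register a driver that was loaded by the context class loader so java.sql.DriverManager can see it, then resolve the effective driver class for a connection URL.

// Wraps the driver if it was not loaded by the bootstrap class loader.
DriverRegistry.register("org.postgresql.Driver")
// Returns the canonical class name of the driver DriverManager picks for this URL.
println(DriverRegistry.getDriverClassName("jdbc:postgresql://localhost:5432/testdb"))
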
Example 18
Source File: CompressibleColumnBuilder.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.sql.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType


private[sql] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  protected def isWorthCompressing(encoder: Encoder[T]) = {
    encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    var i = 0
    while (i < compressionEncoders.length) {
      compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
      i += 1
    }
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val typeId = nonNullBuffer.getInt()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = column type ID + null count + null positions
    val headerSize = 4 + 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(typeId)
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
} 
Example 19
Source File: ExecutorDelegationTokenUpdater.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf =
    SparkHadoopUtil.get.getConfBypassingFSCache(
      hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer =
    Executors.newSingleThreadScheduledExecutor(
      ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable =
    new Runnable {
      override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
    }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
        val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
        if (suffix > lastCredentialsFileSuffix) {
          logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
          val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
          lastCredentialsFileSuffix = suffix
          UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
          logInfo("Tokens updated from credentials file.")
        } else {
          // Check every hour to see if new credentials arrived.
          logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
            "tokens yet, will check again in an hour.")
          delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
          return
        }
      }
      val timeFromNowToRenewal =
        SparkHadoopUtil.get.getTimeFromNowToRenewal(
          sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        executorUpdaterRunnable.run()
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }

} 
Example 20
Source File: SocketInputDStream.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator

import scala.reflect.ClassTag

import java.io._
import java.net.{UnknownHostException, Socket}
import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    @transient ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  
  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
} 
Example 21
Source File: StreamingTab.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.ui

import org.apache.spark.{Logging, SparkException}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ui.{SparkUI, SparkUITab}

import StreamingTab._


private[spark] class StreamingTab(val ssc: StreamingContext)
  extends SparkUITab(getSparkUI(ssc), "streaming") with Logging {

  private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static"

  val parent = getSparkUI(ssc)
  val listener = ssc.progressListener

  ssc.addStreamingListener(listener)
  ssc.sc.addSparkListener(listener)
  attachPage(new StreamingPage(this))
  attachPage(new BatchPage(this))

  def attach() {
    getSparkUI(ssc).attachTab(this)
    getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
  }

  def detach() {
    getSparkUI(ssc).detachTab(this)
    getSparkUI(ssc).removeStaticHandler("/static/streaming")
  }
}

private object StreamingTab {
  def getSparkUI(ssc: StreamingContext): SparkUI = {
    ssc.sc.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
} 
Example 22
Source File: StreamingListenerBus.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.scheduler

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.Logging
import org.apache.spark.util.AsynchronousListenerBus


private[spark] class StreamingListenerBus
  extends AsynchronousListenerBus[StreamingListener, StreamingListenerEvent]("StreamingListenerBus")
  with Logging {

  private val logDroppedEvent = new AtomicBoolean(false)

  override def onPostEvent(listener: StreamingListener, event: StreamingListenerEvent): Unit = {
    event match {
      case receiverStarted: StreamingListenerReceiverStarted =>
        listener.onReceiverStarted(receiverStarted)
      case receiverError: StreamingListenerReceiverError =>
        listener.onReceiverError(receiverError)
      case receiverStopped: StreamingListenerReceiverStopped =>
        listener.onReceiverStopped(receiverStopped)
      case batchSubmitted: StreamingListenerBatchSubmitted =>
        listener.onBatchSubmitted(batchSubmitted)
      case batchStarted: StreamingListenerBatchStarted =>
        listener.onBatchStarted(batchStarted)
      case batchCompleted: StreamingListenerBatchCompleted =>
        listener.onBatchCompleted(batchCompleted)
      case _ =>
    }
  }

  override def onDropEvent(event: StreamingListenerEvent): Unit = {
    if (logDroppedEvent.compareAndSet(false, true)) {
      // Only log the following message once to avoid duplicated annoying logs.
      logError("Dropping StreamingListenerEvent because no remaining room in event queue. " +
        "This likely means one of the StreamingListeners is too slow and cannot keep up with the " +
        "rate at which events are being started by the scheduler.")
    }
  }
} 
Example 23
Source File: RecurringTimer.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.Logging
import org.apache.spark.util.{Clock, SystemClock}

private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }

  @volatile private var prevTime = -1L
  @volatile private var nextTime = -1L
  @volatile private var stopped = false

  
  private def loop() {
    try {
      while (!stopped) {
        triggerActionForNextInterval()
      }
      triggerActionForNextInterval()
    } catch {
      case e: InterruptedException =>
    }
  }
}

private[streaming]
object RecurringTimer extends Logging {

  def main(args: Array[String]) {
    var lastRecurTime = 0L
    val period = 1000

    def onRecur(time: Long) {
      val currentTime = System.currentTimeMillis()
      logInfo("" + currentTime + ": " + (currentTime - lastRecurTime))
      lastRecurTime = currentTime
    }
    val timer = new  RecurringTimer(new SystemClock(), period, onRecur, "Test")
    timer.start()
    Thread.sleep(30 * 1000)
    timer.stop(true)
  }
} 
Example 24
Source File: RawTextSender.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
} 
Example 25
Source File: FileBasedWriteAheadLogReader.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import java.io.{Closeable, EOFException}
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.spark.Logging


private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration)
  extends Iterator[ByteBuffer] with Closeable with Logging {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = false
  // None is an object rather than a class: use None when there is no value, and wrap a value in Some; both are subclasses of Option
  private var nextItem: Option[ByteBuffer] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      // If the reader is already closed, there is no next item
      return false
    }
  
    if (nextItem.isDefined) { // handle the case where hasNext is called without calling next
      true
    } else {
      try {
        // Read the next item; if one exists, hasNext is indeed true
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(ByteBuffer.wrap(buffer))
        logTrace("Read next item " + nextItem.get)
        true
      } catch {
        case e: EOFException =>
          logDebug("Error reading next item, EOF reached", e)
          close()
          false
        case e: Exception =>
          logWarning("Error while trying to read data from HDFS.", e)
          close()
          throw e
      }
    }
  }

  override def next(): ByteBuffer = synchronized {
    val data = nextItem.getOrElse {
      close()
      throw new IllegalStateException(
        "next called without calling hasNext or after hasNext returned false")
    }
    nextItem = None // Ensure the next hasNext call loads new data.
    data
  }

  override def close(): Unit = synchronized {
    if (!closed) {
      instream.close()
    }
    closed = true
  }
} 
Example 26
Source File: RateLimitedOutputStream.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import scala.annotation.tailrec

import java.io.OutputStream
import java.util.concurrent.TimeUnit._

import org.apache.spark.Logging


private[streaming]
class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int)
  extends OutputStream
  with Logging {

  require(desiredBytesPerSec > 0)

  private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS)
  private val CHUNK_SIZE = 8192
  private var lastSyncTime = System.nanoTime
  private var bytesWrittenSinceSync = 0L

  override def write(b: Int) {
    waitToWrite(1)
    out.write(b)
  }

  override def write(bytes: Array[Byte]) {
    write(bytes, 0, bytes.length)
  }

  @tailrec
  override final def write(bytes: Array[Byte], offset: Int, length: Int) {
    val writeSize = math.min(length - offset, CHUNK_SIZE)
    if (writeSize > 0) {
      waitToWrite(writeSize)
      out.write(bytes, offset, writeSize)
      write(bytes, offset + writeSize, length)
    }
  }

  override def flush() {
    out.flush()
  }

  override def close() {
    out.close()
  }

  @tailrec
  private def waitToWrite(numBytes: Int) {
    val now = System.nanoTime
    val elapsedNanosecs = math.max(now - lastSyncTime, 1)
    val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs
    if (rate < desiredBytesPerSec) {
      // It's okay to write; just update some variables and return
      bytesWrittenSinceSync += numBytes
      if (now > lastSyncTime + SYNC_INTERVAL) {
        // Sync interval has passed; let's resync
        lastSyncTime = now
        bytesWrittenSinceSync = numBytes
      }
    } else {
      // Calculate how much time we should sleep to bring ourselves to the desired rate.
      val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec
      val elapsedTimeInMillis = elapsedNanosecs / 1000000
      val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis
      if (sleepTimeInMillis > 0) {
        logTrace("Natural rate is " + rate + " per second but desired rate is " +
          desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.")
        Thread.sleep(sleepTimeInMillis)
      }
      waitToWrite(numBytes)
    }
  }
} 
Example 27
Source File: FailureSuite.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.streaming

import java.io.File

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkFunSuite, Logging}
import org.apache.spark.util.Utils


class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  private val batchDuration: Duration = Milliseconds(1000)
  private val numBatches = 30
  private var directory: File = null

  before {
    directory = Utils.createTempDir()
  }

  after {
    if (directory != null) {
      // Delete the temp directory
      Utils.deleteRecursively(directory)
    }
    // Stop all active streaming contexts
    StreamingContext.getActive().foreach { _.stop() }
  }
  // Multiple failures with map
  test("multiple failures with map") {
    MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration)
  }
  // Multiple failures with updateStateByKey
  test("multiple failures with updateStateByKey") {
    MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration)
  }
} 
Example 28
Source File: EventLogDownloadResource.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }
      // Implement the StreamingOutput interface
      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          // Package the event logs with a ZipOutputStream
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }

        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
} 
Example 29
Source File: NettyBlockRpcServer.scala    From spark1.52 with Apache License 2.0
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConversions._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}


class NettyBlockRpcServer(
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      messageBytes: Array[Byte],
      responseContext: RpcResponseCallback): Unit = {
    // Decode the message
    val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes)
    logTrace(s"Received request: $message")

    message match {
      // Serves requests to download block data
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          // blockIds holds the requested BlockIds; fetch the data for each block
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) 
        val streamId = streamManager.registerStream(blocks.iterator)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)
        // Serves RPC requests to upload block data
      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) 
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        // Store the block locally with the given storage level
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(new Array[Byte](0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
} 
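
As a small illustration of the UploadBlock path above, the following sketch (assuming only Spark's JavaSerializer and StorageLevel) serializes a StorageLevel to bytes the way a client would build uploadBlock.metadata, and deserializes it back the way the server does; the chosen level is arbitrary.

import java.nio.ByteBuffer

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.storage.StorageLevel

object StorageLevelRoundTrip {
  def main(args: Array[String]): Unit = {
    val ser = new JavaSerializer(new SparkConf()).newInstance()
    // Client side: serialize the storage level into the metadata bytes of an upload request.
    val metadata: ByteBuffer = ser.serialize(StorageLevel.MEMORY_AND_DISK)
    // Server side: deserialize it again, as NettyBlockRpcServer does for uploadBlock.metadata.
    val level = ser.deserialize[StorageLevel](metadata)
    println(level.description)
  }
}
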
Example 30
Source File: MetricsConfig.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.metrics

import java.io.{FileInputStream, InputStream}
import java.util.Properties

import scala.collection.mutable
import scala.util.matching.Regex

import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SparkConf}

private[spark] class MetricsConfig(conf: SparkConf) extends Logging {

  private val DEFAULT_PREFIX = "*"
  private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r
  private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties"

  private[metrics] val properties = new Properties()
  private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null

  private def setDefaultProperties(prop: Properties) {
    prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet")
    prop.setProperty("*.sink.servlet.path", "/metrics/json")
    prop.setProperty("master.sink.servlet.path", "/metrics/master/json")
    prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json")
  }

  def initialize() {
    // Add default properties in case there's no properties file
    setDefaultProperties(properties)

    loadPropertiesFromFile(conf.getOption("spark.metrics.conf"))

    // Also look for the properties in provided Spark configuration
    val prefix = "spark.metrics.conf."
    conf.getAll.foreach {
      case (k, v) if k.startsWith(prefix) =>
        properties.setProperty(k.substring(prefix.length()), v)
      case _ =>
    }

    propertyCategories = subProperties(properties, INSTANCE_REGEX)
    if (propertyCategories.contains(DEFAULT_PREFIX)) {
      import scala.collection.JavaConversions._

      val defaultProperty = propertyCategories(DEFAULT_PREFIX)
      for { (inst, prop) <- propertyCategories
            if (inst != DEFAULT_PREFIX)
            (k, v) <- defaultProperty
            if (prop.getProperty(k) == null) } {
        prop.setProperty(k, v)
      }
    }
  }
  // Use the regex to split property keys of the form "<instance>.<rest>" and group them by instance in a HashMap
  def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = {
    val subProperties = new mutable.HashMap[String, Properties]
    import scala.collection.JavaConversions._
    prop.foreach { kv =>
      if (regex.findPrefixOf(kv._1).isDefined) {
        val regex(prefix, suffix) = kv._1
        subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2)
      }
    }
    subProperties
  }

  def getInstance(inst: String): Properties = {
    propertyCategories.get(inst) match {
      case Some(s) => s
      case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties)
    }
  }

  
  private[this] def loadPropertiesFromFile(path: Option[String]): Unit = {
    var is: InputStream = null
    try {
      is = path match {
        case Some(f) => new FileInputStream(f)
        case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME)
      }

      if (is != null) {
        properties.load(is)
      }
    } catch {
      case e: Exception =>
        val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME)
        logError(s"Error loading configuration file $file", e)
    } finally {
      if (is != null) {
        is.close()
      }
    }
  }

} 
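
A self-contained sketch of the grouping performed by subProperties above, using only the JDK Properties class and the same instance regex; the property names and values are illustrative.

import java.util.Properties

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.util.matching.Regex

object SubPropertiesDemo {
  // Mirrors MetricsConfig.subProperties: group "<instance>.<rest>" keys by instance.
  def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = {
    val out = new mutable.HashMap[String, Properties]
    prop.foreach { case (k, v) =>
      if (regex.findPrefixOf(k).isDefined) {
        val regex(prefix, suffix) = k
        out.getOrElseUpdate(prefix, new Properties).setProperty(suffix, v)
      }
    }
    out
  }

  def main(args: Array[String]): Unit = {
    val p = new Properties()
    p.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet")
    p.setProperty("master.sink.servlet.path", "/metrics/master/json")
    val grouped = subProperties(p, "^(\\*|[a-zA-Z]+)\\.(.+)".r)
    grouped.foreach { case (inst, props) => println(s"$inst -> $props") }
  }
}
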
Example 31
Source File: PythonGatewayServer.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.python

import java.io.DataOutputStream
import java.net.Socket

import py4j.GatewayServer

import org.apache.spark.Logging
import org.apache.spark.util.Utils


private[spark] object PythonGatewayServer extends Logging {
  def main(args: Array[String]): Unit = Utils.tryOrExit {
    // Start a GatewayServer on an ephemeral port
    val gatewayServer: GatewayServer = new GatewayServer(null, 0)
    gatewayServer.start()
    val boundPort: Int = gatewayServer.getListeningPort
    if (boundPort == -1) {
      logError("GatewayServer failed to bind; exiting")
      System.exit(1)
    } else {
      logDebug(s"Started PythonGatewayServer on port $boundPort")
    }

    // Communicate the bound port back to the caller via the caller-specified callback port
    // Note: System.getenv() reads OS environment variables (e.g. set in the user's shell profile),
    // while System.getProperties() reads JVM properties, typically set with the -D command-line option.
    val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST")
    val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt
    logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort")
    val callbackSocket = new Socket(callbackHost, callbackPort)
    val dos = new DataOutputStream(callbackSocket.getOutputStream)
    dos.writeInt(boundPort)
    dos.close()
    callbackSocket.close()

    // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies:
    while (System.in.read() != -1) {
      // Do nothing
    }
    logDebug("Exiting due to broken pipe from Python driver")
    System.exit(0)
  }
} 
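
The callback protocol above is simply "connect and write one int". The sketch below, using only java.net and java.io, plays the role of the Python driver's callback listener; the object name and port handling are illustrative.

import java.io.DataInputStream
import java.net.ServerSocket

object CallbackPortReader {
  def main(args: Array[String]): Unit = {
    // Listen on an ephemeral port, as the Python driver would on its callback port.
    val server = new ServerSocket(0)
    println(s"Callback port: ${server.getLocalPort}")
    val socket = server.accept()      // PythonGatewayServer connects here...
    val in = new DataInputStream(socket.getInputStream)
    val gatewayPort = in.readInt()    // ...and writes the GatewayServer port as a single int.
    println(s"GatewayServer is listening on port $gatewayPort")
    in.close(); socket.close(); server.close()
  }
}
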
Example 32
Source File: MesosExternalShuffleService.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.mesos

import java.net.SocketAddress

import scala.collection.mutable

import org.apache.spark.{Logging, SecurityManager, SparkConf}
import org.apache.spark.deploy.ExternalShuffleService
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler
import org.apache.spark.network.shuffle.protocol.BlockTransferMessage
import org.apache.spark.network.shuffle.protocol.mesos.RegisterDriver
import org.apache.spark.network.util.TransportConf


private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager)
  extends ExternalShuffleService(conf, securityManager) {

  protected override def newShuffleBlockHandler(
      conf: TransportConf): ExternalShuffleBlockHandler = {
    new MesosExternalShuffleBlockHandler(conf)
  }
}

private[spark] object MesosExternalShuffleService extends Logging {

  def main(args: Array[String]): Unit = {
    ExternalShuffleService.main(args,
      (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm))
  }
} 
Example 33
Source File: MesosClusterDispatcher.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.mesos

import java.util.concurrent.CountDownLatch

import org.apache.spark.deploy.mesos.ui.MesosClusterUI
import org.apache.spark.deploy.rest.mesos.MesosRestServer
import org.apache.spark.scheduler.cluster.mesos._
import org.apache.spark.util.SignalLogger
import org.apache.spark.{Logging, SecurityManager, SparkConf}


private[mesos] class MesosClusterDispatcher(
    args: MesosClusterDispatcherArguments,
    conf: SparkConf)
  extends Logging {
  // Public DNS name used by the Spark master and workers (defaults to empty)
  private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host)
  private val recoveryMode = conf.get("spark.mesos.deploy.recoveryMode", "NONE").toUpperCase()
  logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode)

  private val engineFactory = recoveryMode match {
    case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory
    case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf)
    case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode)
  }

  private val scheduler = new MesosClusterScheduler(engineFactory, conf)

  private val server = new MesosRestServer(args.host, args.port, conf, scheduler)
  private val webUi = new MesosClusterUI(
    new SecurityManager(conf),
    args.webUiPort,
    conf,
    publicAddress,
    scheduler)

  private val shutdownLatch = new CountDownLatch(1)

  def start(): Unit = {
    webUi.bind()
    scheduler.frameworkUrl = webUi.activeWebUiUrl
    scheduler.start()
    server.start()
  }

  def awaitShutdown(): Unit = {
    shutdownLatch.await()
  }

  def stop(): Unit = {
    webUi.stop()
    server.stop()
    scheduler.stop()
    shutdownLatch.countDown()
  }
}

private[mesos] object MesosClusterDispatcher extends Logging {
  def main(args: Array[String]) {
    SignalLogger.register(log)
    val conf = new SparkConf
    val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf)
    conf.setMaster(dispatcherArgs.masterUrl)
    conf.setAppName(dispatcherArgs.name)
    dispatcherArgs.zookeeperUrl.foreach { z =>
      conf.set("spark.mesos.deploy.recoveryMode", "ZOOKEEPER")
      conf.set("spark.mesos.deploy.zookeeper.url", z)
    }
    val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf)
    dispatcher.start()
    val shutdownHook = new Thread() {
      override def run() {
        logInfo("Shutdown hook is shutting down dispatcher")
        dispatcher.stop()
        dispatcher.awaitShutdown()
      }
    }
    Runtime.getRuntime.addShutdownHook(shutdownHook)
    dispatcher.awaitShutdown()
  }
} 
Example 34
Source File: SparkCuratorUtil.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.{Logging, SparkConf}
import org.apache.zookeeper.KeeperException

import scala.collection.JavaConversions._

private[spark] object SparkCuratorUtil extends Logging {

  private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000
  private val ZK_SESSION_TIMEOUT_MILLIS = 60000
  private val RETRY_WAIT_MILLIS = 5000
  private val MAX_RECONNECT_ATTEMPTS = 3

  def newClient(
      conf: SparkConf,
      // URL of the ZooKeeper ensemble
      zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = {
    val ZK_URL = conf.get(zkUrlConf)
    val zk = CuratorFrameworkFactory.newClient(ZK_URL,
      ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS,
      new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS))
    zk.start()
    zk
  }

  def mkdir(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) == null) {
      try {
        zk.create().creatingParentsIfNeeded().forPath(path)
      } catch {
        case nodeExist: KeeperException.NodeExistsException =>
        // do nothing, ignore node existing exception.
        case e: Exception => throw e
      }
    }
  }

  
  def deleteRecursive(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) != null) {
      for (child <- zk.getChildren.forPath(path)) {
        zk.delete().forPath(path + "/" + child)
      }
      zk.delete().forPath(path)
    }
  }
} 
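
A minimal usage sketch, assuming a ZooKeeper ensemble reachable at localhost:2181; since SparkCuratorUtil is private[spark], the sketch is placed in the same org.apache.spark subpackage, and the znode path is illustrative.

package org.apache.spark.deploy

import org.apache.spark.SparkConf

object SparkCuratorUtilDemo {
  def main(args: Array[String]): Unit = {
    // Point the utility at a ZooKeeper ensemble (adjust the URL for your environment).
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "localhost:2181")
    val zk = SparkCuratorUtil.newClient(conf)
    try {
      SparkCuratorUtil.mkdir(zk, "/spark/demo")           // create the path if it does not exist
      SparkCuratorUtil.deleteRecursive(zk, "/spark/demo")  // remove it and any children
    } finally {
      zk.close()
    }
  }
}
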
Example 35
Source File: TestClient.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.client

import org.apache.spark.rpc.RpcEnv
import org.apache.spark.{SecurityManager, SparkConf, Logging}
import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.util.Utils

private[spark] object TestClient {

  private class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = if(args.isEmpty) "127.0.0.1" else args(0)
    
    val conf = new SparkConf
    val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf))
    val executorClassnamea = TestExecutor.getClass.getCanonicalName
    println("====executorClassname======"+executorClassnamea)
    // stripSuffix returns this string with the given suffix removed; if the string does not end with the suffix, it is returned unchanged
    val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$")
    println("====executorClassname======"+executorClassname)
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf)
    client.start()
    rpcEnv.awaitTermination()
  }
} 
Example 36
Source File: FileSystemPersistenceEngine.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.io._

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer}
import org.apache.spark.util.Utils

private[master] class FileSystemPersistenceEngine(
    val dir: String,
    val serializer: Serializer)
  extends PersistenceEngine with Logging {

  new File(dir).mkdir()

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(new File(dir + File.separator + name), obj)
  }

  override def unpersist(name: String): Unit = {
    new File(dir + File.separator + name).delete()
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix))
    files.map(deserializeFromFile[T])
  }

  private def serializeIntoFile(file: File, value: AnyRef) {
    val created = file.createNewFile()
    if (!created) { throw new IllegalStateException("Could not create file: " + file) }
    val fileOut = new FileOutputStream(file)
    var out: SerializationStream = null
    Utils.tryWithSafeFinally {
      out = serializer.newInstance().serializeStream(fileOut)
      out.writeObject(value)
    } {
      fileOut.close()
      if (out != null) {
        out.close()
      }
    }
  }

  private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
    val fileIn = new FileInputStream(file)
    var in: DeserializationStream = null
    try {
      in = serializer.newInstance().deserializeStream(fileIn)
      in.readObject[T]()
    } finally {
      fileIn.close()
      if (in != null) {
        in.close()
      }
    }
  }

} 
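
A persist/read round-trip sketch, assuming Spark's JavaSerializer; the engine is private[master], so the sketch sits in the same package, and the temporary directory and key names are illustrative.

package org.apache.spark.deploy.master

import java.nio.file.Files

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

object FileSystemPersistenceDemo {
  def main(args: Array[String]): Unit = {
    val dir = Files.createTempDirectory("recovery").toString
    val engine = new FileSystemPersistenceEngine(dir, new JavaSerializer(new SparkConf()))

    engine.persist("item_1", "hello")                 // one file per persisted object
    val restored: Seq[String] = engine.read[String]("item_")
    println(restored)                                 // a Seq containing "hello"
    engine.unpersist("item_1")
  }
}
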
Example 37
Source File: RecoveryModeFactory.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer


private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {
  // Directory in which Spark stores recovery state
  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
} 
Example 38
Source File: MasterWebUI.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master.ui

import org.apache.spark.Logging
import org.apache.spark.deploy.master.Master
import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationsListResource, ApplicationInfo,
  UIRoot}
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._


  def detachSparkUI(ui: SparkUI) {
    assert(serverInfo.isDefined, "Master UI must be bound to a server before detaching SparkUIs")
    ui.getHandlers.foreach(detachHandler)
  }

  def getApplicationInfoList: Iterator[ApplicationInfo] = {
    val state = masterPage.getMasterState
    val activeApps = state.activeApps.sortBy(_.startTime).reverse
    val completedApps = state.completedApps.sortBy(_.endTime).reverse
    activeApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, false) } ++
      completedApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, true) }
  }

  def getSparkUI(appId: String): Option[SparkUI] = {
    val state = masterPage.getMasterState
    val activeApps = state.activeApps.sortBy(_.startTime).reverse
    val completedApps = state.completedApps.sortBy(_.endTime).reverse
    (activeApps ++ completedApps).find { _.id == appId }.flatMap {
      master.rebuildSparkUI
    }
  }
}

private[master] object MasterWebUI {
  private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
} 
Example 39
Source File: ZooKeeperLeaderElectionAgent.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.{ Logging, SparkConf }
import org.apache.curator.framework.CuratorFramework
import org.apache.curator.framework.recipes.leader.{ LeaderLatchListener, LeaderLatch }
import org.apache.spark.deploy.SparkCuratorUtil

private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable,
    conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging {
  // ZooKeeper directory for recovery state, defaults to /spark
  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"

  private var zk: CuratorFramework = _
  private var leaderLatch: LeaderLatch = _
  private var status = LeadershipStatus.NOT_LEADER

  start()

  private def start() {
    logInfo("Starting ZooKeeper LeaderElection agent")
    zk = SparkCuratorUtil.newClient(conf)
    leaderLatch = new LeaderLatch(zk, WORKING_DIR)
    leaderLatch.addListener(this) // this agent implements LeaderLatchListener
    leaderLatch.start() // start competing for leadership
  }

  override def stop() {
    leaderLatch.close()
    zk.close()
  }

  override def isLeader() {
    synchronized {
      // could have lost leadership by now; re-check before acting since the state may have changed again
      if (!leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have gained leadership")
      // we have been elected leader
      updateLeadershipStatus(true)
    }
  }

  override def notLeader() {
    synchronized {
      // could have gained leadership by now; re-check before acting since the state may have changed again
      if (leaderLatch.hasLeadership) {
        return
      }
      // leadership has been revoked
      logInfo("We have lost leadership")
      updateLeadershipStatus(false)
    }
  }

  private def updateLeadershipStatus(isLeader: Boolean) {
    if (isLeader && status == LeadershipStatus.NOT_LEADER) {
      status = LeadershipStatus.LEADER
      masterInstance.electedLeader() // notify the master that it has been elected leader
    } else if (!isLeader && status == LeadershipStatus.LEADER) {
      status = LeadershipStatus.NOT_LEADER
      masterInstance.revokedLeadership() // notify the master that its leadership has been revoked
    }
  }

  private object LeadershipStatus extends Enumeration {
    type LeadershipStatus = Value
    val LEADER, NOT_LEADER = Value
  }
} 
Example 40
Source File: ZooKeeperPersistenceEngine.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.serializer.Serializer


private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine
  with Logging {
  // ZooKeeper directory for recovery state, defaults to /spark
  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)


  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val file = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix))
    file.map(deserializeFromFile[T]).flatten
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception => {
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
      }
    }
  }
} 
Example 41
Source File: WorkerWebUI.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker.ui

import java.io.File
import javax.servlet.http.HttpServletRequest

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.deploy.worker.ui.WorkerWebUI._
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._
import org.apache.spark.util.RpcUtils


  def initialize() {
    val logPage = new LogPage(this)
    attachPage(logPage)
    attachPage(new WorkerPage(this))
    attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static"))
    attachHandler(createServletHandler("/log",
      (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr))
  }
}

private[worker] object WorkerWebUI {
  val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR
  val DEFAULT_RETAINED_DRIVERS = 1000
  val DEFAULT_RETAINED_EXECUTORS = 1000
} 
Example 42
Source File: HistoryServerArguments.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.history

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.Utils


private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String])
  extends Logging {
  private var propertiesFile: String = null

  parse(args.toList)

  private def parse(args: List[String]): Unit = {
    args match {
      case ("--dir" | "-d") :: value :: tail =>
        logWarning("Setting log directory through the command line is deprecated as of " +
          "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.")
        conf.set("spark.history.fs.logDirectory", value)
        System.setProperty("spark.history.fs.logDirectory", value)
        parse(tail)

      case ("--help" | "-h") :: tail =>
        printUsageAndExit(0)

      case ("--properties-file") :: value :: tail =>
        propertiesFile = value
        parse(tail)
      // Nil is the empty List; :: prepends an element, producing a new list
      case Nil =>

      case _ =>
        printUsageAndExit(1)
    }
  }

  // This mutates the SparkConf, so all accesses to it must be made after this line
  Utils.loadDefaultSparkProperties(conf, propertiesFile)

  private def printUsageAndExit(exitCode: Int) {
    // scalastyle:off println
    System.err.println(
      """
      |Usage: HistoryServer [options]
      |
      |Options:
      |  --properties-file FILE      Path to a custom Spark properties file.
      |                              Default is conf/spark-defaults.conf.
      |
      |Configuration options can be set by setting the corresponding JVM system property.
      |History Server options are always available; additional options depend on the provider.
      |
      |History Server options:
      |
      |  spark.history.ui.port              Port where server will listen for connections
      |                                     (default 18080)
      |  spark.history.acls.enable          Whether to enable view acls for all applications
      |                                     (default false)
      |  spark.history.provider             Name of history provider class (defaults to
      |                                     file system-based provider)
      |  spark.history.retainedApplications Max number of application UIs to keep loaded in memory
      |                                     (default 50)
      |FsHistoryProvider options:
      |
      |  spark.history.fs.logDirectory      Directory where app logs are stored
      |                                     (default: file:/tmp/spark-events)
      |  spark.history.fs.updateInterval    How often to reload log data from storage
      |                                     (in seconds, default: 10)
      |""".stripMargin)
    // scalastyle:on println
    System.exit(exitCode)
  }

} 
Example 43
Source File: SimrSchedulerBackend.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
    // hostname or IP address of the machine running the driver
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: "  + driverFilePath)
    logInfo("Writing Akka address: "  + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }

} 
Example 44
Source File: MesosTaskLaunchData.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

import org.apache.mesos.protobuf.ByteString

import org.apache.spark.Logging


private[spark] case class MesosTaskLaunchData(
  serializedTask: ByteBuffer,
  attemptNumber: Int) extends Logging {

  def toByteString: ByteString = {
    // A ByteBuffer must be allocated before it can be written; the static allocate() method does that
    val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
    dataBuffer.putInt(attemptNumber)
    dataBuffer.put(serializedTask)
    dataBuffer.rewind
    logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
    ByteString.copyFrom(dataBuffer)
  }
}

private[spark] object MesosTaskLaunchData extends Logging {
  def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
    val byteBuffer = byteString.asReadOnlyByteBuffer()
    logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
    val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
    val serializedTask = byteBuffer.slice() // subsequence starting at the current position
    MesosTaskLaunchData(serializedTask, attemptNumber)
  }
} 
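
The wire layout used above is a 4-byte attempt number followed by the task bytes. The sketch below reproduces that framing with plain java.nio.ByteBuffer only; the payload is a made-up string.

import java.nio.ByteBuffer

object TaskLaunchDataLayout {
  def main(args: Array[String]): Unit = {
    val task = ByteBuffer.wrap("serialized-task".getBytes("UTF-8"))
    val attemptNumber = 3

    // Encode: 4-byte attempt number followed by the task bytes (same layout as toByteString).
    val encoded = ByteBuffer.allocate(4 + task.limit)
    encoded.putInt(attemptNumber)
    encoded.put(task)
    encoded.rewind()

    // Decode: read the int, then slice the remainder (same as fromByteString).
    val decodedAttempt = encoded.getInt
    val decodedTask = encoded.slice()
    println(s"attempt=$decodedAttempt, taskBytes=${decodedTask.remaining}")
  }
}
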
Example 45
Source File: ReplayListenerBus.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io.{InputStream, IOException}

import scala.io.Source

import com.fasterxml.jackson.core.JsonParseException
import org.json4s.jackson.JsonMethods._

import org.apache.spark.Logging
import org.apache.spark.util.JsonProtocol


  def replay(
      logData: InputStream,
      sourceName: String,
      maybeTruncated: Boolean = false): Unit = {
    var currentLine: String = null
    var lineNumber: Int = 1
    try {
      val lines = Source.fromInputStream(logData).getLines()
      while (lines.hasNext) {
        currentLine = lines.next()
        try {
          postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine)))
        } catch {
          case jpe: JsonParseException =>
            // We can only ignore an exception from the last line of the file, which might be truncated
            if (!maybeTruncated || lines.hasNext) {
              throw jpe
            } else {
              logWarning(s"Got JsonParseException from log file $sourceName" +
                s" at line $lineNumber, the file might not have finished writing cleanly.")
            }
        }
        lineNumber += 1
      }
    } catch {
      case ioe: IOException =>
        throw ioe
      case e: Exception =>
        logError(s"Exception parsing Spark event log: $sourceName", e)
        logError(s"Malformed line #$lineNumber: $currentLine\n")
    }
  }

} 
Example 46
Source File: SparkUncaughtExceptionHandler.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import org.apache.spark.Logging


private[spark] object SparkUncaughtExceptionHandler
  extends Thread.UncaughtExceptionHandler with Logging {

  override def uncaughtException(thread: Thread, exception: Throwable) {
    try {
      logError("Uncaught exception in thread " + thread, exception)

      // We may have been called from a shutdown hook. If so, we must not call System.exit().
      // (If we do, we will deadlock.)
      if (!ShutdownHookManager.inShutdown()) {
        if (exception.isInstanceOf[OutOfMemoryError]) {
          System.exit(SparkExitCode.OOM)
        } else {
          System.exit(SparkExitCode.UNCAUGHT_EXCEPTION)
        }
      }
    } catch {
      case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM)
      case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE)
    }
  }

  def uncaughtException(exception: Throwable) {
    uncaughtException(Thread.currentThread(), exception)
  }
} 
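
A minimal sketch of installing the handler process-wide, the way Spark's daemons and executors do at startup; since the object is private[spark], the sketch lives under org.apache.spark.util, and the thrown exception is only for demonstration (it will terminate the JVM).

package org.apache.spark.util

object UncaughtHandlerDemo {
  def main(args: Array[String]): Unit = {
    // Install the handler for all threads before any work is started.
    Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)

    // Any uncaught exception is now logged and mapped to a Spark exit code.
    new Thread(new Runnable {
      override def run(): Unit = throw new RuntimeException("boom")
    }).start()
  }
}
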
Example 47
Source File: BlockManagerSlaveEndpoint.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.concurrent.{ ExecutionContext, Future }

import org.apache.spark.rpc.{ RpcEnv, RpcCallContext, RpcEndpoint }
import org.apache.spark.util.ThreadUtils
import org.apache.spark.{ Logging, MapOutputTracker, SparkEnv }
import org.apache.spark.storage.BlockManagerMessages._


private[storage] class BlockManagerSlaveEndpoint(
  override val rpcEnv: RpcEnv,
  blockManager: BlockManager, // the local BlockManager that carries out the removal requests sent by the master
  mapOutputTracker: MapOutputTracker)
    extends RpcEndpoint with Logging {

  private val asyncThreadPool =
    ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool")
  private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool)

  // Operations that involve removing blocks may be slow and should be done asynchronously
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    // Remove the given block from this executor
    case RemoveBlock(blockId) =>
      doAsync[Boolean]("removing block " + blockId, context) {
        blockManager.removeBlock(blockId)
        true
      }     
    // On RemoveRdd from the BlockManagerMasterEndpoint, remove all blocks of the given RDD on this executor
    case RemoveRdd(rddId) =>
      doAsync[Int]("removing RDD " + rddId, context) {
        blockManager.removeRdd(rddId)
      }
    // Remove all blocks on this executor that belong to the given shuffle
    case RemoveShuffle(shuffleId) =>
      doAsync[Boolean]("removing shuffle " + shuffleId, context) {
        if (mapOutputTracker != null) {
          mapOutputTracker.unregisterShuffle(shuffleId)
        }
        SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
      }
    // Remove all blocks on this executor that belong to the given broadcast variable
    case RemoveBroadcast(broadcastId, _) =>
      doAsync[Int]("removing broadcast " + broadcastId, context) {
        // tellMaster: whether to report the updated status back to the master
        blockManager.removeBroadcast(broadcastId, tellMaster = true)
      }
    // Reply with the status of the given block
    case GetBlockStatus(blockId, _) =>
      context.reply(blockManager.getStatus(blockId))
    // Reply with the block ids that match the given filter
    case GetMatchingBlockIds(filter, _) =>
      context.reply(blockManager.getMatchingBlockIds(filter))
  }
  // Curried helper: run the body asynchronously and reply through the RPC call context
  private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) {
    val future = Future {
      logDebug(actionMessage)
      body
    }
    future.onSuccess {
      case response =>
        logDebug("Done " + actionMessage + ", response is " + response)
        context.reply(response)
        logDebug("Sent response: " + response + " to " + context.sender)
    }
    future.onFailure {
      case t: Throwable =>
        logError("Error in " + actionMessage, t)
        context.sendFailure(t)
    }
  }

  override def onStop(): Unit = {
    asyncThreadPool.shutdownNow()
  }
} 
Example 48
Source File: LocalRDDCheckpointData.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext}
import org.apache.spark.storage.{RDDBlockId, StorageLevel}
import org.apache.spark.util.Utils


  def transformStorageLevel(level: StorageLevel): StorageLevel = {
    // If this RDD is to be cached off-heap, fail fast since we cannot provide any
    // correctness guarantees about subsequent computations after the first one
    if (level.useOffHeap) {
      throw new SparkException("Local checkpointing is not compatible with off-heap caching.")
    }

    StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication)
  }
} 
Example 49
Source File: TestClient.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.client

import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SecurityManager, SparkConf}

private[spark] object TestClient {

  private class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = if (args.isEmpty) "127.0.0.1" else args(0)
    
    val conf = new SparkConf
    val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf))
    // stripSuffix removes the trailing "$" that Scala appends to the object's class name
    val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$")
    println("====executorClassname======"+executorClassname)
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf)
    client.start()
    rpcEnv.awaitTermination()
  }
} 
Example 50
Source File: HBasePartition.scala    From Spark-SQL-on-HBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.regionserver.RegionScanner
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._
import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range}
import org.apache.spark.{Logging, Partition}


private[hbase] class HBasePartition(
                                     val idx: Int, val mappedIndex: Int,
                                     start: Option[HBaseRawType] = None,
                                     end: Option[HBaseRawType] = None,
                                     val server: Option[String] = None,
                                     val filterPredicates: Option[Expression] = None,
                                     @transient relation: HBaseRelation = null,
                                     @transient val newScanner:RegionScanner = null)
  extends Range[HBaseRawType](start, true, end, false, HBaseBytesType)
  with Partition with IndexMappable with Logging {

  override def index: Int = idx

  override def hashCode(): Int = idx

  @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start)

  @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end)

  
  def computePredicate(relation: HBaseRelation): Option[Expression] = {
    val predicate = if (filterPredicates.isDefined &&
      filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) {
      val oriPredicate = filterPredicates.get
      val predicateReferences = oriPredicate.references.toSeq
      val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences)
      val row = new GenericMutableRow(predicateReferences.size)
      var rowIndex = 0
      var i = 0
      var range: Range[_] = null
      while (i < relation.keyColumns.size) {
        range = relation.generateRange(this, oriPredicate, i)
        if (range != null) {
          rowIndex = relation.rowIndex(predicateReferences, i)
          if (rowIndex >= 0) row.update(rowIndex, range)
          // if a non-last key dimension's range is not a single point, do not proceed to the next dimensions
          if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size
          else i = i + 1
        } else i = relation.keyColumns.size
      }
      val pr = boundReference.partialReduce(row, predicateReferences)
      pr match {
        case (null, e: Expression) => Some(e)
        case (true, _) => None
        case (false, _) => Some(Literal(false))
      }
    } else filterPredicates
    logInfo(predicate.toString)
    predicate
  }

  override def toString = {
    s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates"
  }
} 
Example 51
Source File: BytesUtilsSuite.scala    From Spark-SQL-on-HBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Logging
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.hbase.util.BinaryBytesUtils
import org.apache.spark.sql.types._
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging {
  test("Bytes Ordering Test") {
    val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1,
      0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257)
    val result = s.map(i => (i, BinaryBytesUtils.create(IntegerType).toBytes(i)))
      .sortWith((f, s) =>
      HBaseBytesType.ordering.gt(
        f._2.asInstanceOf[HBaseBytesType.InternalType],
        s._2.asInstanceOf[HBaseBytesType.InternalType]))
    assert(result.map(a => a._1) == s.sorted.reverse)
  }

  def compare(a: Array[Byte], b: Array[Byte]): Int = {
    val length = Math.min(a.length, b.length)
    var result: Int = 0
    for (i <- 0 to length - 1) {
      val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte]
      if (diff != 0) {
        result = diff
      }
    }
    result
  }

  test("Bytes Utility Test") {
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = true), 0) === true)
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = false), 0) === false)

    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(12.34d), 0)
      === 12.34d)
    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(-12.34d), 0)
      === -12.34d)

    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(12.34f), 0)
      === 12.34f)
    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(-12.34f), 0)
      === -12.34f)

    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(12), 0)
      === 12)
    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(-12), 0)
      === -12)

    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(1234l), 0)
      === 1234l)
    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(-1234l), 0)
      === -1234l)

    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(12.asInstanceOf[Short]), 0) === 12)
    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(-12.asInstanceOf[Short]), 0) === -12)

    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes("abc"), 0, 3)
      === UTF8String("abc"))
    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String(""))

    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(5.asInstanceOf[Byte]), 0) === 5)
    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(-5.asInstanceOf[Byte]), 0) === -5)

    assert(compare(BinaryBytesUtils.create(IntegerType).toBytes(128),
      BinaryBytesUtils.create(IntegerType).toBytes(-128)) > 0)
  }

  test("byte array plus one") {
    var byteArray =  Array[Byte](0x01.toByte, 127.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray),  Array[Byte](0x01.toByte, 0x80.toByte)) == 0)

    byteArray =  Array[Byte](0xff.toByte, 0xff.toByte)
    assert(BinaryBytesUtils.addOne(byteArray) == null)

    byteArray =  Array[Byte](0x02.toByte, 0xff.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray),  Array[Byte](0x03.toByte, 0x00.toByte)) == 0)
  }

  test("float comparison") {
    val f1 = BinaryBytesUtils.create(FloatType).toBytes(-1.23f)
    val f2 = BinaryBytesUtils.create(FloatType).toBytes(100f)
    assert(Bytes.compareTo(f1, f2) < 0)
  }
} 
Example 52
Source File: MeetupReceiver.scala    From meetup-stream   with Apache License 2.0 5 votes vote down vote up
package receiver

import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.storage.StorageLevel
import org.apache.spark.Logging
import com.ning.http.client.AsyncHttpClientConfig
import com.ning.http.client._
import scala.collection.mutable.ArrayBuffer
import java.io.OutputStream
import java.io.ByteArrayInputStream
import java.io.InputStreamReader
import java.io.BufferedReader
import java.io.InputStream
import java.io.PipedInputStream
import java.io.PipedOutputStream

class MeetupReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {
  
  @transient var client: AsyncHttpClient = _
  
  @transient var inputPipe: PipedInputStream = _
  @transient var outputPipe: PipedOutputStream = _  
       
  def onStart() {    
    val cf = new AsyncHttpClientConfig.Builder()
    cf.setRequestTimeout(Integer.MAX_VALUE)
    cf.setReadTimeout(Integer.MAX_VALUE)
    cf.setPooledConnectionIdleTimeout(Integer.MAX_VALUE)      
    client= new AsyncHttpClient(cf.build())
    
    inputPipe = new PipedInputStream(1024 * 1024)
    outputPipe = new PipedOutputStream(inputPipe)
    val producerThread = new Thread(new DataConsumer(inputPipe))
    producerThread.start()
    
    client.prepareGet(url).execute(new AsyncHandler[Unit]{
        
      def onBodyPartReceived(bodyPart: HttpResponseBodyPart) = {
        bodyPart.writeTo(outputPipe)
        AsyncHandler.STATE.CONTINUE        
      }
      
      def onStatusReceived(status: HttpResponseStatus) = {
        AsyncHandler.STATE.CONTINUE
      }
      
      def onHeadersReceived(headers: HttpResponseHeaders) = {
        AsyncHandler.STATE.CONTINUE
      }
            
      def onCompleted = {
        println("completed")
      }
      
      
      def onThrowable(t: Throwable)={
        t.printStackTrace()
      }
        
    })    
    
    
  }

  def onStop() {
    if (Option(client).isDefined) client.close()
    if (Option(outputPipe).isDefined) {
     outputPipe.flush()
     outputPipe.close() 
    }
    if (Option(inputPipe).isDefined) {
     inputPipe.close() 
    }    
  }
  
  class DataConsumer(inputStream: InputStream) extends Runnable {

    override def run() {
      val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))
      var input = bufferedReader.readLine()
      while (input != null) {
        store(input)
        input = bufferedReader.readLine()
      }
    }

  }

} 
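
A usage sketch wiring the receiver into a StreamingContext; the stream URL is assumed to be Meetup's long-lived RSVP feed and may differ in your setup, and the batch interval is arbitrary.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

import receiver.MeetupReceiver

object MeetupStreamDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("MeetupStreamDemo")
    val ssc = new StreamingContext(conf, Seconds(10))
    // Assumed feed URL; each stored line is one raw JSON RSVP.
    val rsvps = ssc.receiverStream(new MeetupReceiver("http://stream.meetup.com/2/rsvps"))
    rsvps.count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}
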
Example 53
Source File: Loggable.scala    From meetup-stream   with Apache License 2.0 5 votes vote down vote up
package core

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [ERROR] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
      Logger.getLogger("org").setLevel(Level.ERROR)
      Logger.getLogger("akka").setLevel(Level.ERROR)
      Logger.getLogger("streaming").setLevel(Level.WARN)
      Logger.getLogger("spark").setLevel(Level.WARN)
    }
  }
} 
Example 54
Source File: DemoUtils.scala    From spark-orientdb-connector   with Apache License 2.0 5 votes vote down vote up
package com.metreta.spark.orientdb.connector.demo

import org.apache.spark.{ Logging, SparkContext, SparkConf }
import com.metreta.spark.orientdb.connector.SparkContextFunctions

trait DemoUtils extends Logging {

  val OrientDBNodesProperty = "spark.orientdb.connection.nodes"
  val DefaultOrientDBNodesProperty = "127.0.0.1"

  val OriendtDBProtocolProperty = "spark.orientdb.protocol"
  val DefaultOriendtDBProtocolProperty = "plocal"

  val OriendtDBDBNameProperty = "spark.orientdb.dbname"
  //  val DefaultOriendtDBDBNameProperty = "testdb"
  val DefaultOriendtDBDBNameProperty = """/path/to/orient"""

  val OriendtDBPortProperty = "spark.orientdb.port"
  val DefaultOriendtDBPortProperty = "2424"

  val OriendtDBUserProperty = "spark.orientdb.user"
  val DefaultOriendtDBUser = "admin"

  val OriendtDBPasswordProperty = "spark.orientdb.password"
  val DefaultOriendtDBPassword = "admin"

  val OriendtDBClusterModeProperty = "spark.orientdb.clustermode" //remote-colocated
  val DefaultOriendtDBClusterMode = "colocated"

  implicit def toSparkContextFunctions(sc: SparkContext): SparkContextFunctions = new SparkContextFunctions(sc)

  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("demo")
    .set(OrientDBNodesProperty, DefaultOrientDBNodesProperty)
    .set(OriendtDBProtocolProperty, DefaultOriendtDBProtocolProperty)
    .set(OriendtDBDBNameProperty, DefaultOriendtDBDBNameProperty)
    .set(OriendtDBPortProperty, DefaultOriendtDBPortProperty)
    .set(OriendtDBUserProperty, DefaultOriendtDBUser)
    .set(OriendtDBPasswordProperty, DefaultOriendtDBPassword)
    .set(OriendtDBClusterModeProperty, DefaultOriendtDBClusterMode)

  lazy val sc = new SparkContext(conf)

}

object DemoUtils {
  def apply(): DemoUtils = new DemoUtils {}
} 
Example 55
Source File: ClassJsonRDDFunctions.scala    From spark-orientdb-connector   with Apache License 2.0 5 votes vote down vote up
package com.metreta.spark.orientdb.connector

import com.metreta.spark.orientdb.connector.api.OrientDBConnector
import com.orientechnologies.orient.core.record.impl.ODocument
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD

class ClassJsonRDDFunctions(rdd: RDD[String]) extends Serializable with Logging {
  
  def saveJsonToOrient(myClass: String)(implicit connector: OrientDBConnector = OrientDBConnector(rdd.sparkContext.getConf)): Unit = {
    rdd.foreachPartition { partition ⇒
      val db = connector.databaseDocumentTx()

      while (partition.hasNext) {
        val obj = partition.next()
        val doc = new ODocument(myClass);
        doc.fromJSON(obj)
        db.save(doc)

      }
      db.commit()
      db.close()
    }
  }
} 
Example 56
Source File: ClassRDDPartitioner.scala    From spark-orientdb-connector   with Apache License 2.0 5 votes vote down vote up
package com.metreta.spark.orientdb.connector.rdd.partitioner

import scala.collection.JavaConversions.iterableAsScalaIterable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.Partition

import com.metreta.spark.orientdb.connector.api.OrientDBConnector
import com.orientechnologies.orient.core.metadata.schema.OClass
import com.orientechnologies.orient.core.metadata.schema.OSchema
import com.orientechnologies.orient.core.storage.OStorage
import com.metreta.spark.orientdb.connector.SystemTables


  def getPartitions(): Array[Partition] = {
    
	val db = connector.databaseDocumentTx()

    var partitions = new ArrayBuffer[OrientPartition]
    val schema: OSchema = connector.getSchema(db)
    var klass: OClass = schema.getClass(mClass)
    val storage: OStorage = connector.getStorage(db)
    klass.getClusterIds.zipWithIndex foreach {
      case (clusterId, index) => partitions = partitions.+=(OrientPartition(
        index,
        null, // <- Host Address ?????
        PartitionName(klass.getName, storage.getClusterById(clusterId).getName)))
    }
  partitions.toArray
  }

} 
Example 57
Source File: CustomReceiver.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{InputStreamReader, BufferedReader, InputStream}
import java.net.Socket

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver


  private def receive() {
   var socket: Socket = null
   var userInput: String = null
   try {
     logInfo("Connecting to " + host + ":" + port)
     socket = new Socket(host, port)
     logInfo("Connected to " + host + ":" + port)
     val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8"))
     userInput = reader.readLine()
     while(!isStopped && userInput != null) {
       store(userInput)
       userInput = reader.readLine()
     }
     reader.close()
     socket.close()
     logInfo("Stopped receiving")
     restart("Trying to connect again")
   } catch {
     case e: java.net.ConnectException =>
       restart("Error connecting to " + host + ":" + port, e)
     case t: Throwable =>
       restart("Error receiving data", t)
   }
  }
}
// scalastyle:on println 
Example 58
Source File: StreamingExamples.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 59
Source File: GraphLoader.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.graphx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}


  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile

} 
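
A usage sketch of edgeListFile; the input path is a placeholder for a whitespace-separated "srcId dstId" edge list, where lines starting with '#' are treated as comments.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphLoader

object EdgeListDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("EdgeListDemo"))
    // Placeholder path; canonicalOrientation = true stores every edge with srcId < dstId.
    val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", canonicalOrientation = true)
    println(s"vertices=${graph.vertices.count()}, edges=${graph.edges.count()}")
    sc.stop()
  }
}
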
Example 60
Source File: CachedRDDManager.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package edu.ucla.cs.wis.bigdatalog.spark.execution.recursion

import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import scala.collection.mutable.{HashMap, HashSet, Set}

class CachedRDDManager(defaultStorageLevel: StorageLevel)
  extends Logging with Serializable {

  val iterationToRDDMap = new HashMap[Int, HashSet[RDD[_]]]
  var currentIteration : Int = 0

  def persist(rdd: RDD[_]): Unit = {
    persist(rdd, false)
  }

  def persist(rdd: RDD[_], doMemoryCheckpoint: Boolean): Unit = {
    iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[RDD[_]]).add(rdd)
    rdd.persist(defaultStorageLevel)

    if (doMemoryCheckpoint)
      rdd.memoryCheckpoint()
  }

  def cleanUpIteration(iterationsBackToRemove: Int = 2) = {
    val start = System.currentTimeMillis()
    if (currentIteration >= iterationsBackToRemove) {
      val iterationId = currentIteration - iterationsBackToRemove
      if (iterationToRDDMap.contains(iterationId)) {
        val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get
        if (rdds.nonEmpty)
          logInfo("Unpersisting "+rdds.size+" rdds for iteration " + iterationId)
        rdds.foreach(rdd => rdd.unpersist(false))
      }
    }
    logInfo("CleanUpIteration took " + (System.currentTimeMillis() - start) + " ms")
    currentIteration += 1
  }

  def cleanUpIterationById(iterationId: Int) = {
    if (iterationToRDDMap.contains(iterationId)) {
      val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get
      rdds.foreach(rdd => rdd.unpersist(false))
    }
  }

  def incrementIteration() { currentIteration += 1 }

  def clear() = {
    iterationToRDDMap.clear()
  }

  def clear(remainCached: Seq[RDD[_]]) = {
    iterationToRDDMap.keySet.foreach(key => logInfo("key: " + key + " value: " + iterationToRDDMap.get(key)))

    iterationToRDDMap.keySet
      .foreach(key => iterationToRDDMap.get(key)
      .foreach(value => value.foreach(item => {if (!remainCached.contains(item)) item.unpersist(false)})))

    iterationToRDDMap.clear()
  }

  def unpersist(rdds: Set[RDD[_]]) = {
    for (rdd <- rdds) {
      iterationToRDDMap.synchronized {
        // rdd should only be in 1 iteration
        val iterations = iterationToRDDMap.filter(x => x._2.contains(rdd))
        if (iterations.nonEmpty) {
          val iteration = iterations.head
          iteration._2.remove(rdd)
          rdd.unpersist(false)
          if (iteration._2.isEmpty)
            iterationToRDDMap.remove(iteration._1)
        }
      }
    }
  }

  override def toString = {
    val output = new StringBuilder
    iterationToRDDMap.keySet.toSeq.sorted
      .foreach(iteration => {
        val rdds = iterationToRDDMap.get(iteration)
        rdds.foreach(rdd => output.append(iteration + ":" + rdd + "\n"))
      })
    output.toString()
  }
} 
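
A sketch of how the manager above might be driven across recursive iterations; it assumes the CachedRDDManager class from this example is compiled on the classpath (it relies on BigDatalog's RDD.memoryCheckpoint extension), and the RDD contents are toy data.

import edu.ucla.cs.wis.bigdatalog.spark.execution.recursion.CachedRDDManager
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object RecursionCacheExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("RecursionCacheExample"))
    val manager = new CachedRDDManager(StorageLevel.MEMORY_ONLY)
    var frontier = sc.parallelize(1 to 10)
    for (_ <- 0 until 5) {
      frontier = frontier.map(_ + 1)
      manager.persist(frontier)    // register the RDD under the current iteration
      manager.cleanUpIteration()   // unpersist RDDs from two iterations back, then advance
    }
    manager.clear()
    sc.stop()
  }
}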
Example 61
Source File: QuerySuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package edu.ucla.cs.wis.bigdatalog.spark

import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
import org.scalatest.FunSuite

import scala.collection.mutable.ArrayBuffer

abstract class QuerySuite extends FunSuite with Logging {

  case class TestCase(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String], answersSize: Int) {
    def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) = this(program, query, data, null, answersSize)

    def this(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String]) = this(program, query, data, answers, answers.size)
  }

  def runTest(testCase: TestCase): Unit = runTests(Seq(testCase))

  def runTests(testCases: Seq[TestCase]): Unit = {
    val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf()
      .set("spark.eventLog.enabled", "true")
      //.set("spark.eventLog.dir", "../logs")
      .set("spark.ui.enabled", "false")
      .set("spark.sql.shuffle.partitions", "5")
      .setAll(Map.empty[String, String])
    )

    val bigDatalogCtx = new BigDatalogContext(sparkCtx)

    var count: Int = 1
    for (testCase <- testCases) {
      bigDatalogCtx.loadProgram(testCase.program)

      for ((relationName, data) <- testCase.data) {
        val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName)
        if (relationInfo == null)
          throw new SparkException("You are attempting to load an unknown relation.")

        bigDatalogCtx.registerAndLoadTable(relationName, data, bigDatalogCtx.conf.numShufflePartitions)
      }

      val query = testCase.query
      val answers = testCase.answers
      logInfo("========== START BigDatalog Query " + count + " START ==========")
      val program = bigDatalogCtx.query(query)

      val results = program.execute().collect()

      // for some test cases we will only know the size of the answer set, not the actual answers
      if (answers == null) {
        assert(results.size == testCase.answersSize)
      } else {
        if (results.size != answers.size) {
          displayDifferences(results.map(_.toString), answers)
          // yes this will fail
          assert(results.size == answers.size)
        } else {
          for (result <- results)
            assert(answers.contains(result.toString()))
        }

        val resultStrings = results.map(_.toString).toSet

        for (answer <- answers)
          assert(resultStrings.contains(answer.toString()))
      }
      logInfo("========== END BigDatalog Query " + count + " END ==========\n")
      count += 1
      bigDatalogCtx.reset()
    }

    sparkCtx.stop()
  }

  private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = {
    val missingAnswers = new ArrayBuffer[String]
    val missingResults = new ArrayBuffer[String]

    for (result <- results)
      if (!answers.contains(result))
        missingAnswers += result

    for (answer <- answers)
      if (!results.contains(answer))
        missingResults += answer

    if (missingAnswers.nonEmpty)
      logInfo("Results not in Answers: " + missingAnswers.mkString(", "))

    if (missingResults.nonEmpty)
      logInfo("Answers not in Results: " + missingResults.mkString(", "))
  }
} 
Example 62
Source File: LibSVMRelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}


@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
} 
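
A usage sketch for the data source registered above, assuming Spark 1.6 where the short name "libsvm" resolves to this DefaultSource; the file path points at Spark's bundled sample data and may differ in your checkout.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object LibSVMReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("LibSVMReadExample"))
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read
      .format("libsvm") // resolved through DataSourceRegister.shortName above
      // "numFeatures" and "vectorType" can be supplied via .option(...), as parsed in createRelation
      .load("data/mllib/sample_libsvm_data.txt")
    df.printSchema() // label: double, features: vector
    sc.stop()
  }
}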
Example 63
Source File: Transformer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.withColumn($(outputCol),
      callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
Example 64
Source File: LocalKMeans.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.clustering

import scala.util.Random

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.BLAS.{axpy, scal}

private[mllib] object LocalKMeans extends Logging {
  def kMeansPlusPlus(
      seed: Int,
      points: Array[VectorWithNorm],
      weights: Array[Double],
      k: Int,
      maxIterations: Int
  ): Array[VectorWithNorm] = {
    val rand = new Random(seed)
    val dimensions = points(0).vector.size
    val centers = new Array[VectorWithNorm](k)

    // Initialize centers by sampling using the k-means++ procedure.
    centers(0) = pickWeighted(rand, points, weights).toDense
    for (i <- 1 until k) {
      // Pick the next center with a probability proportional to cost under current centers
      val curCenters = centers.view.take(i)
      val sum = points.view.zip(weights).map { case (p, w) =>
        w * KMeans.pointCost(curCenters, p)
      }.sum
      val r = rand.nextDouble() * sum
      var cumulativeScore = 0.0
      var j = 0
      while (j < points.length && cumulativeScore < r) {
        cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j))
        j += 1
      }
      if (j == 0) {
        logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." +
          s" Using duplicate point for center k = $i.")
        centers(i) = points(0).toDense
      } else {
        centers(i) = points(j - 1).toDense
      }
    }

    // Run up to maxIterations iterations of Lloyd's algorithm
    val oldClosest = Array.fill(points.length)(-1)
    var iteration = 0
    var moved = true
    while (moved && iteration < maxIterations) {
      moved = false
      val counts = Array.fill(k)(0.0)
      val sums = Array.fill(k)(Vectors.zeros(dimensions))
      var i = 0
      while (i < points.length) {
        val p = points(i)
        val index = KMeans.findClosest(centers, p)._1
        axpy(weights(i), p.vector, sums(index))
        counts(index) += weights(i)
        if (index != oldClosest(i)) {
          moved = true
          oldClosest(i) = index
        }
        i += 1
      }
      // Update centers
      var j = 0
      while (j < k) {
        if (counts(j) == 0.0) {
          // Assign center to a random point
          centers(j) = points(rand.nextInt(points.length)).toDense
        } else {
          scal(1.0 / counts(j), sums(j))
          centers(j) = new VectorWithNorm(sums(j))
        }
        j += 1
      }
      iteration += 1
    }

    if (iteration == maxIterations) {
      logInfo(s"Local KMeans++ reached the max number of iterations: $maxIterations.")
    } else {
      logInfo(s"Local KMeans++ converged in $iteration iterations.")
    }

    centers
  }

  private def pickWeighted[T](rand: Random, data: Array[T], weights: Array[Double]): T = {
    val r = rand.nextDouble() * weights.sum
    var i = 0
    var curWeight = 0.0
    while (i < data.length && curWeight < r) {
      curWeight += weights(i)
      i += 1
    }
    data(i - 1)
  }
} 
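
The helper above is package-private and is normally reached indirectly through the public MLlib KMeans API; a sketch of that route follows, assuming Spark 1.x and an in-memory toy dataset.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object KMeansExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("KMeansExample"))
    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))
    // The default k-means|| initialization refines its candidate centers with kMeansPlusPlus.
    val model = KMeans.train(points, k = 2, maxIterations = 20)
    model.clusterCenters.foreach(println)
    sc.stop()
  }
}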
Example 65
Source File: PearsonCorrelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

private[stat] object PearsonCorrelation extends Correlation with Logging {
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
} 
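
Callers do not usually touch this object directly; it sits behind org.apache.spark.mllib.stat.Statistics. A minimal sketch of that public entry point follows, assuming Spark 1.x and the toy vectors below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object PearsonExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("PearsonExample"))
    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0), Vectors.dense(2.0, 20.0), Vectors.dense(3.0, 31.0)))
    // "pearson" is the default method; cell (i, j) holds the correlation of columns i and j.
    val corrMatrix = Statistics.corr(rows, "pearson")
    println(corrMatrix)
    sc.stop()
  }
}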
Example 66
Source File: SpearmanCorrelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

private[stat] object SpearmanCorrelation extends Correlation with Logging {
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
} 
Example 67
Source File: DataValidators.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
object DataValidators extends Logging {
  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
} 
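
A quick sketch of applying the validator defined above, assuming Spark 1.x; the labeled points are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

object LabelCheckExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("LabelCheckExample"))
    val data = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0)),
      LabeledPoint(2.0, Vectors.dense(2.0))))   // label 2.0 is out of range for k = 2
    val isValid = DataValidators.multiLabelValidator(2)(data)
    println(s"labels valid for k = 2: $isValid") // false; the invalid-label count is logged as an error
    sc.stop()
  }
}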
Example 68
Source File: TwitterInputDStream.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.twitter

import twitter4j._
import twitter4j.auth.Authorization
import twitter4j.conf.ConfigurationBuilder
import twitter4j.auth.OAuthAuthorization

import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver


private[streaming]
class TwitterInputDStream(
    ssc_ : StreamingContext,
    twitterAuth: Option[Authorization],
    filters: Seq[String],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[Status](ssc_)  {

  private def createOAuthAuthorization(): Authorization = {
    new OAuthAuthorization(new ConfigurationBuilder().build())
  }

  private val authorization = twitterAuth.getOrElse(createOAuthAuthorization())

  override def getReceiver(): Receiver[Status] = {
    new TwitterReceiver(authorization, filters, storageLevel)
  }
}

private[streaming]
class TwitterReceiver(
    twitterAuth: Authorization,
    filters: Seq[String],
    storageLevel: StorageLevel
  ) extends Receiver[Status](storageLevel) with Logging {

  @volatile private var twitterStream: TwitterStream = _
  @volatile private var stopped = false

  def onStart() {
    try {
      val newTwitterStream = new TwitterStreamFactory().getInstance(twitterAuth)
      newTwitterStream.addListener(new StatusListener {
        def onStatus(status: Status): Unit = {
          store(status)
        }
        // Unimplemented
        def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) {}
        def onTrackLimitationNotice(i: Int) {}
        def onScrubGeo(l: Long, l1: Long) {}
        def onStallWarning(stallWarning: StallWarning) {}
        def onException(e: Exception) {
          if (!stopped) {
            restart("Error receiving tweets", e)
          }
        }
      })

      val query = new FilterQuery
      if (filters.size > 0) {
        query.track(filters.mkString(","))
        newTwitterStream.filter(query)
      } else {
        newTwitterStream.sample()
      }
      setTwitterStream(newTwitterStream)
      logInfo("Twitter receiver started")
      stopped = false
    } catch {
      case e: Exception => restart("Error starting Twitter stream", e)
    }
  }

  def onStop() {
    stopped = true
    setTwitterStream(null)
    logInfo("Twitter receiver stopped")
  }

  private def setTwitterStream(newTwitterStream: TwitterStream) = synchronized {
    if (twitterStream != null) {
      twitterStream.shutdown()
    }
    twitterStream = newTwitterStream
  }
} 
Example 69
Source File: TwitterStreamSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.twitter


import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
} 
Example 70
Source File: EventTransformer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.flume

import java.io.{ObjectOutput, ObjectInput}

import scala.collection.JavaConverters._

import org.apache.spark.util.Utils
import org.apache.spark.Logging


private[streaming] object EventTransformer extends Logging {
  def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence],
    Array[Byte]) = {
    val bodyLength = in.readInt()
    val bodyBuff = new Array[Byte](bodyLength)
    in.readFully(bodyBuff)

    val numHeaders = in.readInt()
    val headers = new java.util.HashMap[CharSequence, CharSequence]

    for (i <- 0 until numHeaders) {
      val keyLength = in.readInt()
      val keyBuff = new Array[Byte](keyLength)
      in.readFully(keyBuff)
      val key: String = Utils.deserialize(keyBuff)

      val valLength = in.readInt()
      val valBuff = new Array[Byte](valLength)
      in.readFully(valBuff)
      val value: String = Utils.deserialize(valBuff)

      headers.put(key, value)
    }
    (headers, bodyBuff)
  }

  def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence],
    body: Array[Byte]) {
    out.writeInt(body.length)
    out.write(body)
    val numHeaders = headers.size()
    out.writeInt(numHeaders)
    for ((k, v) <- headers.asScala) {
      val keyBuff = Utils.serialize(k.toString)
      out.writeInt(keyBuff.length)
      out.write(keyBuff)
      val valBuff = Utils.serialize(v.toString)
      out.writeInt(valBuff.length)
      out.write(valBuff)
    }
  }
} 
Example 71
Source File: FlumeStreamSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.flume

import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets
import org.jboss.netty.channel.ChannelPipeline
import org.jboss.netty.channel.socket.SocketChannel
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
  val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite")
  var ssc: StreamingContext = null

  test("flume input stream") {
    testFlumeStream(testCompression = false)
  }

  test("flume input compressed stream") {
    testFlumeStream(testCompression = true)
  }

  
  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
} 
Example 72
Source File: MQTTTestUtils.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.mqtt

import java.net.{ServerSocket, URI}

import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.apache.activemq.broker.{BrokerService, TransportConnector}
import org.apache.commons.lang3.RandomUtils
import org.eclipse.paho.client.mqttv3._
import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence

import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SparkConf}


private[mqtt] class MQTTTestUtils extends Logging {

  private val persistenceDir = Utils.createTempDir()
  private val brokerHost = "localhost"
  private val brokerPort = findFreePort()

  private var broker: BrokerService = _
  private var connector: TransportConnector = _

  def brokerUri: String = {
    s"$brokerHost:$brokerPort"
  }

  def setup(): Unit = {
    broker = new BrokerService()
    broker.setDataDirectoryFile(Utils.createTempDir())
    connector = new TransportConnector()
    connector.setName("mqtt")
    connector.setUri(new URI("mqtt://" + brokerUri))
    broker.addConnector(connector)
    broker.start()
  }

  def teardown(): Unit = {
    if (broker != null) {
      broker.stop()
      broker = null
    }
    if (connector != null) {
      connector.stop()
      connector = null
    }
    Utils.deleteRecursively(persistenceDir)
  }

  private def findFreePort(): Int = {
    val candidatePort = RandomUtils.nextInt(1024, 65536)
    Utils.startServiceOnPort(candidatePort, (trialPort: Int) => {
      val socket = new ServerSocket(trialPort)
      socket.close()
      (null, trialPort)
    }, new SparkConf())._2
  }

  def publishData(topic: String, data: String): Unit = {
    var client: MqttClient = null
    try {
      val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath)
      client = new MqttClient("tcp://" + brokerUri, MqttClient.generateClientId(), persistence)
      client.connect()
      if (client.isConnected) {
        val msgTopic = client.getTopic(topic)
        val message = new MqttMessage(data.getBytes(UTF_8))
        message.setQos(1)
        message.setRetained(true)

        for (i <- 0 to 10) {
          try {
            msgTopic.publish(message)
          } catch {
            case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT =>
              // wait for Spark streaming to consume something from the message queue
              Thread.sleep(50)
          }
        }
      }
    } finally {
      if (client != null) {
        client.disconnect()
        client.close()
        client = null
      }
    }
  }

} 
Example 73
Source File: OrcFileOperator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.Logging
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveMetastoreTypes
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {
  
  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(path: String, conf: Option[Configuration]): StructType = {
    val reader = getFileReader(path, conf).getOrElse {
      throw new AnalysisException(
        s"Failed to discover schema from ORC files stored in $path. " +
          "Probably there are either no ORC files or only empty ORC files.")
    }
    val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
    val schema = readerInspector.getTypeName
    logDebug(s"Reading schema from file $path, got Hive schema string: $schema")
    HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
  }

  def getObjectInspector(
      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDir)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))

    if (paths == null || paths.isEmpty) {
      throw new IllegalArgumentException(
        s"orcFileOperator: path $path does not have valid orc files matching the pattern")
    }

    paths
  }
} 
Example 74
Source File: FiltersSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.client

import java.util.Collections

import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.serde.serdeConstants

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._


class FiltersSuite extends SparkFunSuite with Logging {
  private val shim = new Shim_v0_13

  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
  private val varCharCol = new FieldSchema()
  varCharCol.setName("varchar")
  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
  testTable.setPartCols(Collections.singletonList(varCharCol))

  filterTest("string filter",
    (a("stringcol", StringType) > Literal("test")) :: Nil,
    "stringcol > \"test\"")

  filterTest("string filter backwards",
    (Literal("test") > a("stringcol", StringType)) :: Nil,
    "\"test\" > stringcol")

  filterTest("int filter",
    (a("intcol", IntegerType) === Literal(1)) :: Nil,
    "intcol = 1")

  filterTest("int filter backwards",
    (Literal(1) === a("intcol", IntegerType)) :: Nil,
    "1 = intcol")

  filterTest("int and string filter",
    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
    "1 = intcol and \"a\" = strcol")

  filterTest("skip varchar",
    (Literal("") === a("varchar", StringType)) :: Nil,
    "")

  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
    test(name){
      val converted = shim.convertFilters(testTable, filters)
      if (converted != result) {
        fail(
          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
      }
    }
  }

  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
} 
Example 75
Source File: SparkSQLDriver.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}
import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 76
Source File: SparkSQLOperationManager.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}
import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}


private[thriftserver] class SparkSQLOperationManager()
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()
  val sessionToContexts = Map[SessionHandle, HiveContext]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val hiveContext = sessionToContexts(parentSession.getSessionHandle)
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
} 
Example 77
Source File: ThriftServerTab.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver.ui

import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv}
import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._
import org.apache.spark.ui.{SparkUI, SparkUITab}
import org.apache.spark.{SparkContext, Logging, SparkException}


private[thriftserver] class ThriftServerTab(sparkContext: SparkContext)
  extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging {

  override val name = "JDBC/ODBC Server"

  val parent = getSparkUI(sparkContext)
  val listener = HiveThriftServer2.listener

  attachPage(new ThriftServerPage(this))
  attachPage(new ThriftServerSessionPage(this))
  parent.attachTab(this)

  def detach() {
    getSparkUI(sparkContext).detachTab(this)
  }
}

private[thriftserver] object ThriftServerTab {
  def getSparkUI(sparkContext: SparkContext): SparkUI = {
    sparkContext.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
} 
Example 78
Source File: SparkSQLEnv.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils


  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
} 
Example 79
Source File: BoundAttribute.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors.attachTree
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types._


case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
  extends LeafExpression with NamedExpression {

  override def toString: String = s"input[$ordinal, $dataType]"

  // Use special getter for primitive types (for UnsafeRow)
  override def eval(input: InternalRow): Any = {
    if (input.isNullAt(ordinal)) {
      null
    } else {
      dataType match {
        case BooleanType => input.getBoolean(ordinal)
        case ByteType => input.getByte(ordinal)
        case ShortType => input.getShort(ordinal)
        case IntegerType | DateType => input.getInt(ordinal)
        case LongType | TimestampType => input.getLong(ordinal)
        case FloatType => input.getFloat(ordinal)
        case DoubleType => input.getDouble(ordinal)
        case StringType => input.getUTF8String(ordinal)
        case BinaryType => input.getBinary(ordinal)
        case CalendarIntervalType => input.getInterval(ordinal)
        case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale)
        case t: StructType => input.getStruct(ordinal, t.size)
        case _: ArrayType => input.getArray(ordinal)
        case _: MapType => input.getMap(ordinal)
        case _ => input.get(ordinal, dataType)
      }
    }
  }

  override def name: String = s"i[$ordinal]"

  override def toAttribute: Attribute = throw new UnsupportedOperationException

  override def qualifiers: Seq[String] = throw new UnsupportedOperationException

  override def exprId: ExprId = throw new UnsupportedOperationException

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val javaType = ctx.javaType(dataType)
    val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString)
    s"""
      boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal);
      $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);
    """
  }
}

object BindReferences extends Logging {

  def bindReference[A <: Expression](
      expression: A,
      input: Seq[Attribute],
      allowFailures: Boolean = false): A = {
    expression.transform { case a: AttributeReference =>
      attachTree(a, "Binding attribute") {
        val ordinal = input.indexWhere(_.exprId == a.exprId)
        if (ordinal == -1) {
          if (allowFailures) {
            a
          } else {
            sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}")
          }
        } else {
          BoundReference(ordinal, a.dataType, a.nullable)
        }
      }
    }.asInstanceOf[A] // Kind of a hack, but safe.  TODO: Tighten return type when possible.
  }
} 
Example 80
Source File: RuleExecutor.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.rules

import scala.collection.JavaConverters._

import com.google.common.util.concurrent.AtomicLongMap

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.util.sideBySide

object RuleExecutor {
  protected val timeMap = AtomicLongMap.create[String]()

  
  def execute(plan: TreeType): TreeType = {
    var curPlan = plan

    batches.foreach { batch =>
      val batchStartPlan = curPlan
      var iteration = 1
      var lastPlan = curPlan
      var continue = true

      // Run until fix point (or the max number of iterations as specified in the strategy).
      while (continue) {
        curPlan = batch.rules.foldLeft(curPlan) {
          case (plan, rule) =>
            val startTime = System.nanoTime()
            val result = rule(plan)
            val runTime = System.nanoTime() - startTime
            RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime)

            if (!result.fastEquals(plan)) {
              logTrace(
                s"""
                  |=== Applying Rule ${rule.ruleName} ===
                  |${sideBySide(plan.treeString, result.treeString).mkString("\n")}
                """.stripMargin)
            }

            result
        }
        iteration += 1
        if (iteration > batch.strategy.maxIterations) {
          // Only log if this is a rule that is supposed to run more than once.
          if (iteration != 2) {
            logInfo(s"Max iterations (${iteration - 1}) reached for batch ${batch.name}")
          }
          continue = false
        }

        if (curPlan.fastEquals(lastPlan)) {
          logTrace(
            s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.")
          continue = false
        }
        lastPlan = curPlan
      }

      if (!batchStartPlan.fastEquals(curPlan)) {
        logDebug(
          s"""
          |=== Result of Batch ${batch.name} ===
          |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")}
        """.stripMargin)
      } else {
        logTrace(s"Batch ${batch.name} has no effect.")
      }
    }

    curPlan
  }
} 
Example 81
Source File: package.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import scala.collection.mutable.HashSet

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.{Accumulator, AccumulatorParam, Logging}


    case class ColumnMetrics(
        elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty))
    val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0)

    val numColumns: Int = child.output.size
    val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

    def dumpStats(): Unit = {
      logDebug(s"== ${child.simpleString} ==")
      logDebug(s"Tuples output: ${tupleCount.value}")
      child.output.zip(columnStats).foreach { case(attr, metric) =>
        val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}")
        logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
      }
    }

    protected override def doExecute(): RDD[InternalRow] = {
      child.execute().mapPartitions { iter =>
        new Iterator[InternalRow] {
          def hasNext: Boolean = iter.hasNext
          def next(): InternalRow = {
            val currentRow = iter.next()
            tupleCount += 1
            var i = 0
            while (i < numColumns) {
              val value = currentRow.get(i, output(i).dataType)
              if (value != null) {
                columnStats(i).elementTypes += HashSet(value.getClass.getName)
              }
              i += 1
            }
            currentRow
          }
        }
      }
    }
  }
} 
Example 82
Source File: DriverRegistry.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import java.sql.{Driver, DriverManager}

import scala.collection.mutable

import org.apache.spark.Logging
import org.apache.spark.util.Utils


object DriverRegistry extends Logging {

  private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty

  def register(className: String): Unit = {
    val cls = Utils.getContextOrSparkClassLoader.loadClass(className)
    if (cls.getClassLoader == null) {
      logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required")
    } else if (wrapperMap.get(className).isDefined) {
      logTrace(s"Wrapper for $className already exists")
    } else {
      synchronized {
        if (wrapperMap.get(className).isEmpty) {
          val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver])
          DriverManager.registerDriver(wrapper)
          wrapperMap(className) = wrapper
          logTrace(s"Wrapper for $className registered")
        }
      }
    }
  }
} 
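
DriverRegistry is normally invoked internally by Spark's JDBC data source, but a direct call looks like the sketch below; the PostgreSQL driver class name is an assumption, and the driver jar must already be on the classpath.

import org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry

object JdbcDriverSetup {
  def main(args: Array[String]): Unit = {
    // Wraps the driver so java.sql.DriverManager can locate it even when it was loaded
    // by a non-bootstrap class loader (e.g. one created for --jars dependencies).
    DriverRegistry.register("org.postgresql.Driver")
  }
}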
Example 83
Source File: FrequentItems.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, Column, DataFrame}

private[sql] object FrequentItems extends Logging {

  
  private[sql] def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4, s"support ($support) must be greater than 1e-4.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    new DataFrame(df.sqlContext, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
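
FrequentItems backs the public DataFrame statistics API; a minimal sketch of that entry point follows, assuming Spark 1.6's SQLContext and the toy rows below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object FreqItemsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("FreqItemsExample"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val df = sc.parallelize(Seq(1, 1, 1, 2, 3, 1, 1, 2)).toDF("value")
    // Finds items with relative frequency above 0.4 (approximate; may return false positives);
    // the result column is named "value_freqItems".
    val freq = df.stat.freqItems(Seq("value"), 0.4)
    freq.show()
    sc.stop()
  }
}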
Example 84
Source File: CompressibleColumnBuilder.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.columnar.compression

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder}
import org.apache.spark.sql.types.AtomicType


private[columnar] trait CompressibleColumnBuilder[T <: AtomicType]
  extends ColumnBuilder with Logging {

  this: NativeColumnBuilder[T] with WithCompressionSchemes =>

  var compressionEncoders: Seq[Encoder[T]] = _

  abstract override def initialize(
      initialSize: Int,
      columnName: String,
      useCompression: Boolean): Unit = {

    compressionEncoders =
      if (useCompression) {
        schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
      } else {
        Seq(PassThrough.encoder(columnType))
      }
    super.initialize(initialSize, columnName, useCompression)
  }

  protected def isWorthCompressing(encoder: Encoder[T]) = {
    encoder.compressionRatio < 0.8
  }

  private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {
    var i = 0
    while (i < compressionEncoders.length) {
      compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
      i += 1
    }
  }

  abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = {
    super.appendFrom(row, ordinal)
    if (!row.isNullAt(ordinal)) {
      gatherCompressibilityStats(row, ordinal)
    }
  }

  override def build(): ByteBuffer = {
    val nonNullBuffer = buildNonNulls()
    val encoder: Encoder[T] = {
      val candidate = compressionEncoders.minBy(_.compressionRatio)
      if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
    }

    // Header = null count + null positions
    val headerSize = 4 + nulls.limit()
    val compressedSize = if (encoder.compressedSize == 0) {
      nonNullBuffer.remaining()
    } else {
      encoder.compressedSize
    }

    val compressedBuffer = ByteBuffer
      // Reserves 4 bytes for compression scheme ID
      .allocate(headerSize + 4 + compressedSize)
      .order(ByteOrder.nativeOrder)
      // Write the header
      .putInt(nullCount)
      .put(nulls)

    logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
    encoder.compress(nonNullBuffer, compressedBuffer)
  }
} 
Example 85
Source File: ExecutorDelegationTokenUpdater.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf =
    SparkHadoopUtil.get.getConfBypassingFSCache(
      hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer =
    Executors.newSingleThreadScheduledExecutor(
      ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable =
    new Runnable {
      override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
    }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
        val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
        if (suffix > lastCredentialsFileSuffix) {
          logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
          val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
          lastCredentialsFileSuffix = suffix
          UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
          logInfo("Tokens updated from credentials file.")
        } else {
          // Check every hour to see if new credentials arrived.
          logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
            "tokens yet, will check again in an hour.")
          delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
          return
        }
      }
      val timeFromNowToRenewal =
        SparkHadoopUtil.get.getTimeFromNowToRenewal(
          sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        // We just checked for new credentials but none were there, wait a minute and retry.
        // This handles the shutdown case where the staging directory may have been removed(see
        // SPARK-12316 for more details).
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES)
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }

} 
Example 86
Source File: SocketInputDStream.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator

import scala.reflect.ClassTag

import java.io._
import java.net.{UnknownHostException, Socket}
import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  
  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
} 
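
The DStream defined above is what StreamingContext.socketTextStream returns; a short end-to-end sketch follows, assuming Spark 1.x, a 1-second batch interval, and a text server on localhost:9999 (for example nc -lk 9999).

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SocketWordCount")
    val ssc = new StreamingContext(conf, Seconds(1))
    // Internally builds the SocketInputDStream/SocketReceiver pair shown above.
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}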
Example 87
Source File: StreamingTab.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.ui

import org.apache.spark.{Logging, SparkException}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ui.{SparkUI, SparkUITab}

import StreamingTab._


private[spark] class StreamingTab(val ssc: StreamingContext)
  extends SparkUITab(getSparkUI(ssc), "streaming") with Logging {

  private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static"

  val parent = getSparkUI(ssc)
  val listener = ssc.progressListener

  ssc.addStreamingListener(listener)
  ssc.sc.addSparkListener(listener)
  attachPage(new StreamingPage(this))
  attachPage(new BatchPage(this))

  def attach() {
    getSparkUI(ssc).attachTab(this)
    getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
  }

  def detach() {
    getSparkUI(ssc).detachTab(this)
    getSparkUI(ssc).removeStaticHandler("/static/streaming")
  }
}

private object StreamingTab {
  def getSparkUI(ssc: StreamingContext): SparkUI = {
    ssc.sc.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
} 
Example 88
Source File: StreamingListenerBus.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.scheduler

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.Logging
import org.apache.spark.util.AsynchronousListenerBus


private[spark] class StreamingListenerBus
  extends AsynchronousListenerBus[StreamingListener, StreamingListenerEvent]("StreamingListenerBus")
  with Logging {

  private val logDroppedEvent = new AtomicBoolean(false)

  override def onPostEvent(listener: StreamingListener, event: StreamingListenerEvent): Unit = {
    event match {
      case receiverStarted: StreamingListenerReceiverStarted =>
        listener.onReceiverStarted(receiverStarted)
      case receiverError: StreamingListenerReceiverError =>
        listener.onReceiverError(receiverError)
      case receiverStopped: StreamingListenerReceiverStopped =>
        listener.onReceiverStopped(receiverStopped)
      case batchSubmitted: StreamingListenerBatchSubmitted =>
        listener.onBatchSubmitted(batchSubmitted)
      case batchStarted: StreamingListenerBatchStarted =>
        listener.onBatchStarted(batchStarted)
      case batchCompleted: StreamingListenerBatchCompleted =>
        listener.onBatchCompleted(batchCompleted)
      case outputOperationStarted: StreamingListenerOutputOperationStarted =>
        listener.onOutputOperationStarted(outputOperationStarted)
      case outputOperationCompleted: StreamingListenerOutputOperationCompleted =>
        listener.onOutputOperationCompleted(outputOperationCompleted)
      case _ =>
    }
  }

  override def onDropEvent(event: StreamingListenerEvent): Unit = {
    if (logDroppedEvent.compareAndSet(false, true)) {
      // Only log the following message once to avoid duplicated annoying logs.
      logError("Dropping StreamingListenerEvent because no remaining room in event queue. " +
        "This likely means one of the StreamingListeners is too slow and cannot keep up with the " +
        "rate at which events are being started by the scheduler.")
    }
  }
} 
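The bus above only dispatches events; applications plug in by registering a StreamingListener of their own. A minimal listener that logs batch completion, registered through ssc.addStreamingListener (the class name here is illustrative):

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

class BatchTimeListener extends StreamingListener {
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    // processingDelay is only defined once the batch has finished processing
    val delayMs = batchCompleted.batchInfo.processingDelay.getOrElse(-1L)
    println(s"Batch ${batchCompleted.batchInfo.batchTime} completed, processing delay: $delayMs ms")
  }
}

// Registration on an existing StreamingContext:
//   ssc.addStreamingListener(new BatchTimeListener)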
Example 89
Source File: RecurringTimer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.util

import org.apache.spark.Logging
import org.apache.spark.util.{Clock, SystemClock}

private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }

  @volatile private var prevTime = -1L
  @volatile private var nextTime = -1L
  @volatile private var stopped = false

  
  private def loop() {
    try {
      while (!stopped) {
        triggerActionForNextInterval()
      }
      triggerActionForNextInterval()
    } catch {
      case e: InterruptedException =>
    }
  }
}

private[streaming]
object RecurringTimer extends Logging {

  def main(args: Array[String]) {
    var lastRecurTime = 0L
    val period = 1000

    def onRecur(time: Long) {
      val currentTime = System.currentTimeMillis()
      logInfo("" + currentTime + ": " + (currentTime - lastRecurTime))
      lastRecurTime = currentTime
    }
    val timer = new RecurringTimer(new SystemClock(), period, onRecur, "Test")
    timer.start()
    Thread.sleep(30 * 1000)
    timer.stop(true)
  }
} 
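The excerpt above omits the timer's start(), stop() and triggerActionForNextInterval() members that main() relies on. A rough, self-contained sketch of the same pattern, using System.currentTimeMillis instead of Spark's Clock abstraction (names chosen here for illustration only):

class SimpleRecurringTimer(period: Long, callback: Long => Unit, name: String) {
  @volatile private var nextTime = -1L
  @volatile private var stopped = false

  private val thread = new Thread("SimpleRecurringTimer - " + name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped) {
          // Sleep until the next period boundary, then invoke the callback
          val sleepMs = nextTime - System.currentTimeMillis()
          if (sleepMs > 0) Thread.sleep(sleepMs)
          callback(nextTime)
          nextTime += period
        }
      } catch {
        case _: InterruptedException => // stop(true) interrupts the sleep
      }
    }
  }

  def start(): Long = {
    // Align the first callback to the next period boundary
    nextTime = (System.currentTimeMillis() / period + 1) * period
    thread.start()
    nextTime
  }

  def stop(interruptTimer: Boolean): Unit = {
    stopped = true
    if (interruptTimer) thread.interrupt()
    thread.join()
  }
}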
Example 90
Source File: RawTextSender.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
} 
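Each record is framed as a 4-byte length prefix followed by the Kryo-serialized block, so a client can read the stream with plain java.io. A sketch assuming the sender is listening on localhost:9999:

import java.io.DataInputStream
import java.net.Socket

object RawTextClient {
  def main(args: Array[String]): Unit = {
    val socket = new Socket("localhost", 9999)
    val in = new DataInputStream(socket.getInputStream)
    try {
      while (true) {
        val length = in.readInt()            // the 4-byte prefix written via countBuf
        val payload = new Array[Byte](length)
        in.readFully(payload)                // the Kryo-serialized block that follows
        println(s"Received block of $length bytes")
      }
    } finally {
      socket.close()
    }
  }
}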
Example 91
Source File: FileBasedWriteAheadLogReader.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.util

import java.io.{IOException, Closeable, EOFException}
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.spark.Logging


private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration)
  extends Iterator[ByteBuffer] with Closeable with Logging {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream
  private var nextItem: Option[ByteBuffer] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      return false
    }

    if (nextItem.isDefined) { // handle the case where hasNext is called without calling next
      true
    } else {
      try {
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(ByteBuffer.wrap(buffer))
        logTrace("Read next item " + nextItem.get)
        true
      } catch {
        case e: EOFException =>
          logDebug("Error reading next item, EOF reached", e)
          close()
          false
        case e: IOException =>
          logWarning("Error while trying to read data. If the file was deleted, " +
            "this should be okay.", e)
          close()
          if (HdfsUtils.checkFileExists(path, conf)) {
            // If file exists, this could be a legitimate error
            throw e
          } else {
            // File was deleted. This can occur when the daemon cleanup thread takes time to
            // delete the file during recovery.
            false
          }

        case e: Exception =>
          logWarning("Error while trying to read data from HDFS.", e)
          close()
          throw e
      }
    }
  }

  override def next(): ByteBuffer = synchronized {
    val data = nextItem.getOrElse {
      close()
      throw new IllegalStateException(
        "next called without calling hasNext or after hasNext returned false")
    }
    nextItem = None // Ensure the next hasNext call loads new data.
    data
  }

  override def close(): Unit = synchronized {
    if (!closed) {
      instream.close()
    }
    closed = true
  }
} 
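Because the reader is an Iterator[ByteBuffer], replaying a log segment is a plain iteration. A REPL-style sketch (the path is a placeholder, and since the class is private[streaming] this only compiles from code inside that package):

import org.apache.hadoop.conf.Configuration

val reader = new FileBasedWriteAheadLogReader("hdfs:///tmp/spark-wal/log-0", new Configuration())
try {
  reader.foreach { buffer =>
    println(s"Record of ${buffer.remaining()} bytes")
  }
} finally {
  reader.close()
}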
Example 92
Source File: RateLimitedOutputStream.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.util

import scala.annotation.tailrec

import java.io.OutputStream
import java.util.concurrent.TimeUnit._

import org.apache.spark.Logging


private[streaming]
class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int)
  extends OutputStream
  with Logging {

  require(desiredBytesPerSec > 0)

  private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS)
  private val CHUNK_SIZE = 8192
  private var lastSyncTime = System.nanoTime
  private var bytesWrittenSinceSync = 0L

  override def write(b: Int) {
    waitToWrite(1)
    out.write(b)
  }

  override def write(bytes: Array[Byte]) {
    write(bytes, 0, bytes.length)
  }

  @tailrec
  override final def write(bytes: Array[Byte], offset: Int, length: Int) {
    val writeSize = math.min(length - offset, CHUNK_SIZE)
    if (writeSize > 0) {
      waitToWrite(writeSize)
      out.write(bytes, offset, writeSize)
      write(bytes, offset + writeSize, length)
    }
  }

  override def flush() {
    out.flush()
  }

  override def close() {
    out.close()
  }

  @tailrec
  private def waitToWrite(numBytes: Int) {
    val now = System.nanoTime
    val elapsedNanosecs = math.max(now - lastSyncTime, 1)
    val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs
    if (rate < desiredBytesPerSec) {
      // It's okay to write; just update some variables and return
      bytesWrittenSinceSync += numBytes
      if (now > lastSyncTime + SYNC_INTERVAL) {
        // Sync interval has passed; let's resync
        lastSyncTime = now
        bytesWrittenSinceSync = numBytes
      }
    } else {
      // Calculate how much time we should sleep to bring ourselves to the desired rate.
      val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec
      val elapsedTimeInMillis = elapsedNanosecs / 1000000
      val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis
      if (sleepTimeInMillis > 0) {
        logTrace("Natural rate is " + rate + " per second but desired rate is " +
          desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.")
        Thread.sleep(sleepTimeInMillis)
      }
      waitToWrite(numBytes)
    }
  }
} 
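waitToWrite compares the running average bytesWrittenSinceSync / elapsed time against desiredBytesPerSec and sleeps just long enough to bring the average back under the target. A REPL-style sketch capping writes at roughly 1 MB/s (the file path is a placeholder; the class is private[streaming], so this assumes code in that package):

import java.io.FileOutputStream

val throttled = new RateLimitedOutputStream(new FileOutputStream("/tmp/limited.bin"), 1024 * 1024)
try {
  val block = Array.fill[Byte](256 * 1024)(0)
  (1 to 40).foreach(_ => throttled.write(block))  // ~10 MB should take roughly 10 seconds
} finally {
  throttled.close()
}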
Example 93
Source File: FailureSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming

import java.io.File

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkFunSuite, Logging}
import org.apache.spark.util.Utils


class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  private val batchDuration: Duration = Milliseconds(1000)
  private val numBatches = 30
  private var directory: File = null

  before {
    directory = Utils.createTempDir()
  }

  after {
    if (directory != null) {
      Utils.deleteRecursively(directory)
    }
    StreamingContext.getActive().foreach { _.stop() }
  }

  test("multiple failures with map") {
    MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration)
  }

  test("multiple failures with updateStateByKey") {
    MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration)
  }
} 
Example 94
Source File: EventLogDownloadResource.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }

      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }

        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
} 
Example 95
Source File: NettyRpcCallContext.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rpc.netty

import scala.concurrent.Promise

import org.apache.spark.Logging
import org.apache.spark.network.client.RpcResponseCallback
import org.apache.spark.rpc.{RpcAddress, RpcCallContext}

private[netty] abstract class NettyRpcCallContext(override val senderAddress: RpcAddress)
  extends RpcCallContext with Logging {

  protected def send(message: Any): Unit

  override def reply(response: Any): Unit = {
    send(response)
  }

  override def sendFailure(e: Throwable): Unit = {
    send(RpcFailure(e))
  }

}


private[netty] class RemoteNettyRpcCallContext(
    nettyEnv: NettyRpcEnv,
    callback: RpcResponseCallback,
    senderAddress: RpcAddress)
  extends NettyRpcCallContext(senderAddress) {

  override protected def send(message: Any): Unit = {
    val reply = nettyEnv.serialize(message)
    callback.onSuccess(reply)
  }
} 
Example 96
Source File: BlockTransferService.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  
  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
} 
Example 97
Source File: NettyBlockRpcServer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConverters._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}


class NettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(appId, blocks.iterator.asJava)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer)

      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata))
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(ByteBuffer.allocate(0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
} 
Example 98
Source File: MetricsConfig.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.metrics

import java.io.{FileInputStream, InputStream}
import java.util.Properties

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex

import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SparkConf}

private[spark] class MetricsConfig(conf: SparkConf) extends Logging {

  private val DEFAULT_PREFIX = "*"
  private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r
  private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties"

  private[metrics] val properties = new Properties()
  private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null

  private def setDefaultProperties(prop: Properties) {
    prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet")
    prop.setProperty("*.sink.servlet.path", "/metrics/json")
    prop.setProperty("master.sink.servlet.path", "/metrics/master/json")
    prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json")
  }

  def initialize() {
    // Add default properties in case there's no properties file
    setDefaultProperties(properties)

    loadPropertiesFromFile(conf.getOption("spark.metrics.conf"))

    // Also look for the properties in provided Spark configuration
    val prefix = "spark.metrics.conf."
    conf.getAll.foreach {
      case (k, v) if k.startsWith(prefix) =>
        properties.setProperty(k.substring(prefix.length()), v)
      case _ =>
    }

    propertyCategories = subProperties(properties, INSTANCE_REGEX)
    if (propertyCategories.contains(DEFAULT_PREFIX)) {
      val defaultProperty = propertyCategories(DEFAULT_PREFIX).asScala
      for((inst, prop) <- propertyCategories if (inst != DEFAULT_PREFIX);
          (k, v) <- defaultProperty if (prop.get(k) == null)) {
        prop.put(k, v)
      }
    }
  }

  def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = {
    val subProperties = new mutable.HashMap[String, Properties]
    prop.asScala.foreach { kv =>
      if (regex.findPrefixOf(kv._1.toString).isDefined) {
        val regex(prefix, suffix) = kv._1.toString
        subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2.toString)
      }
    }
    subProperties
  }

  def getInstance(inst: String): Properties = {
    propertyCategories.get(inst) match {
      case Some(s) => s
      case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties)
    }
  }

  
  private[this] def loadPropertiesFromFile(path: Option[String]): Unit = {
    var is: InputStream = null
    try {
      is = path match {
        case Some(f) => new FileInputStream(f)
        case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME)
      }

      if (is != null) {
        properties.load(is)
      }
    } catch {
      case e: Exception =>
        val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME)
        logError(s"Error loading configuration file $file", e)
    } finally {
      if (is != null) {
        is.close()
      }
    }
  }

} 
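Because initialize() strips the spark.metrics.conf. prefix from matching SparkConf entries, sinks can be configured either through a metrics.properties file or directly on the SparkConf. A sketch of the latter, using the console sink (period and unit values are illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Equivalent to "*.sink.console.class=..." lines in metrics.properties
  .set("spark.metrics.conf.*.sink.console.class", "org.apache.spark.metrics.sink.ConsoleSink")
  .set("spark.metrics.conf.*.sink.console.period", "10")
  .set("spark.metrics.conf.*.sink.console.unit", "seconds")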
Example 99
Source File: PythonGatewayServer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.python

import java.io.DataOutputStream
import java.net.Socket

import py4j.GatewayServer

import org.apache.spark.Logging
import org.apache.spark.util.Utils


private[spark] object PythonGatewayServer extends Logging {
  def main(args: Array[String]): Unit = Utils.tryOrExit {
    // Start a GatewayServer on an ephemeral port
    val gatewayServer: GatewayServer = new GatewayServer(null, 0)
    gatewayServer.start()
    val boundPort: Int = gatewayServer.getListeningPort
    if (boundPort == -1) {
      logError("GatewayServer failed to bind; exiting")
      System.exit(1)
    } else {
      logDebug(s"Started PythonGatewayServer on port $boundPort")
    }

    // Communicate the bound port back to the caller via the caller-specified callback port
    val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST")
    val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt
    logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort")
    val callbackSocket = new Socket(callbackHost, callbackPort)
    val dos = new DataOutputStream(callbackSocket.getOutputStream)
    dos.writeInt(boundPort)
    dos.close()
    callbackSocket.close()

    // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies:
    while (System.in.read() != -1) {
      // Do nothing
    }
    logDebug("Exiting due to broken pipe from Python driver")
    System.exit(0)
  }
} 
Example 100
Source File: MesosExternalShuffleService.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.mesos

import java.net.SocketAddress
import java.nio.ByteBuffer

import scala.collection.mutable

import org.apache.spark.{Logging, SecurityManager, SparkConf}
import org.apache.spark.deploy.ExternalShuffleService
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler
import org.apache.spark.network.shuffle.protocol.BlockTransferMessage
import org.apache.spark.network.shuffle.protocol.mesos.RegisterDriver
import org.apache.spark.network.util.TransportConf


private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager)
  extends ExternalShuffleService(conf, securityManager) {

  protected override def newShuffleBlockHandler(
      conf: TransportConf): ExternalShuffleBlockHandler = {
    new MesosExternalShuffleBlockHandler(conf)
  }
}

private[spark] object MesosExternalShuffleService extends Logging {

  def main(args: Array[String]): Unit = {
    ExternalShuffleService.main(args,
      (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm))
  }
} 
Example 101
Source File: MesosClusterDispatcher.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.mesos

import java.util.concurrent.CountDownLatch

import org.apache.spark.deploy.mesos.ui.MesosClusterUI
import org.apache.spark.deploy.rest.mesos.MesosRestServer
import org.apache.spark.scheduler.cluster.mesos._
import org.apache.spark.util.SignalLogger
import org.apache.spark.{Logging, SecurityManager, SparkConf}


private[mesos] class MesosClusterDispatcher(
    args: MesosClusterDispatcherArguments,
    conf: SparkConf)
  extends Logging {

  private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host)
  private val recoveryMode = conf.get("spark.mesos.deploy.recoveryMode", "NONE").toUpperCase()
  logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode)

  private val engineFactory = recoveryMode match {
    case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory
    case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf)
    case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode)
  }

  private val scheduler = new MesosClusterScheduler(engineFactory, conf)

  private val server = new MesosRestServer(args.host, args.port, conf, scheduler)
  private val webUi = new MesosClusterUI(
    new SecurityManager(conf),
    args.webUiPort,
    conf,
    publicAddress,
    scheduler)

  private val shutdownLatch = new CountDownLatch(1)

  def start(): Unit = {
    webUi.bind()
    scheduler.frameworkUrl = webUi.activeWebUiUrl
    scheduler.start()
    server.start()
  }

  def awaitShutdown(): Unit = {
    shutdownLatch.await()
  }

  def stop(): Unit = {
    webUi.stop()
    server.stop()
    scheduler.stop()
    shutdownLatch.countDown()
  }
}

private[mesos] object MesosClusterDispatcher extends Logging {
  def main(args: Array[String]) {
    SignalLogger.register(log)
    val conf = new SparkConf
    val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf)
    conf.setMaster(dispatcherArgs.masterUrl)
    conf.setAppName(dispatcherArgs.name)
    dispatcherArgs.zookeeperUrl.foreach { z =>
      conf.set("spark.mesos.deploy.recoveryMode", "ZOOKEEPER")
      conf.set("spark.mesos.deploy.zookeeper.url", z)
    }
    val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf)
    dispatcher.start()
    val shutdownHook = new Thread() {
      override def run() {
        logInfo("Shutdown hook is shutting down dispatcher")
        dispatcher.stop()
        dispatcher.awaitShutdown()
      }
    }
    Runtime.getRuntime.addShutdownHook(shutdownHook)
    dispatcher.awaitShutdown()
  }
} 
Example 102
Source File: SparkCuratorUtil.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import scala.collection.JavaConverters._

import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.zookeeper.KeeperException

import org.apache.spark.{Logging, SparkConf}

private[spark] object SparkCuratorUtil extends Logging {

  private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000
  private val ZK_SESSION_TIMEOUT_MILLIS = 60000
  private val RETRY_WAIT_MILLIS = 5000
  private val MAX_RECONNECT_ATTEMPTS = 3

  def newClient(
      conf: SparkConf,
      zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = {
    val ZK_URL = conf.get(zkUrlConf)
    val zk = CuratorFrameworkFactory.newClient(ZK_URL,
      ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS,
      new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS))
    zk.start()
    zk
  }

  def mkdir(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) == null) {
      try {
        zk.create().creatingParentsIfNeeded().forPath(path)
      } catch {
        case nodeExist: KeeperException.NodeExistsException =>
          // do nothing, ignore node existing exception.
        case e: Exception => throw e
      }
    }
  }

  def deleteRecursive(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) != null) {
      for (child <- zk.getChildren.forPath(path).asScala) {
        zk.delete().forPath(path + "/" + child)
      }
      zk.delete().forPath(path)
    }
  }
} 
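A REPL-style sketch combining the helpers above, assuming spark.deploy.zookeeper.url points at a reachable ensemble (the object is private[spark], so this assumes code under an org.apache.spark package):

import org.apache.spark.SparkConf

val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181")
val zk = SparkCuratorUtil.newClient(conf)
try {
  SparkCuratorUtil.mkdir(zk, "/spark/example")           // no-op if the node already exists
  SparkCuratorUtil.deleteRecursive(zk, "/spark/example")  // removes children, then the node itself
} finally {
  zk.close()
}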
Example 103
Source File: TestClient.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.client

import org.apache.spark.rpc.RpcEnv
import org.apache.spark.{SecurityManager, SparkConf, Logging}
import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.util.Utils

private[spark] object TestClient {

  private class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = args(0)
    val conf = new SparkConf
    val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf))
    val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$")
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf)
    client.start()
    rpcEnv.awaitTermination()
  }
} 
Example 104
Source File: FileSystemPersistenceEngine.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.io._

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer}
import org.apache.spark.util.Utils



private[master] class FileSystemPersistenceEngine(
    val dir: String,
    val serializer: Serializer)
  extends PersistenceEngine with Logging {

  new File(dir).mkdir()

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(new File(dir + File.separator + name), obj)
  }

  override def unpersist(name: String): Unit = {
    val f = new File(dir + File.separator + name)
    if (!f.delete()) {
      logWarning(s"Error deleting ${f.getPath()}")
    }
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix))
    files.map(deserializeFromFile[T])
  }

  private def serializeIntoFile(file: File, value: AnyRef) {
    val created = file.createNewFile()
    if (!created) { throw new IllegalStateException("Could not create file: " + file) }
    val fileOut = new FileOutputStream(file)
    var out: SerializationStream = null
    Utils.tryWithSafeFinally {
      out = serializer.newInstance().serializeStream(fileOut)
      out.writeObject(value)
    } {
      fileOut.close()
      if (out != null) {
        out.close()
      }
    }
  }

  private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
    val fileIn = new FileInputStream(file)
    var in: DeserializationStream = null
    try {
      in = serializer.newInstance().deserializeStream(fileIn)
      in.readObject[T]()
    } finally {
      fileIn.close()
      if (in != null) {
        in.close()
      }
    }
  }

} 
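A round-trip sketch using Spark's JavaSerializer (the directory is a placeholder, and the class is private[master], so this assumes code in that package):

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

val engine = new FileSystemPersistenceEngine("/tmp/spark-recovery", new JavaSerializer(new SparkConf()))
engine.persist("app_example", "some serializable state")
val restored: Seq[String] = engine.read[String]("app_")   // reads every file starting with "app_"
engine.unpersist("app_example")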
Example 105
Source File: RecoveryModeFactory.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer


private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
} 
Example 106
Source File: MasterWebUI.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master.ui

import org.apache.spark.Logging
import org.apache.spark.deploy.master.Master
import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationsListResource, ApplicationInfo,
  UIRoot}
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._


  def detachSparkUI(ui: SparkUI) {
    assert(serverInfo.isDefined, "Master UI must be bound to a server before detaching SparkUIs")
    ui.getHandlers.foreach(detachHandler)
  }

  def getApplicationInfoList: Iterator[ApplicationInfo] = {
    val state = masterPage.getMasterState
    val activeApps = state.activeApps.sortBy(_.startTime).reverse
    val completedApps = state.completedApps.sortBy(_.endTime).reverse
    activeApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, false) } ++
      completedApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, true) }
  }

  def getSparkUI(appId: String): Option[SparkUI] = {
    val state = masterPage.getMasterState
    val activeApps = state.activeApps.sortBy(_.startTime).reverse
    val completedApps = state.completedApps.sortBy(_.endTime).reverse
    (activeApps ++ completedApps).find { _.id == appId }.flatMap {
      master.rebuildSparkUI
    }
  }
}

private[master] object MasterWebUI {
  private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
} 
Example 107
Source File: ZooKeeperLeaderElectionAgent.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.curator.framework.CuratorFramework
import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch}
import org.apache.spark.deploy.SparkCuratorUtil

private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable,
    conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging  {

  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"

  private var zk: CuratorFramework = _
  private var leaderLatch: LeaderLatch = _
  private var status = LeadershipStatus.NOT_LEADER

  start()

  private def start() {
    logInfo("Starting ZooKeeper LeaderElection agent")
    zk = SparkCuratorUtil.newClient(conf)
    leaderLatch = new LeaderLatch(zk, WORKING_DIR)
    leaderLatch.addListener(this)
    leaderLatch.start()
  }

  override def stop() {
    leaderLatch.close()
    zk.close()
  }

  override def isLeader() {
    synchronized {
      // could have lost leadership by now.
      if (!leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have gained leadership")
      updateLeadershipStatus(true)
    }
  }

  override def notLeader() {
    synchronized {
      // could have gained leadership by now.
      if (leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have lost leadership")
      updateLeadershipStatus(false)
    }
  }

  private def updateLeadershipStatus(isLeader: Boolean) {
    if (isLeader && status == LeadershipStatus.NOT_LEADER) {
      status = LeadershipStatus.LEADER
      masterInstance.electedLeader()
    } else if (!isLeader && status == LeadershipStatus.LEADER) {
      status = LeadershipStatus.NOT_LEADER
      masterInstance.revokedLeadership()
    }
  }

  private object LeadershipStatus extends Enumeration {
    type LeadershipStatus = Value
    val LEADER, NOT_LEADER = Value
  }
} 
Example 108
Source File: ZooKeeperPersistenceEngine.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.serializer.Serializer


private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine
  with Logging {

  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)


  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    zk.getChildren.forPath(WORKING_DIR).asScala
      .filter(_.startsWith(prefix)).map(deserializeFromFile[T]).flatten
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception => {
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
      }
    }
  }
} 
Example 109
Source File: CommandUtils.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils


private[deploy] object CommandUtils extends Logging {

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 110
Source File: WorkerWebUI.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker.ui

import java.io.File
import javax.servlet.http.HttpServletRequest

import org.apache.spark.Logging
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._
import org.apache.spark.util.RpcUtils


  def initialize() {
    val logPage = new LogPage(this)
    attachPage(logPage)
    attachPage(new WorkerPage(this))
    attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static"))
    attachHandler(createServletHandler("/log",
      (request: HttpServletRequest) => logPage.renderLog(request),
      worker.securityMgr,
      worker.conf))
  }
}

private[worker] object WorkerWebUI {
  val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR
  val DEFAULT_RETAINED_DRIVERS = 1000
  val DEFAULT_RETAINED_EXECUTORS = 1000
} 
Example 111
Source File: WorkerWatcher.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker

import org.apache.spark.Logging
import org.apache.spark.rpc._


private[spark] class WorkerWatcher(
    override val rpcEnv: RpcEnv, workerUrl: String, isTesting: Boolean = false)
  extends RpcEndpoint with Logging {

  logInfo(s"Connecting to worker $workerUrl")
  if (!isTesting) {
    rpcEnv.asyncSetupEndpointRefByURI(workerUrl)
  }

  // Used to avoid shutting down JVM during tests
  // In the normal case, exitNonZero will call `System.exit(-1)` to shutdown the JVM. In the unit
  // test, the user should call `setTesting(true)` so that `exitNonZero` will set `isShutDown` to
  // true rather than calling `System.exit`. The user can check `isShutDown` to know if
  // `exitNonZero` is called.
  private[deploy] var isShutDown = false

  // Lets filter events only from the worker's rpc system
  private val expectedAddress = RpcAddress.fromURIString(workerUrl)
  private def isWorker(address: RpcAddress) = expectedAddress == address

  private def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1)

  override def receive: PartialFunction[Any, Unit] = {
    case e => logWarning(s"Received unexpected message: $e")
  }

  override def onConnected(remoteAddress: RpcAddress): Unit = {
    if (isWorker(remoteAddress)) {
      logInfo(s"Successfully connected to $workerUrl")
    }
  }

  override def onDisconnected(remoteAddress: RpcAddress): Unit = {
    if (isWorker(remoteAddress)) {
      // This log message will never be seen
      logError(s"Lost connection to worker rpc endpoint $workerUrl. Exiting.")
      exitNonZero()
    }
  }

  override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = {
    if (isWorker(remoteAddress)) {
      // These logs may not be seen if the worker (and associated pipe) has died
      logError(s"Could not initialize connection to worker $workerUrl. Exiting.")
      logError(s"Error was: $cause")
      exitNonZero()
    }
  }
} 
Example 112
Source File: HistoryServerArguments.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.history

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.Utils


private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String])
  extends Logging {
  private var propertiesFile: String = null

  parse(args.toList)

  private def parse(args: List[String]): Unit = {
    if (args.length == 1) {
      setLogDirectory(args.head)
    } else {
      args match {
        case ("--dir" | "-d") :: value :: tail =>
          setLogDirectory(value)
          parse(tail)

        case ("--help" | "-h") :: tail =>
          printUsageAndExit(0)

        case ("--properties-file") :: value :: tail =>
          propertiesFile = value
          parse(tail)

        case Nil =>

        case _ =>
          printUsageAndExit(1)
      }
    }
  }

  private def setLogDirectory(value: String): Unit = {
    logWarning("Setting log directory through the command line is deprecated as of " +
      "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.")
    conf.set("spark.history.fs.logDirectory", value)
  }

  // This mutates the SparkConf, so all accesses to it must be made after this line
  Utils.loadDefaultSparkProperties(conf, propertiesFile)

  private def printUsageAndExit(exitCode: Int) {
    // scalastyle:off println
    System.err.println(
      """
      |Usage: HistoryServer [options]
      |
      |Options:
      |  DIR                         Deprecated; set spark.history.fs.logDirectory directly
      |  --dir DIR (-d DIR)          Deprecated; set spark.history.fs.logDirectory directly
      |  --properties-file FILE      Path to a custom Spark properties file.
      |                              Default is conf/spark-defaults.conf.
      |
      |Configuration options can be set by setting the corresponding JVM system property.
      |History Server options are always available; additional options depend on the provider.
      |
      |History Server options:
      |
      |  spark.history.ui.port              Port where server will listen for connections
      |                                     (default 18080)
      |  spark.history.acls.enable          Whether to enable view acls for all applications
      |                                     (default false)
      |  spark.history.provider             Name of history provider class (defaults to
      |                                     file system-based provider)
      |  spark.history.retainedApplications Max number of application UIs to keep loaded in memory
      |                                     (default 50)
      |FsHistoryProvider options:
      |
      |  spark.history.fs.logDirectory      Directory where app logs are stored
      |                                     (default: file:/tmp/spark-events)
      |  spark.history.fs.updateInterval    How often to reload log data from storage
      |                                     (in seconds, default: 10)
      |""".stripMargin)
    // scalastyle:on println
    System.exit(exitCode)
  }

} 
Example 113
Source File: LocalSparkCluster.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.rpc.RpcEnv
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.deploy.master.Master
import org.apache.spark.util.Utils


    for (workerNum <- 1 to numWorkers) {
      val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker,
        memoryPerWorker, masters, null, Some(workerNum), _conf)
      workerRpcEnvs += workerEnv
    }

    masters
  }

  def stop() {
    logInfo("Shutting down local Spark cluster.")
    // Stop the workers before the master so they don't get upset that it disconnected
    workerRpcEnvs.foreach(_.shutdown())
    masterRpcEnvs.foreach(_.shutdown())
    workerRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.foreach(_.awaitTermination())
    masterRpcEnvs.clear()
    workerRpcEnvs.clear()
  }
} 
Example 114
Source File: SimrSchedulerBackend.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: "  + driverFilePath)
    logInfo("Writing Akka address: "  + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    if (!fs.delete(new Path(driverFilePath), false)) {
      logWarning(s"error deleting ${driverFilePath}")
    }
    super.stop()
  }

} 
Example 115
Source File: MesosClusterPersistenceEngine.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster.mesos

import scala.collection.JavaConverters._

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode
import org.apache.zookeeper.KeeperException.NoNodeException

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.util.Utils


private[spark] class ZookeeperMesosClusterPersistenceEngine(
    baseDir: String,
    zk: CuratorFramework,
    conf: SparkConf)
  extends MesosClusterPersistenceEngine with Logging {
  private val WORKING_DIR =
    conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)

  def path(name: String): String = {
    WORKING_DIR + "/" + name
  }

  override def expunge(name: String): Unit = {
    zk.delete().forPath(path(name))
  }

  override def persist(name: String, obj: Object): Unit = {
    val serialized = Utils.serialize(obj)
    val zkPath = path(name)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized)
  }

  override def fetch[T](name: String): Option[T] = {
    val zkPath = path(name)

    try {
      val fileData = zk.getData().forPath(zkPath)
      Some(Utils.deserialize[T](fileData))
    } catch {
      case e: NoNodeException => None
      case e: Exception => {
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(zkPath)
        None
      }
    }
  }

  override def fetchAll[T](): Iterable[T] = {
    zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T])
  }
} 
Example 116
Source File: MesosTaskLaunchData.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

import org.apache.mesos.protobuf.ByteString

import org.apache.spark.Logging


private[spark] case class MesosTaskLaunchData(
  serializedTask: ByteBuffer,
  attemptNumber: Int) extends Logging {

  def toByteString: ByteString = {
    val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
    dataBuffer.putInt(attemptNumber)
    dataBuffer.put(serializedTask)
    dataBuffer.rewind
    logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
    ByteString.copyFrom(dataBuffer)
  }
}

private[spark] object MesosTaskLaunchData extends Logging {
  def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
    val byteBuffer = byteString.asReadOnlyByteBuffer()
    logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
    val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
    val serializedTask = byteBuffer.slice() // subsequence starting at the current position
    MesosTaskLaunchData(serializedTask, attemptNumber)
  }
} 
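The attempt number occupies the first four bytes and the serialized task the remainder, so encoding and decoding are symmetric. A REPL-style round-trip sketch (the payload bytes are arbitrary):

import java.nio.ByteBuffer

val original = MesosTaskLaunchData(ByteBuffer.wrap("task-bytes".getBytes("UTF-8")), attemptNumber = 3)
val wire = original.toByteString                      // 4-byte attempt number + task payload
val decoded = MesosTaskLaunchData.fromByteString(wire)
assert(decoded.attemptNumber == 3)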
Example 117
Source File: ReplayListenerBus.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io.{InputStream, IOException}

import scala.io.Source

import com.fasterxml.jackson.core.JsonParseException
import org.json4s.jackson.JsonMethods._

import org.apache.spark.Logging
import org.apache.spark.util.JsonProtocol


private[spark] class ReplayListenerBus extends SparkListenerBus with Logging {

  def replay(
      logData: InputStream,
      sourceName: String,
      maybeTruncated: Boolean = false): Unit = {
    var currentLine: String = null
    var lineNumber: Int = 1
    try {
      val lines = Source.fromInputStream(logData).getLines()
      while (lines.hasNext) {
        currentLine = lines.next()
        try {
          postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine)))
        } catch {
          case jpe: JsonParseException =>
            // We can only ignore exception from last line of the file that might be truncated
            if (!maybeTruncated || lines.hasNext) {
              throw jpe
            } else {
              logWarning(s"Got JsonParseException from log file $sourceName" +
                s" at line $lineNumber, the file might not have finished writing cleanly.")
            }
        }
        lineNumber += 1
      }
    } catch {
      case ioe: IOException =>
        throw ioe
      case e: Exception =>
        logError(s"Exception parsing Spark event log: $sourceName", e)
        logError(s"Malformed line #$lineNumber: $currentLine\n")
    }
  }

} 
Example 118
Source File: SparkUncaughtExceptionHandler.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import org.apache.spark.Logging


private[spark] object SparkUncaughtExceptionHandler
  extends Thread.UncaughtExceptionHandler with Logging {

  override def uncaughtException(thread: Thread, exception: Throwable) {
    try {
      // Make it explicit that uncaught exceptions are thrown when container is shutting down.
      // It will help users when they analyze the executor logs
      val inShutdownMsg = if (ShutdownHookManager.inShutdown()) "[Container in shutdown] " else ""
      val errMsg = "Uncaught exception in thread "
      logError(inShutdownMsg + errMsg + thread, exception)

      // We may have been called from a shutdown hook. If so, we must not call System.exit().
      // (If we do, we will deadlock.)
      if (!ShutdownHookManager.inShutdown()) {
        if (exception.isInstanceOf[OutOfMemoryError]) {
          System.exit(SparkExitCode.OOM)
        } else {
          System.exit(SparkExitCode.UNCAUGHT_EXCEPTION)
        }
      }
    } catch {
      case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM)
      case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE)
    }
  }

  def uncaughtException(exception: Throwable) {
    uncaughtException(Thread.currentThread(), exception)
  }
} 
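Spark installs this object as the JVM-wide default handler in its daemon entry points via Thread.setDefaultUncaughtExceptionHandler. The same pattern with a custom handler, as a sketch (the exit codes here are placeholders, not Spark's SparkExitCode values):

object ExitOnUncaught extends Thread.UncaughtExceptionHandler {
  override def uncaughtException(thread: Thread, exception: Throwable): Unit = {
    System.err.println(s"Uncaught exception in $thread: $exception")
    // Halt directly on OOM, since a normal shutdown may itself fail; placeholder exit codes
    if (exception.isInstanceOf[OutOfMemoryError]) Runtime.getRuntime.halt(2) else System.exit(1)
  }
}

// Install once, early in main():
//   Thread.setDefaultUncaughtExceptionHandler(ExitOnUncaught)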
Example 119
Source File: BlockManagerSlaveEndpoint.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.concurrent.{ExecutionContext, Future}

import org.apache.spark.{Logging, MapOutputTracker, SparkEnv}
import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
import org.apache.spark.storage.BlockManagerMessages._
import org.apache.spark.util.{ThreadUtils, Utils}


private[storage]
class BlockManagerSlaveEndpoint(
    override val rpcEnv: RpcEnv,
    blockManager: BlockManager,
    mapOutputTracker: MapOutputTracker)
  extends ThreadSafeRpcEndpoint with Logging {

  private val asyncThreadPool =
    ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool")
  private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool)

  // Operations that involve removing blocks may be slow and should be done asynchronously
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    case RemoveBlock(blockId) =>
      doAsync[Boolean]("removing block " + blockId, context) {
        blockManager.removeBlock(blockId)
        true
      }

    case RemoveRdd(rddId) =>
      doAsync[Int]("removing RDD " + rddId, context) {
        blockManager.removeRdd(rddId)
      }

    case RemoveShuffle(shuffleId) =>
      doAsync[Boolean]("removing shuffle " + shuffleId, context) {
        if (mapOutputTracker != null) {
          mapOutputTracker.unregisterShuffle(shuffleId)
        }
        SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
      }

    case RemoveBroadcast(broadcastId, _) =>
      doAsync[Int]("removing broadcast " + broadcastId, context) {
        blockManager.removeBroadcast(broadcastId, tellMaster = true)
      }

    case GetBlockStatus(blockId, _) =>
      context.reply(blockManager.getStatus(blockId))

    case GetMatchingBlockIds(filter, _) =>
      context.reply(blockManager.getMatchingBlockIds(filter))

    case TriggerThreadDump =>
      context.reply(Utils.getThreadDump())
  }

  private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) {
    val future = Future {
      logDebug(actionMessage)
      body
    }
    future.onSuccess { case response =>
      logDebug("Done " + actionMessage + ", response is " + response)
      context.reply(response)
      logDebug("Sent response: " + response + " to " + context.senderAddress)
    }
    future.onFailure { case t: Throwable =>
      logError("Error in " + actionMessage, t)
      context.sendFailure(t)
    }
  }

  override def onStop(): Unit = {
    asyncThreadPool.shutdownNow()
  }
} 
Example 120
Source File: OrderedRDDFunctions.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi


// The class declaration and its Scaladoc were stripped from this listing; the
// standard Spark 1.x signature and the `ordering` value required by inRange are
// restored below so the snippet is complete.
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
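Because filterByRange consults the RDD's partitioner, it can prune partitions whenever the data is range-partitioned, for example right after sortByKey(). A minimal usage sketch (local mode, toy data; not part of the original source):

import org.apache.spark.{SparkConf, SparkContext}

object FilterByRangeExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("filterByRange").setMaster("local[2]"))
    try {
      // sortByKey installs a RangePartitioner, so filterByRange only scans the
      // partitions that can contain keys in [lower, upper].
      val sorted = sc.parallelize(Seq("d" -> 4, "a" -> 1, "c" -> 3, "b" -> 2)).sortByKey()
      sorted.filterByRange("b", "c").collect().foreach(println) // prints (b,2) and (c,3)
    } finally {
      sc.stop()
    }
  }
}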
Example 121
Source File: MemoryRDDCheckpointData.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.Logging
import scala.reflect.ClassTag

class MemoryRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
  extends RDDCheckpointData[T](rdd) with Logging {

  protected override def doCheckpoint(): CheckpointRDD[T] = {
    val level = rdd.getStorageLevel
    // Callers must persist the RDD with a memory-based storage level before reaching this code;
    // by the time this method runs, the RDD should already be cached, and that caching is what
    // truncates the lineage. The storage level is deliberately not set here, so a caller that
    // forgot to cache receives the error from the assumption below.

    // Local checkpointing is not sufficient for this purpose because it requires running a new job.
    // If local checkpointing (or checkpointing in general) were integrated into the block manager,
    // this approach would become unnecessary.

    // Assume storage level uses memory; otherwise eviction may cause data loss
    assume(level.useMemory, s"Storage level $level is not appropriate for memory checkpointing")

    new MemoryCheckpointRDD[T](rdd)
  }
} 
Example 122
Source File: SparkFunSuite.scala    From yggdrasil   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.scalatest.{FunSuite, Outcome}


// The class declaration and its Scaladoc were stripped from this listing; the usual
// Spark test-helper signature is assumed and restored below so the snippet compiles.
abstract class SparkFunSuite extends FunSuite with Logging {

  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }
} 
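Assuming the declaration restored above, a concrete suite only has to extend the helper, and the banner lines are logged around every test automatically. A small illustrative sketch (not part of the original project):

package org.apache.spark.mllib.util

class VectorMathSuite extends SparkFunSuite {
  test("element-wise vector addition") {
    val sum = Array(1.0, 2.0).zip(Array(3.0, 4.0)).map { case (a, b) => a + b }
    assert(sum.toSeq === Seq(4.0, 6.0))
  }
}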
Example 123
Source File: StoryJSONExtractor.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.story

import java.io._
import java.util.Date

import io.gzet.story.util.Tokenizer
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.elasticsearch.spark._
import org.json4s.DefaultFormats
import org.json4s.native.JsonMethods._

import scala.util.Try

object StoryJSONExtractor extends SimpleConfig with Logging {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Story Extractor")
    val sc = new SparkContext(sparkConf)

    val outputDir = args.head
    val minWeight = Try(args.last.toInt).getOrElse(0)

    val nodes = sc.esJsonRDD(esNodesResource).map({ case (_, strJson) =>
      implicit val format = DefaultFormats
      val json = parse(strJson)
      val title = (json \ "title").extractOrElse[String]("")
      val gid = (json \ "gid").extractOrElse[Int](-1)
      val articles = (json \ "articles").extractOrElse[Int](-1)
      val cid = (json \ "cid").extractOrElse[Int](-1)
      val date = (json \ "date").extractOrElse[Long](0L)
      Array(cid, gid, new Date(date).toString, articles, Tokenizer.lucene(title.replaceAll("\\n", "").replaceAll("\\r", "")).mkString(" ")).mkString(",")
    }).collect()

    val nodesMap = sc.broadcast(sc.esJsonRDD(esNodesResource).map({ case (_, strJson) =>
      implicit val format = DefaultFormats
      val json = parse(strJson)
      val gid = (json \ "gid").extractOrElse[Int](-1)
      val cid = (json \ "cid").extractOrElse[Int](-1)
      (cid, gid)
    }).collectAsMap())

    val edges = sc.esJsonRDD(esEdgesResource).map({ case (_, strJson) =>
      implicit val format = DefaultFormats
      val json = parse(strJson)
      val source = (json \ "source").extractOrElse[Int](-1)
      val target = (json \ "target").extractOrElse[Int](-1)
      val weight = (json \ "weight").extractOrElse[Int](-1)
      (source, target, weight)
    }).filter(_._3 > minWeight).map({ case (source, target, weight) =>
      val mutation = nodesMap.value.getOrElse(source, -1) != nodesMap.value.getOrElse(target, -1)
      Array(source, target, weight, mutation).mkString(",")
    }).collect()

    printToFile(new File(s"$outputDir/nodes")) { p =>
      p.println("id,story,date,articles,label")
      nodes.foreach(p.println)
    }

    printToFile(new File(s"$outputDir/edges")) { p =>
      p.println("source,target,weight,mutation")
      edges.foreach(p.println)
    }
  }

  def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit) {
    val p = new java.io.PrintWriter(f)
    try {
      op(p)
    } finally {
      p.close()
    }
  }
} 
Example 124
Source File: SimhashIndexing.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.story

import java.net.URL

import com.datastax.spark.connector._
import io.gzet.story.model.Article
import io.gzet.story.util.SimhashUtils._
import io.gzet.story.util.{HtmlFetcher, Tokenizer}
import io.gzet.utils.spark.gdelt.GKGParser
import org.apache.lucene.analysis.en.EnglishAnalyzer
import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}

import scala.util.Try

object SimhashIndexing extends SimpleConfig with Logging {

  def main(args: Array[String]) = {

    val sc = new SparkContext(new SparkConf().setAppName("GDELT Indexing"))

    if (args.isEmpty)
      throw new SparkException("usage: <gdeltInputDir>")

    val gdeltInputDir = args.head
    val gkgRDD = sc.textFile(gdeltInputDir)
      .map(GKGParser.toJsonGKGV2)
      .map(GKGParser.toCaseClass2)

    val urlRDD = gkgRDD.map(g => g.documentId.getOrElse("NA"))
      .filter(url => Try(new URL(url)).isSuccess)
      .distinct()
      .repartition(partitions)

    val contentRDD = urlRDD.mapPartitions({ it =>
      val html = new HtmlFetcher(gooseConnectionTimeout, gooseSocketTimeout)
      it map html.fetch
    })

    val corpusRDD = contentRDD.mapPartitions({ it =>
      val analyzer = new EnglishAnalyzer()
      it.map(content => (content, Tokenizer.lucene(content.body, analyzer)))
    }).filter({ case (content, corpus) =>
      corpus.length > minWords
    })

    //CREATE TABLE gzet.articles ( hash int PRIMARY KEY, url text, title text, body text );
    corpusRDD.mapValues(_.mkString(" ").simhash).map({ case (content, simhash) =>
      Article(simhash, content.body, content.title, content.url)
    }).saveToCassandra(cassandraKeyspace, cassandraTable)

  }

} 
Example 125
Source File: StoryBatchDedup.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.story

import io.gzet.story.model.{Content, Article}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import io.gzet.story.util.SimhashUtils._
import com.datastax.spark.connector._

object StoryBatchDedup extends SimpleConfig with Logging {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Story Extractor")
    val sc = new SparkContext(sparkConf)

    val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) =>
      ((id, Content(a.url, a.title, a.body)), a.hash)
    })
    Set(0)

    val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) =>
      searchmasks.map({ mask =>
        (simhash ^ mask, id)
      })
    }).groupByKey()

    val edgeRDD = duplicateTupleRDD.values.flatMap({ it =>
      val list = it.toList
      for (x <- list; y <- list) yield (x, y)
    }).filter({ case (x, y) =>
      x != y
    }).distinct().map({case (x, y) =>
      Edge(x, y, 0)
    })

    val duplicateRDD = Graph.fromEdges(edgeRDD, 0L)
      .connectedComponents()
      .vertices
      .join(simhashRDD.keys)
      .values

    duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) =>
      println(story + "\t" + content.title)
    })

  }

} 
Example 126
Source File: TwitterStream.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.timeseries

import com.google.gson.GsonBuilder
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, StreamingContext}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

import scala.util.Try

object TwitterStream extends SimpleConfig with Logging {

  def getTwitterStream(ssc: StreamingContext, filters: Seq[String] = Nil) = {
    val builder = new ConfigurationBuilder()
    builder.setOAuthConsumerKey(twitterApiKey)
    builder.setOAuthConsumerSecret(twitterApiSecret)
    builder.setOAuthAccessToken(twitterTokenKey)
    builder.setOAuthAccessTokenSecret(twitterTokenSecret)
    val configuration = builder.build()
    TwitterUtils.createStream(
      ssc,
      Some(new OAuthAuthorization(configuration)),
      filters,
      StorageLevel.MEMORY_ONLY
    )
  }

  def main(args: Array[String]) = {

    val sparkConf = new SparkConf().setAppName("Twitter Extractor")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Minutes(5))

    val twitterStream = getTwitterStream(ssc, args).mapPartitions({ it =>
      val gson = new GsonBuilder().create()
      it map { s =>
        Try(gson.toJson(s))
      }
    })

    twitterStream
      .filter(_.isSuccess)
      .map(_.get)
      .saveAsTextFiles("twitter")

    // Start streaming context
    ssc.start()
    ssc.awaitTermination()

  }

} 
Example 127
Source File: MetricImplicits.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.timeseries.timely

import java.io.PrintStream
import java.net.Socket
import java.nio.charset.StandardCharsets

import io.gzet.timeseries.SimpleConfig
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{Logging, Partitioner}

object MetricImplicits extends Logging with SimpleConfig {

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  class MetricPartitioner(partitions: Int) extends Partitioner {
    require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

    override def numPartitions: Int = partitions

    override def getPartition(key: Any): Int = {
      val k = key.asInstanceOf[MetricKey]
      nonNegativeMod(k.metricName.hashCode, partitions)
    }
  }

  implicit class Metrics(rdd: RDD[Metric]) {

    val partitions = rdd.partitions.length
    val partitioner = new MetricPartitioner(partitions)

    def publish() = {
      val sSortedMetricRDD = rdd filter { metric =>
        metric.tags.nonEmpty
      } map { metric =>
        (MetricKey(metric.name, metric.time), metric)
      } repartitionAndSortWithinPartitions partitioner

      sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] =>
        val sock = new Socket(timelyHost, timelyPort)
        val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name)
        it foreach { metric =>
          writer.println(metric.toPut)
        }
        writer.flush()
      }
    }
  }


  implicit class MetricStream(stream: DStream[Metric]) {
    def publish() = {
      stream foreachRDD {
        rdd => rdd.publish()
      }
    }
  }
}

case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) {
  def toPut = {
    val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]()
    val strTags = vizMap.union(tags.toList).map({ case (k, v) =>
      s"$k=$v"
    }).mkString(" ")
    s"put $name $time $value $strTags"
  }
}

case class MetricKey(metricName: String, metricTime: Long)

object MetricKey {
  implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = {
    Ordering.by(fk => (fk.metricName, fk.metricTime))
  }
} 
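With the implicit classes above in scope, any RDD[Metric] or DStream[Metric] gains a publish() method that writes Timely "put" lines over a socket. A minimal batch sketch (not part of the original source; it assumes timelyHost and timelyPort are supplied through SimpleConfig):

import io.gzet.timeseries.timely.Metric
import io.gzet.timeseries.timely.MetricImplicits._
import org.apache.spark.{SparkConf, SparkContext}

object PublishMetricsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("timely-publish"))
    try {
      val now = System.currentTimeMillis()
      val metrics = sc.parallelize(Seq(
        Metric("app.requests", now, 42.0, Map("host" -> "web-01")),
        Metric("app.requests", now, 17.0, Map("host" -> "web-02"))))
      // Metrics are repartitioned and sorted by name, then written as
      // "put <name> <time> <value> <tags>" lines to the configured Timely socket.
      metrics.publish()
    } finally {
      sc.stop()
    }
  }
}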
Example 128
Source File: GDBIndex.scala    From spark-gdb   with Apache License 2.0 5 votes vote down vote up
package com.esri.gdb

import java.io.{DataInput, File}
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.Logging

object GDBIndex {
  def apply(path: String, name: String, conf: Configuration = new Configuration()) = {
    val filename = StringBuilder.newBuilder.append(path).append(File.separator).append(name).append(".gdbtablx").toString()
    val hdfsPath = new Path(filename)
    val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath)

    val bytes = new Array[Byte](16)
    dataInput.readFully(bytes)
    val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

    val signature = byteBuffer.getInt
    val n1024Blocks = byteBuffer.getInt
    val numRows = byteBuffer.getInt
    val indexSize = byteBuffer.getInt

    new GDBIndex(dataInput, numRows, indexSize)
  }
}

private[gdb] class GDBIndex(dataInput: FSDataInputStream,
                            val numRows: Int,
                            indexSize: Int
                           ) extends Logging with AutoCloseable with Serializable {

  def readSeekForRowNum(rowNum: Int) = {
    val bytes = new Array[Byte](indexSize)
    dataInput.seek(16 + rowNum * indexSize)
    dataInput.readFully(bytes)
    ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt
  }

  def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = {
    dataInput.seek(16 + startAtRow * indexSize)
    val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead
    // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows")
    new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable)
  }

  def close() {
    dataInput.close()
  }
}

private[gdb] class GDBIndexIterator(dataInput: DataInput,
                                    startID: Int,
                                    maxRows: Int,
                                    indexSize: Int
                                   ) extends Iterator[IndexInfo] with Logging with Serializable {

  private val indexInfo = IndexInfo(0, 0)
  private val bytes = new Array[Byte](indexSize)
  private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)

  private var objectID = startID
  private var nextRow = 0

  def hasNext() = nextRow < maxRows

  def next() = {
    // log.info(s"next::nextRow=$nextRow maxRows=$maxRows")
    nextRow += 1

    objectID += 1
    indexInfo.objectID = objectID

    byteBuffer.clear
    dataInput.readFully(bytes)
    indexInfo.seek = byteBuffer.getInt

    indexInfo
  }
} 
Example 129
Source File: GDBRDD.scala    From spark-gdb   with Apache License 2.0 5 votes vote down vote up
package com.esri.gdb

import org.apache.hadoop.conf.Configuration
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Logging, Partition, SparkContext, TaskContext}


case class GDBRDD(@transient sc: SparkContext, gdbPath: String, gdbName: String, numPartitions: Int) extends RDD[Row](sc, Nil) with Logging {

  @DeveloperApi
  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
    val part = partition.asInstanceOf[GDBPartition]
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    val index = GDBIndex(gdbPath, part.hexName, hadoopConf)
    val table = GDBTable(gdbPath, part.hexName, hadoopConf)
    context.addTaskCompletionListener(context => {
      table.close()
      index.close()
    })
    table.rowIterator(index, part.startAtRow, part.numRowsToRead)
  }

  override protected def getPartitions: Array[Partition] = {
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    GDBTable.findTable(gdbPath, gdbName, hadoopConf) match {
      case Some(catTab) => {
        val index = GDBIndex(gdbPath, catTab.hexName, hadoopConf)
        try {
          val numRows = index.numRows
          val numRowsPerPartition = (numRows.toDouble / numPartitions).ceil.toInt
          var startAtRow = 0
          (0 until numPartitions).map(i => {
            val endAtRow = startAtRow + numRowsPerPartition
            val numRowsToRead = if (endAtRow <= numRows) numRowsPerPartition else numRows - startAtRow
            val gdbPartition = GDBPartition(i, catTab.hexName, startAtRow, numRowsToRead)
            startAtRow += numRowsToRead
            gdbPartition
          }).toArray
        } finally {
          index.close()
        }
      }
      case _ => {
        log.error(s"Cannot find '$gdbName' in $gdbPath; returning an empty array of partitions!")
        Array.empty[Partition]
      }
    }
  }
}

private[this] case class GDBPartition(m_index: Int,
                                      val hexName: String,
                                      val startAtRow: Int,
                                      val numRowsToRead: Int
                                     ) extends Partition {
  override def index = m_index
} 
Example 130
Source File: GDBRelation.scala    From spark-gdb   with Apache License 2.0 5 votes vote down vote up
package com.esri.gdb

import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}


case class GDBRelation(gdbPath: String, gdbName: String, numPartition: Int)
                      (@transient val sqlContext: SQLContext)
  extends BaseRelation with Logging with TableScan {

  override val schema = inferSchema()

  private def inferSchema() = {
    val sc = sqlContext.sparkContext
    GDBTable.findTable(gdbPath, gdbName, sc.hadoopConfiguration) match {
      case Some(catTab) => {
        val table = GDBTable(gdbPath, catTab.hexName, sc.hadoopConfiguration)
        try {
          table.schema()
        } finally {
          table.close()
        }
      }
      case _ => {
        log.error(s"Cannot find '$gdbName' in $gdbPath; returning an empty schema!")
        StructType(Seq.empty[StructField])
      }
    }
  }

  override def buildScan(): RDD[Row] = {
    GDBRDD(sqlContext.sparkContext, gdbPath, gdbName, numPartition)
  }
} 
Example 131
Source File: Main.scala    From spark-gdb   with Apache License 2.0 5 votes vote down vote up
package com.esri.app

import com.esri.core.geometry.Polyline
import com.esri.udt.{PointType, PolylineType}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{Logging, SparkConf, SparkContext}



// The object declaration, argument handling and SparkContext creation were stripped
// from this listing; the wrapper below is a reconstructed sketch (how path and name
// are obtained, and the app name, are assumptions) so the remaining code reads as a
// complete program.
object Main extends App with Logging {
  val Array(path, name) = args
  val sc = new SparkContext(new SparkConf().setAppName("spark-gdb-main"))
  try {
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.format("com.esri.gdb")
      .option("path", path)
      .option("name", name)
      .option("numPartitions", "1")
      .load()
    df.printSchema()
    df.registerTempTable(name)
    sqlContext.udf.register("getX", (point: PointType) => point.x)
    sqlContext.udf.register("getY", (point: PointType) => point.y)
    sqlContext.udf.register("line", (point: PointType) => PolylineType({
      val polyline = new Polyline()
      polyline.startPath(point.x - 2, point.y - 2)
      polyline.lineTo(point.x + 2, point.y + 2)
      polyline
    }
    ))
    sqlContext.sql(s"select line(Shape),getX(Shape)-2 as x from $name")
      .write
      .mode(SaveMode.Overwrite)
      .format("json")
      .save(s"/tmp/$name.json")
  } finally {
    sc.stop()
  }

} 
Example 132
Source File: HBaseSource.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase.examples

import org.apache.spark.{SparkConf, Logging, SparkContext}
import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._

case class HBaseRecord(
    col0: String,
    col1: Boolean,
    col2: Double,
    col3: Float,
    col4: Int,
    col5: Long,
    col6: Short,
    col7: String,
    col8: Byte)

object HBaseRecord {
  def apply(i: Int): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i extra",
      i.toByte)
  }
}

object HBaseSource {
  val cat = s"""{
            |"table":{"namespace":"default", "name":"table1"},
            |"rowkey":"key",
            |"columns":{
              |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
              |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
              |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
              |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
              |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
              |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
              |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
            |}
          |}""".stripMargin

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseTest")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    import sqlContext._
    import sqlContext.implicits._


    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog->cat))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .load()
    }


    val data = (0 to 255).map { i =>
      HBaseRecord(i)
    }
    sc.parallelize(data).toDF.write.options(
      Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()
    val df = withCatalog(cat)
    df.show
    df.filter($"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" === "row005" || $"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" > "row250")
      .select($"col0", $"col1").show
    df.registerTempTable("table1")
    val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'")
    c.show()
  }
} 
Example 133
Source File: SHC.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility}
import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.sql.types.UTF8String
import org.apache.spark.{SparkContext, SparkConf, Logging}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
import scala.collection.JavaConverters._

class SHC  extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll  with Logging {
  implicit class StringToColumn(val sc: StringContext) {
    def $(args: Any*): ColumnName = {
      new ColumnName(sc.s(args: _*))
    }
  }


  private[spark] var htu = HBaseTestingUtility.createLocalHTU()
  private[spark] def tableName = "table1"

  private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"}
  var table: Table = null
  val conf = new SparkConf
  conf.set(SparkHBaseConf.testConf, "true")
  SparkHBaseConf.conf = htu.getConfiguration
  // private[spark] var columnFamilyStr = Bytes.toString(columnFamily)

  def catalog = s"""{
            |"table":{"namespace":"default", "name":"table1"},
            |"rowkey":"key",
            |"columns":{
              |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
              |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
              |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
              |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
              |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
              |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
              |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
            |}
          |}""".stripMargin

  override def beforeAll() {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    htu.cleanupTestDir
    htu.startMiniZKCluster
    htu.startMiniHBaseCluster(1, 4)
    logInfo(" - minicluster started")
    println(" - minicluster started")

  }

  override def afterAll() {
    try {
      table.close()
      println("shutdown")
      htu.deleteTable(TableName.valueOf(tableName))
      logInfo("shuting down minicluster")
      htu.shutdownMiniHBaseCluster
      htu.shutdownMiniZKCluster
      logInfo(" - minicluster shut down")
      htu.cleanupTestDir
    } catch {
      case _ => logError("teardown error")
    }
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      htu.deleteTable(TableName.valueOf(tName))
    } catch {
      case _ =>
        logInfo(" - no table " + name + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }


  def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) {
    try {
      htu.deleteTable(TableName.valueOf(name))
    } catch {
      case _ =>
        logInfo(" - no table " + Bytes.toString(name) + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(name), cfs)
  }
} 
Example 134
Source File: CounterEtlFunctions.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.counter.loader.core

import org.apache.s2graph.core.{S2Edge, S2Graph, GraphUtil}
import org.apache.s2graph.counter.loader.config.StreamingConfig
import org.apache.s2graph.counter.models.CounterModel
import org.apache.s2graph.spark.config.S2ConfigFactory
import org.apache.spark.Logging
import play.api.libs.json._
import scala.collection.mutable.{HashMap => MutableHashMap}

object CounterEtlFunctions extends Logging {
  lazy val filterOps = Seq("insert", "insertBulk", "update", "increment").map(op => GraphUtil.operations(op))
  lazy val preFetchSize = StreamingConfig.PROFILE_PREFETCH_SIZE
  lazy val config = S2ConfigFactory.config
  lazy val counterModel = new CounterModel(config)
  lazy val graph = new S2Graph(config)(scala.concurrent.ExecutionContext.Implicits.global)

  def logToEdge(line: String): Option[S2Edge] = {
    for {
      elem <- graph.toGraphElement(line) if elem.isInstanceOf[S2Edge]
      edge <- Some(elem.asInstanceOf[S2Edge]).filter { x =>
        filterOps.contains(x.op)
      }
    } yield {
      edge
    }
  }

  def parseEdgeFormat(line: String): Option[CounterEtlItem] = {
    
    for {
      elem <- graph.toGraphElement(line) if elem.isInstanceOf[S2Edge]
      edge <- Some(elem.asInstanceOf[S2Edge]).filter { x =>
        filterOps.contains(x.op)
      }
    } yield {
      val label = edge.innerLabel
      val labelName = label.label
      val tgtService = label.tgtColumn.service.serviceName
      val tgtId = edge.tgtVertex.innerId.toString()
      val srcId = edge.srcVertex.innerId.toString()

      // make empty property if no exist edge property
      val dimension = Json.parse(Some(GraphUtil.split(line)).filter(_.length >= 7).map(_(6)).getOrElse("{}"))
      val bucketKeys = Seq("_from")
      val bucketKeyValues = {
        for {
          variable <- bucketKeys
        } yield {
          val jsValue = variable match {
            case "_from" => JsString(srcId)
            case s => (dimension \ s).get
          }
          s"[[$variable]]" -> jsValue
        }
      }
      val property = Json.toJson(bucketKeyValues :+ ("value" -> JsString("1")) toMap)
//      val property = Json.toJson(Map("_from" -> srcId, "_to" -> tgtId, "value" -> "1"))

      CounterEtlItem(edge.ts, tgtService, labelName, tgtId, dimension, property)
    }
  }

  def parseEdgeFormat(lines: List[String]): List[CounterEtlItem] = {
    for {
      line <- lines
      item <- parseEdgeFormat(line)
    } yield {
      item
    }
  }
  
  def checkPolicyAndMergeDimension(service: String, action: String, items: List[CounterEtlItem]): List[CounterEtlItem] = {
    counterModel.findByServiceAction(service, action).map { policy =>
      if (policy.useProfile) {
        policy.bucketImpId match {
          case Some(_) => DimensionProps.mergeDimension(policy, items)
          case None => Nil
        }
      } else {
        items
      }
    }.getOrElse(Nil)
  }
} 
Example 135
Source File: SubscriberListener.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.spark.spark

import org.apache.spark.Logging
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverError, StreamingListenerReceiverStarted, StreamingListenerReceiverStopped}

class SubscriberListener(ssc: StreamingContext) extends StreamingListener with Logging {
  override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
    logInfo("onReceiverError")
  }

  override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
    logInfo("onReceiverStarted")
  }

  override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = {
    logInfo("onReceiverStopped")
    ssc.stop()
  }
} 
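The listener is attached through StreamingContext.addStreamingListener, after which receiver start, error and stop events are logged and the context shuts down when a receiver stops. A short wiring sketch (illustrative; the app name and batch interval are arbitrary):

import org.apache.s2graph.spark.spark.SubscriberListener
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ListenerWiringExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("subscriber-listener-demo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Receiver lifecycle events are now logged, and the context stops
    // automatically when a receiver stops.
    ssc.addStreamingListener(new SubscriberListener(ssc))

    // ... define input streams and output operations here, then:
    // ssc.start()
    // ssc.awaitTermination()
  }
}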
Example 136
Source File: StreamHelper.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.kafka

import kafka.KafkaHelper
import kafka.common.TopicAndPartition
import kafka.consumer.PartitionTopicInfo
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.{Logging, SparkException}
import scala.reflect.ClassTag

case class StreamHelper(kafkaParams: Map[String, String]) extends Logging {
  // helper for kafka zookeeper
  lazy val kafkaHelper = KafkaHelper(kafkaParams)
  lazy val kc = new KafkaCluster(kafkaParams)

  // 1. get leader's earliest and latest offset
  // 2. get consumer offset
  // 3-1. if (2) is bounded in (1) use (2) for stream
  // 3-2. else use (1) by "auto.offset.reset"
  private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = {
    lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq)

    {
      for {
        topicPartitions <- kc.getPartitions(topics).right
        smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right
        largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right
      } yield {
        {
          for {
            tp <- topicPartitions
          } yield {
            val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset)
            val so = smallOffsets.get(tp).map(_.offset).get
            val lo = largeOffsets.get(tp).map(_.offset).get

            logWarning(s"$tp: $co $so $lo")

            if (co >= so && co <= lo) {
              (tp, co)
            } else {
              (tp, reset match {
                case Some("smallest") => so
                case _ => lo
              })
            }
          }
        }.toMap
      }
    }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok)
  }

  def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = {
    type R = (K, V)
    val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message())

    kafkaHelper.registerConsumerInZK(topics)

    new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler)
  }

  def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = {
    val offsetsMap = {
      for {
        range <- offsets.offsetRanges if range.fromOffset < range.untilOffset
      } yield {
        logDebug(range.toString())
        TopicAndPartition(range.topic, range.partition) -> range.untilOffset
      }
    }.toMap

    kafkaHelper.commitConsumerOffsets(offsetsMap)
  }

  def commitConsumerOffset(range: OffsetRange): Unit = {
    if (range.fromOffset < range.untilOffset) {
      try {
        val tp = TopicAndPartition(range.topic, range.partition)
        logDebug("Committed offset " + range.untilOffset + " for topic " + tp)
        kafkaHelper.commitConsumerOffset(tp, range.untilOffset)
      } catch {
        case t: Throwable =>
          // log it and let it go
          logWarning("exception during commitOffsets",  t)
          throw t
      }
    }
  }

  def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = {
    stream.foreachRDD { rdd =>
      commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges])
    }
  }
} 
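A typical use of this helper is to build a direct Kafka stream whose starting offsets come from the consumer's ZooKeeper state and to commit offsets back after each batch. A hedged sketch follows (the broker list, group id and topic are placeholders):

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.StreamHelper

object StreamHelperExample {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("kafka-direct-with-offsets"), Seconds(10))

    // Placeholder settings: adjust brokers, group id and topics for your cluster.
    val kafkaParams = Map(
      "metadata.broker.list" -> "localhost:9092",
      "group.id" -> "demo-consumer",
      "auto.offset.reset" -> "largest")
    val helper = StreamHelper(kafkaParams)

    val stream = helper.createStream[String, String, StringDecoder, StringDecoder](
      ssc, Set("events"))

    stream.foreachRDD { rdd =>
      // process the batch here ...
    }
    // Commit the consumed offset ranges back once each batch has been handled.
    helper.commitConsumerOffsets(stream)

    ssc.start()
    ssc.awaitTermination()
  }
}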
Example 137
Source File: KafkaRDDFunctions.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.kafka

import org.apache.spark.Logging
import org.apache.spark.rdd.RDD

import scala.language.implicitConversions
import scala.reflect.ClassTag

class KafkaRDDFunctions[T: ClassTag](self: RDD[T])
  extends Logging
  with Serializable
{
  def foreachPartitionWithOffsetRange(f: (OffsetRange, Iterator[T]) => Unit): Unit = {
    val offsets = self.asInstanceOf[HasOffsetRanges].offsetRanges
    foreachPartitionWithIndex { (i, part) =>
      val osr: OffsetRange = offsets(i)
      f(osr, part)
    }
  }

  def foreachPartitionWithIndex(f: (Int, Iterator[T]) => Unit): Unit = {
    self.mapPartitionsWithIndex[Nothing] { (i, part) =>
      f(i, part)
      Iterator.empty
    }.foreach {
      (_: Nothing) => ()
    }
  }
}

object KafkaRDDFunctions {
  implicit def rddToKafkaRDDFunctions[T: ClassTag](rdd: RDD[T]): KafkaRDDFunctions[T] = {
    new KafkaRDDFunctions(rdd)
  }
} 
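After importing the implicit conversion, any RDD produced by the direct Kafka API (and therefore implementing HasOffsetRanges) can be processed partition by partition together with its Kafka offsets. A small sketch (the stream is assumed to carry String key/value pairs):

import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaRDDFunctions.rddToKafkaRDDFunctions

object OffsetAwareProcessing {
  // Hypothetical helper: prints the Kafka offset range backing each partition of
  // every batch before handling the records in that partition.
  def dumpOffsets(stream: InputDStream[(String, String)]): Unit = {
    stream.foreachRDD { rdd =>
      rdd.foreachPartitionWithOffsetRange { (range, records) =>
        println(s"${range.topic}/${range.partition}: ${range.fromOffset} -> ${range.untilOffset}")
        records.foreach { case (_, value) =>
          // handle each message value here
        }
      }
    }
  }
}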
Example 138
Source File: RiakRelation.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.riak

import com.basho.riak.spark._
import scala.reflect._
import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector}
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import com.basho.riak.spark.util.TSConversionUtil
import com.basho.riak.spark.writer.WriteConf
import com.basho.riak.spark.writer.mapper.SqlDataMapper
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.collection.convert.decorateAsScala._
import com.basho.riak.spark.query.QueryBucketDef


object RiakRelation {
  def apply(bucket: String,
            sqlContext: SQLContext,
            schema: Option[StructType] = None,
            connector: Option[RiakConnector] = None,
            readConf: ReadConf,
            writeConf: WriteConf): RiakRelation = {

    new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)),
      readConf, writeConf, sqlContext, schema)
  }

  def apply(sqlContext: SQLContext, parameters: Map[String, String], schema: Option[StructType]): RiakRelation = {
    val existingConf = sqlContext.sparkContext.getConf
    val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None)
    val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters))
    val readConf = ReadConf(existingConf, parameters)
    val writeConf = WriteConf(existingConf, parameters)
    RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf)
  }
} 
Example 139
Source File: TSDataQueryingIterator.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.query

import com.basho.riak.client.core.query.timeseries.Row
import org.apache.spark.Logging
import com.basho.riak.client.core.query.timeseries.ColumnDescription

class TSDataQueryingIterator(query: QueryTS) extends Iterator[Row] with Logging {

  private var _iterator: Option[Iterator[Row]] = None
  private val subqueries = query.queryData.iterator
  private var columns: Option[Seq[ColumnDescription]] = None

  
  // The method declaration and its Scaladoc were stripped from this listing; the
  // header and emptiness check below are a reconstruction (the name columnDefs is
  // an assumption based on how `columns` is used in this class).
  def columnDefs: Seq[ColumnDescription] = {
    if (columns.isEmpty) {
      prefetch()
    }

    columns match {
      case None      => Seq()
      case Some(cds) => cds
    }
  }
  
  protected[this] def prefetch() = {
    while( subqueries.hasNext && !isPrefetchedDataAvailable) {
      val nextSubQuery = subqueries.next
      logTrace(s"Prefetching chunk of data: ts-query(token=$nextSubQuery)")

      val r = query.nextChunk(nextSubQuery)

      r match {
        case (cds, rows) =>
          if (isTraceEnabled()) {
            logTrace(s"ts-query($nextSubQuery) returns:\n  columns: ${r._1}\n  data:\n\t ${r._2}")
          } else {
            logDebug(s"ts-query($nextSubQuery) returns:\n  data.size: ${r._2.size}")
          }

          if (cds != null && cds.nonEmpty) {
            columns = Some(cds)
          } else if (columns.isEmpty) {
            // Initialize columns even when it is empty, so a Some(Seq()) can later serve as an
            // indicator that the column metadata has already been fetched.
            columns = Some(Seq())
          }

          _iterator = Some(rows.iterator)

        case _ => _iterator = None
          logWarning(s"ts-query(token=$nextSubQuery) returns: NOTHING")
      }
    }
  }

  private def isPrefetchedDataAvailable: Boolean =
    !(_iterator.isEmpty || (_iterator.isDefined && !_iterator.get.hasNext))

  override def hasNext: Boolean = {
    if (!isPrefetchedDataAvailable) {
        prefetch()
    }

    _iterator match {
      case Some(it) => it.hasNext
      case None     => false
    }
  }

  override def next(): Row = {
    if (!hasNext) {
      throw new NoSuchElementException("next on empty iterator")
    }
    _iterator.get.next
  }
}

object TSDataQueryingIterator {

  def apply[R](query: QueryTS): TSDataQueryingIterator = new TSDataQueryingIterator(query)
} 
Example 140
Source File: KVDataQueryingIterator.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.query

import com.basho.riak.client.core.query.{Location, RiakObject}
import org.apache.spark.Logging

class KVDataQueryingIterator[T](query: Query[T]) extends Iterator[(Location, RiakObject)] with Logging {

  type ResultT = (Location, RiakObject)

  private var isThereNextValue: Option[Boolean] = None
  private var nextToken: Option[T] = None

  private var _iterator: Option[Iterator[ResultT]] = None

  protected[this] def prefetch(): Boolean = {
    logTrace(s"Prefetching chunk of data: query(token=$nextToken)")

    val r = query.nextChunk(nextToken)

    if( isTraceEnabled() ) {
      logTrace(s"query(token=$nextToken) returns:\n  token: ${r._1}\n  data:\n\t ${r._2}")
    } else {
      logDebug(s"query(token=$nextToken) returns:\n  token: ${r._1}\n  data.size: ${r._2.size}")
    }

    nextToken = r._1

    r match {
      case (_, Nil) =>
        
        logDebug("prefetch returned Nothing, all data was already processed (empty chunk was returned)")
        _iterator = KVDataQueryingIterator.OPTION_EMPTY_ITERATOR

      case (_, data: Iterable[(Location,RiakObject)]) =>
        if(nextToken.isEmpty){
          logDebug("prefetch returned the last chunk, all data was processed")
        }

        _iterator = Some(data.iterator)
    }

    _iterator.get.hasNext
  }

  override def hasNext: Boolean = {
    isThereNextValue match {
      case Some(b: Boolean) =>
        // cached value will be returned

      case None if _iterator.isDefined && _iterator.get.hasNext =>
        logTrace(s"prefetch is not required, at least one pre-fetched value available")
        isThereNextValue = KVDataQueryingIterator.OPTION_TRUE

      case None if _iterator.isDefined && _iterator.get.isEmpty && nextToken.isEmpty =>
        logTrace("prefetch is not required, all data was already processed")
        isThereNextValue = KVDataQueryingIterator.OPTION_FALSE

      case None =>
        isThereNextValue = Some(prefetch())
    }

    isThereNextValue.get
  }

  override def next(): (Location, RiakObject) = {
    if( !hasNext ){
      throw new NoSuchElementException("next on iterator")
    }

    isThereNextValue = None
    _iterator.get.next()
  }
}

object  KVDataQueryingIterator {
  private val OPTION_EMPTY_ITERATOR = Some(Iterator.empty)
  private val OPTION_TRUE = Some(true)
  private val OPTION_FALSE = Some(false)

  def apply[T](query: Query[T]): KVDataQueryingIterator[T] = new KVDataQueryingIterator[T](query)
} 
Example 141
Source File: DataMapper.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.util

import com.basho.riak.client.api.convert.JSONConverter
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.Logging

trait DataMapper extends Serializable {
    DataMapper.ensureInitialized()
}

object DataMapper extends Logging {
  private var isInitialized = false

  def ensureInitialized(): Boolean = {
    if (!isInitialized) {
      // Register Scala module to serialize/deserialize Scala stuff smoothly
      JSONConverter.registerJacksonModule(DefaultScalaModule)
      logDebug("Jackson DefaultScalaModule has been registered")
      isInitialized = true
    } else {
      logTrace("Jackson DefaultScalaModule initialization was skipped since module has been registered.")
    }
    isInitialized
  }
} 
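Any component that relies on the Riak JSON converter for Scala case classes can simply mix the trait in; constructing the component triggers the one-time registration of Jackson's DefaultScalaModule. A small illustrative sketch (the writer class is hypothetical):

import com.basho.riak.spark.util.DataMapper

// Hypothetical component: mixing in DataMapper guarantees the Scala module is
// registered before any JSON (de)serialization happens.
case class EventWriter(bucket: String) extends DataMapper {
  def describe(): String = s"writing JSON events to bucket '$bucket'"
}

object DataMapperExample extends App {
  val writer = EventWriter("events")      // registration happens at construction
  println(writer.describe())
  println(DataMapper.ensureInitialized()) // idempotent; the module is registered only once
}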
Example 142
Source File: SparkStreamingFixture.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.streaming

import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.junit.{After, Before}

trait SparkStreamingFixture extends Logging {

  protected var sc: SparkContext

  protected var ssc: StreamingContext = _

  protected val batchDuration = Seconds(1)

  @Before
  def startStreamingContext(): Unit = {
    ssc = new StreamingContext(sc, batchDuration)
    logInfo("Streaming context created")
  }

  @After
  def stopStreamingContext(): Unit = {
    Option(ssc).foreach(_.stop())
    logInfo("Streaming context stopped")
  }
} 
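A test class supplies the abstract sc and mixes the fixture in; the fixture then creates a fresh StreamingContext before each test and stops it afterwards. An illustrative JUnit sketch (the SparkContext is built eagerly so it already exists when the fixture's @Before hook runs; the test itself is a placeholder):

import com.basho.riak.spark.streaming.SparkStreamingFixture
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.{After, Assert, Test}

class StreamingFixtureSmokeTest extends SparkStreamingFixture {
  // Implements the abstract `sc` required by the fixture.
  protected var sc: SparkContext =
    new SparkContext(new SparkConf().setMaster("local[2]").setAppName("fixture-smoke"))

  @After
  def stopSparkContext(): Unit = {
    Option(sc).foreach(_.stop())
  }

  @Test
  def streamingContextIsCreated(): Unit = {
    Assert.assertNotNull(ssc)
  }
}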
Example 143
Source File: SocketStreamingDataSource.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.streaming

import java.net.InetSocketAddress
import java.nio.channels.{AsynchronousCloseException, AsynchronousServerSocketChannel, AsynchronousSocketChannel, CompletionHandler}

import com.basho.riak.stub.SocketUtils
import org.apache.spark.Logging

class SocketStreamingDataSource extends Logging {

  private var serverChannel: AsynchronousServerSocketChannel = _
  private var clientChannel: AsynchronousSocketChannel = _

  def start(writeToSocket: AsynchronousSocketChannel => Unit): Int = {
    serverChannel = AsynchronousServerSocketChannel.open()
    require(serverChannel.isOpen)

    serverChannel.bind(new InetSocketAddress(0))
    serverChannel.accept(serverChannel, new CompletionHandler[AsynchronousSocketChannel, AsynchronousServerSocketChannel]() {
      override def completed(client: AsynchronousSocketChannel, server: AsynchronousServerSocketChannel): Unit = {
        logInfo(s"Incoming connection: ${SocketUtils.serverConnectionAsStr(client)}")
        clientChannel = client

        writeToSocket(client)

        client.isOpen match {
          case true =>
            val connectionString = SocketUtils.serverConnectionAsStr(client)
            client.shutdownInput()
            client.shutdownOutput()
            client.close()
            logInfo(s"Client $connectionString was gracefully disconnected")
          case false => // client is already closed - do nothing
        }
      }

      override def failed(exc: Throwable, serverChannel: AsynchronousServerSocketChannel): Unit = exc match {
        case _: AsynchronousCloseException =>
        case _ => logError(s"Something went wrong: ${serverChannel.toString}", exc)
      }
    })

    serverChannel.getLocalAddress.asInstanceOf[InetSocketAddress].getPort
  }

  def stop(): Unit = {
    Option(clientChannel).foreach(_.close())
    Option(serverChannel).foreach(_.close())
  }
} 
Example 144
Source File: AbstractFailoverOfflineTest.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.rdd.failover

import com.basho.riak.client.core.query.Namespace
import com.basho.riak.client.core.util.HostAndPort
import com.basho.riak.stub.{RiakMessageHandler, RiakNodeStub}
import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.hamcrest.{Description, Matchers}
import org.junit.internal.matchers.ThrowableCauseMatcher
import org.junit.{After, Before}

import scala.collection.JavaConversions._

abstract class AbstractFailoverOfflineTest extends Logging {

  protected final val NAMESPACE = new Namespace("default", "test-bucket")
  protected final val COVERAGE_ENTRIES_COUNT = 64

  protected var sc: SparkContext = _
  protected var riakNodes: Seq[(HostAndPort, RiakNodeStub)] = _ // tuple HostAndPort -> stub

  val riakHosts: Int = 1

  val riakMessageHandler: Option[RiakMessageHandler] = None

  def sparkConf: SparkConf = new SparkConf(false)
    .setMaster("local")
    .setAppName(getClass.getSimpleName)
    .set("spark.riak.connection.host", riakNodes.map{case (hp, _) => s"${hp.getHost}:${hp.getPort}"}.mkString(","))
    .set("spark.riak.output.wquorum", "1")
    .set("spark.riak.input.fetch-size", "2")

  def initRiakNodes(): Seq[(HostAndPort, RiakNodeStub)] = {
    require(riakMessageHandler.isDefined)

    // start riak stubs on localhost and free random port
    (1 to riakHosts).map { _ =>
      val riakNode = RiakNodeStub(riakMessageHandler.get)
      riakNode.start() -> riakNode
    }
  }

  @Before
  def setUp(): Unit = {
    riakNodes = initRiakNodes()
    sc = new SparkContext(sparkConf)
  }

  @After
  def tearDown(): Unit = {
    Option(riakNodes).foreach(_.foreach(n => n._2.stop()))
    Option(sc).foreach(_.stop())
  }

  def distributeEvenly(size: Int, splitCount: Int): Seq[Int] = {
    val (base, rem) = (size / splitCount, size % splitCount)
    (0 until splitCount).map(i => if (i < rem) base + 1 else base)
  }
}

class RootCauseMatcher[T <: Throwable](val excClass: Class[T]) extends ThrowableCauseMatcher[T](Matchers.isA(excClass)) {

  private def getOneBeforeRootCause(item: T): Throwable = {
    val throwables = ExceptionUtils.getThrowableList(item)
    if (throwables.length > 1) {
      throwables.reverse.tail.head
    } else {
      throwables.head
    }
  }

  override def matchesSafely(item: T): Boolean = super.matchesSafely(getOneBeforeRootCause(item).asInstanceOf[T])

  override def describeMismatchSafely(item: T, description: Description): Unit =
    super.describeMismatchSafely(getOneBeforeRootCause(item).asInstanceOf[T], description)
} 
Example 145
Source File: BisectingKMeansModel.scala    From bisecting-kmeans   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD


  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach {x =>
      val row = new java.util.ArrayList[java.lang.Double]()
      row.add(x._1.toDouble)
      row.add(x._2.toDouble)
      row.add(x._3.toDouble)
      row.add(x._4.toDouble)
      javaList.add(row)
    }
    javaList
  }
} 
Example 146
Source File: SparkCassSSTableLoaderClientManager.scala    From Spark2Cassandra   with Apache License 2.0 5 votes vote down vote up
package com.github.jparkie.spark.cassandra.client

import java.net.InetAddress

import com.datastax.spark.connector.cql.{ AuthConf, CassandraConnector }
import com.github.jparkie.spark.cassandra.conf.SparkCassServerConf
import org.apache.spark.Logging

import scala.collection.mutable

private[cassandra] trait SparkCassSSTableLoaderClientManager extends Serializable with Logging {
  case class SessionKey(
    hosts:               Set[InetAddress],
    port:                Int,
    authConf:            AuthConf,
    sparkCassServerConf: SparkCassServerConf
  ) extends Serializable

  @transient
  private[client] val internalClients = mutable.HashMap.empty[SessionKey, SparkCassSSTableLoaderClient]

  private[client] def buildSessionKey(
    cassandraConnector:  CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SessionKey = {
    SessionKey(cassandraConnector.hosts, cassandraConnector.port, cassandraConnector.authConf, sparkCassServerConf)
  }

  private[client] def buildClient(
    cassandraConnector:  CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SparkCassSSTableLoaderClient = {
    val newSession = cassandraConnector.openSession()

    logInfo(s"Created SSTableLoaderClient to the following Cassandra nodes: ${cassandraConnector.hosts}")

    val sparkCassSSTableLoaderClient = new SparkCassSSTableLoaderClient(newSession, sparkCassServerConf)

    sys.addShutdownHook {
      logInfo("Closed Cassandra Session for SSTableLoaderClient.")

      sparkCassSSTableLoaderClient.stop()
    }

    sparkCassSSTableLoaderClient
  }

  
  private[cassandra] def evictAll(): Unit = synchronized {
    internalClients.values.foreach(_.stop())
    internalClients.clear()
  }
}

object SparkCassSSTableLoaderClientManager extends SparkCassSSTableLoaderClientManager 
Example 147
Source File: HBasePartition.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.regionserver.RegionScanner
import org.apache.spark.{Logging, Partition}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._
import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range}


private[hbase] class HBasePartition(
                                     val idx: Int, val mappedIndex: Int,
                                     start: Option[HBaseRawType] = None,
                                     end: Option[HBaseRawType] = None,
                                     val server: Option[String] = None,
                                     val filterPredicates: Option[Expression] = None,
                                     @transient relation: HBaseRelation = null,
                                     @transient val newScanner:RegionScanner = null)
  extends Range[HBaseRawType](start, true, end, false, HBaseBytesType)
  with Partition with IndexMappable with Logging {

  override def index: Int = idx

  override def hashCode(): Int = idx

  @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start)

  @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end)

  
  def computePredicate(relation: HBaseRelation): Option[Expression] = {
    val predicate = if (filterPredicates.isDefined &&
      filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) {
      val oriPredicate = filterPredicates.get
      val predicateReferences = oriPredicate.references.toSeq
      val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences)
      val row = new GenericMutableRow(predicateReferences.size)
      var rowIndex = 0
      var i = 0
      var range: Range[_] = null
      while (i < relation.keyColumns.size) {
        range = relation.generateRange(this, oriPredicate, i)
        if (range != null) {
          rowIndex = relation.rowIndex(predicateReferences, i)
          if (rowIndex >= 0) row.update(rowIndex, range)
          // if the non-last dimension range is not point, do not proceed to the next dims
          if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size
          else i = i + 1
        } else i = relation.keyColumns.size
      }
      val pr = boundReference.partialReduce(row, predicateReferences)
      pr match {
        case (null, e: Expression) => Some(e)
        case (true, _) => None
        case (false, _) => Some(Literal(false))
      }
    } else filterPredicates
    logInfo(predicate.toString)
    predicate
  }

  override def toString = {
    s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates"
  }
} 
Example 148
Source File: BytesUtilsSuite.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.spark.Logging
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.hbase.util.BytesUtils
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging {
  test("Bytes Ordering Test") {
    val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1,
      0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257)
    val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i)))
      .sortWith((f, s) =>
      HBaseBytesType.ordering.gt(
        f._2.asInstanceOf[HBaseBytesType.InternalType],
        s._2.asInstanceOf[HBaseBytesType.InternalType]))
    assert(result.map(a => a._1) == s.sorted.reverse)
  }

  def compare(a: Array[Byte], b: Array[Byte]): Int = {
    val length = Math.min(a.length, b.length)
    var result: Int = 0
    for (i <- 0 to length - 1) {
      val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte]
      if (diff != 0) {
        result = diff
      }
    }
    result
  }

  test("Bytes Utility Test") {
    assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType)
      .toBytes(input = true), 0) === true)
    assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType)
      .toBytes(input = false), 0) === false)

    assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0)
      === 12.34d)
    assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0)
      === -12.34d)

    assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0)
      === 12.34f)
    assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0)
      === -12.34f)

    assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0)
      === 12)
    assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0)
      === -12)

    assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0)
      === 1234l)
    assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0)
      === -1234l)

    assert(BytesUtils.toShort(BytesUtils.create(ShortType)
      .toBytes(12.asInstanceOf[Short]), 0) === 12)
    assert(BytesUtils.toShort(BytesUtils.create(ShortType)
      .toBytes(-12.asInstanceOf[Short]), 0) === -12)

    assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes("abc"), 0, 3)
      === UTF8String("abc"))
    assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String(""))

    assert(BytesUtils.toByte(BytesUtils.create(ByteType)
      .toBytes(5.asInstanceOf[Byte]), 0) === 5)
    assert(BytesUtils.toByte(BytesUtils.create(ByteType)
      .toBytes(-5.asInstanceOf[Byte]), 0) === -5)

    assert(compare(BytesUtils.create(IntegerType).toBytes(128),
      BytesUtils.create(IntegerType).toBytes(-128)) > 0)
  }

  test("byte array plus one") {
    var byteArray =  Array[Byte](0x01.toByte, 127.toByte)
    assert(Bytes.compareTo(BytesUtils.addOne(byteArray),  Array[Byte](0x01.toByte, 0x80.toByte)) == 0)

    byteArray =  Array[Byte](0xff.toByte, 0xff.toByte)
    assert(BytesUtils.addOne(byteArray) == null)

    byteArray =  Array[Byte](0x02.toByte, 0xff.toByte)
    assert(Bytes.compareTo(BytesUtils.addOne(byteArray),  Array[Byte](0x03.toByte, 0x00.toByte)) == 0)
  }

  test("float comparison") {
    val f1 = BytesUtils.create(FloatType).toBytes(-1.23f)
    val f2 = BytesUtils.create(FloatType).toBytes(100f)
    assert(Bytes.compareTo(f1, f2) < 0)
  }
} 
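The suite above treats BytesUtils as a symmetric encoder/decoder keyed by Spark SQL data types. A minimal round-trip sketch, assuming only the BytesUtils calls exercised in the tests (create(dataType).toBytes plus the typed readers taking a byte offset):

import org.apache.spark.sql.hbase.util.BytesUtils
import org.apache.spark.sql.types.{IntegerType, StringType}

object BytesUtilsRoundTrip {
  def main(args: Array[String]): Unit = {
    // Encode an Int and decode it back from offset 0.
    val intBytes = BytesUtils.create(IntegerType).toBytes(42)
    println(BytesUtils.toInt(intBytes, 0))            // 42

    // Strings additionally need the encoded length when decoding.
    val strBytes = BytesUtils.create(StringType).toBytes("abc")
    println(BytesUtils.toUTF8String(strBytes, 0, 3))  // abc
  }
}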
Example 149
Source File: WebSocketReceiver.scala    From spark-streaming-demo   with Apache License 2.0 5 votes vote down vote up
package com.datastax.examples.meetup.websocket

import com.datastax.examples.meetup.model._
import org.apache.spark.storage.StorageLevel
import scalawebsocket.WebSocket
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.Logging

import org.json4s._
import org.json4s.jackson.JsonMethods._

class WebSocketReceiver(url: String, storageLevel: StorageLevel)
    extends Receiver[MeetupRsvp](storageLevel) with Logging
{
  @volatile private var webSocket: WebSocket = _

  def onStart() {
    try{
      logInfo("Connecting to WebSocket: " + url)
      val newWebSocket = WebSocket().open(url).onTextMessage({ msg: String => parseJson(msg) })
      setWebSocket(newWebSocket)
      logInfo("Connected to WebSocket: " + url)
    } catch {
      case e: Exception => restart("Error starting WebSocket stream", e)
    }
  }

  def onStop() {
    setWebSocket(null)
    logInfo("WebSocket receiver stopped")
  }

  private def setWebSocket(newWebSocket: WebSocket) = synchronized {
    if (webSocket != null) {
      webSocket.shutdown()
    }
    webSocket = newWebSocket
  }

  private def parseJson(jsonStr: String): Unit =
  {
    implicit lazy val formats = DefaultFormats

    try {
      val json = parse(jsonStr)
      val rsvp = json.extract[MeetupRsvp]
      store(rsvp)
    } catch {
      case e: MappingException => logError("Unable to map JSON message to MeetupRsvp object: " + e.msg)
      case e: Exception => logError("Unable to map JSON message to MeetupRsvp object", e)
    }

  }
} 
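A custom receiver like this is normally attached to a job through StreamingContext.receiverStream. A minimal wiring sketch; the application name, batch interval, and WebSocket URL are placeholders:

import com.datastax.examples.meetup.websocket.WebSocketReceiver
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WebSocketReceiverApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("meetup-rsvps").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Each batch of the resulting DStream holds the MeetupRsvp records parsed by the receiver.
    val rsvps = ssc.receiverStream(
      new WebSocketReceiver("ws://stream.meetup.com/2/rsvps", StorageLevel.MEMORY_ONLY_SER))
    rsvps.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}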
Example 150
Source File: GraphLoader.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}


object GraphLoader extends Logging {

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            // Skip malformed lines rather than failing on a missing destination id
            logWarning("Invalid line: " + line)
          } else {
            val srcId = lineArray(0).toLong
            val dstId = lineArray(1).toLong
            if (canonicalOrientation && srcId > dstId) {
              builder.add(dstId, srcId, 1)
            } else {
              builder.add(srcId, dstId, 1)
            }
          }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile

} 
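For reference, a short driver that loads a whitespace-separated edge list with the method above; the master URL and input path are placeholders:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphLoader

object EdgeListExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("edge-list").setMaster("local[*]"))

    // Each line is "srcId dstId"; lines starting with '#' are treated as comments.
    val graph = GraphLoader.edgeListFile(sc, "data/edges.txt",
      canonicalOrientation = true, numEdgePartitions = 4)
    println(s"vertices=${graph.numVertices} edges=${graph.numEdges}")

    sc.stop()
  }
}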
Example 151
Source File: GraphLoaderPlus.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}


object GraphLoaderPlus extends Logging {

  def edgeListFile(
                    sc: SparkContext,
                    path: String,
                    canonicalOrientation: Boolean = false,
                    numEdgePartitions: Int = -1,
                    edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
                    vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          if (lineArray.length == 2) {
            val srcId = lineArray(0).toLong
            val dstId = lineArray(1).toLong
            if (canonicalOrientation && srcId > dstId) {
              builder.add(dstId, srcId, 1)
            } else {
              builder.add(srcId, dstId, 1)
            }
          }
          else {
            val srcId = lineArray(0).toLong
            val dstId = lineArray(1).toLong
            val weight = lineArray(2).toInt
            if (canonicalOrientation && srcId > dstId) {
              builder.add(dstId, srcId, weight)
            } else {
              builder.add(srcId, dstId, weight)
            }
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoaderPlus.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile

} 
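GraphLoaderPlus accepts an optional third column as the edge weight, falling back to 1 for two-column lines. A usage sketch, assuming the object declaration mirrors GraphLoader above; the paths are placeholders:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphLoaderPlus

object WeightedEdgeListExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("weighted-edges").setMaster("local[*]"))

    // Each line is "srcId dstId [weight]"; a missing weight defaults to 1.
    val graph = GraphLoaderPlus.edgeListFile(sc, "data/weighted-edges.txt")
    println(s"total edge weight = ${graph.edges.map(_.attr.toLong).reduce(_ + _)}")

    sc.stop()
  }
}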
Example 152
Source File: SparkFunSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark

// scalastyle:off
import org.apache.log4j.{Level, Logger}
import org.scalatest.{FunSuite, Outcome}

import org.apache.spark.Logging


abstract class SparkFunSuite extends FunSuite with Logging {

  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)

      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }

} 
Example 153
Source File: ExtendedPlanner.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.extension

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.{SparkPlan, SparkPlanner}


  def planLaterExt(p: LogicalPlan): SparkPlan = planLater(p)

  def optimizedPlan(p: LogicalPlan): LogicalPlan = sqlContext.executePlan(p).optimizedPlan

  def optimizedRelationLookup(u: UnresolvedRelation): Option[LogicalPlan] = {
    if (sqlContext.catalog.tableExists(u.tableIdentifier)) {
      Some(optimizedPlan(u))
    } else {
      None
    }
  }

  // TODO (AC) Remove this once table-valued function are rebased on top.
  def analyze(p: LogicalPlan): LogicalPlan = sqlContext.analyzer.execute(p)

  override def plan(p: LogicalPlan): Iterator[SparkPlan] = {
    val iter = strategies.view.flatMap({ strategy =>
      val plans = strategy(p)
      if (plans.isEmpty) {
        logTrace(s"Strategy $strategy did not produce plans for $p")
      } else {
        logDebug(s"Strategy $strategy produced a plan for $p: ${plans.head}")
      }
      plans
    }).toIterator
    assert(iter.hasNext, s"No plan for $p")
    iter
  }

} 
Example 154
Source File: DropRunnableCommand.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.Logging
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.DropRelation
import org.apache.spark.sql.{Row, SQLContext}

import scala.util.Try


private[sql] case class DropRunnableCommand(toDrop: Map[String, Option[DropRelation]])
  extends RunnableCommand
  with Logging {

  override def run(sqlContext: SQLContext): Seq[Row] = {
    toDrop.foreach {
      case (name, dropOption) =>
        sqlContext.dropTempTable(name)
        dropOption.foreach { dropRelation =>
          Try {
            dropRelation.dropTable()
          }.recover {
            // When the provider indicates an exception while dropping, we still have to continue
            // dropping all the referencing tables, otherwise there could be integrity issues
            case ex =>
              logWarning(
                s"""Error occurred when dropping table '$name': ${ex.getMessage}; however,
                   |table '$name' will still be dropped from the Spark catalog.
                 """.stripMargin)
          }.get
        }
    }
    Seq.empty
  }
} 
Example 155
Source File: SQLRunner.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package com.sap.spark.cli

import java.io._

import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{Logging, SparkContext}

import scala.annotation.tailrec

protected[cli] case class CLIOptions(
    sqlFiles: List[String] = Nil, output: Option[String] = None)


  def main(args: Array[String]): Unit = {
    def fail(msg: String = USAGE): Unit = {
      logError(msg)
      System.exit(1)
    }

    val opts = parseOpts(args.toList)

    val outputStream: OutputStream = opts.output match {
      case Some(filename) => new FileOutputStream(new File(filename))
      case None => System.out
    }

    opts.sqlFiles
      .map((string: String) => new FileInputStream(new File(string)))
      .foreach(sql(_, outputStream))
  }
} 
Example 156
Source File: NodeTests.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hierarchy

import org.apache.spark.Logging
import org.apache.spark.sql.types.{Node, NodeHelpers, StringType}

import scala.collection.mutable.ArrayBuffer

// scalastyle:off magic.number
// scalastyle:off file.size.limit
class NodeTests extends NodeUnitTestSpec with Logging {
  var nodes = ArrayBuffer[Node]()
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 2L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 3L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 2L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 3L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 1L))
  nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 2L))
  log.info("Running unit tests for sorting class Node\n")
  nodes.toArray should equal {
    // deterministic generator:
    val myRand = new scala.util.Random(42)

    // take copy of array-buffer, shuffle it
    val shuffled_nodes = myRand.shuffle(nodes.toSeq)

    // shuffled?:
    shuffled_nodes should not equal nodes.toArray

    shuffled_nodes.sorted(NodeHelpers.OrderedNode)
  }
  log.info("Testing function compareToRecursive\n")
  val x = Node(null, null)

  0 should equal {x.compareToRecursive(Seq(), Seq())}
  0 should be > {x.compareToRecursive(Seq(), Seq(1))}
  0 should be < {x.compareToRecursive(Seq(1), Seq())}
  0 should equal {x.compareToRecursive(Seq(1,2), Seq(1,2))}
  0 should be < {x.compareToRecursive(Seq(1,2), Seq(1))}
  0 should be > {x.compareToRecursive(Seq(1), Seq(1,2))}

}
// scalastyle:on magic.number
// scalastyle:on file.size.limit 
Example 157
Source File: HierarchyJoinBuilderUnitTests.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hierarchy

import org.apache.spark.sql.types.{IntegerType, Node}
import org.apache.spark.Logging
import org.apache.spark.sql.Row

// scalastyle:off magic.number
class HierarchyJoinBuilderUnitTests extends NodeUnitTestSpec with Logging {
  var jb = new HierarchyJoinBuilder[Row, Row, Long](null, null, null, null, null, null)

  log.info("Testing function 'extractNodeFromRow'\n")

  val x = Node(List(1,2,3), IntegerType, List(1L,1L,2L))
  Some(x) should equal {
    jb.extractNodeFromRow(Row.fromSeq(Seq(1,2,3, x)))
  }

  None should equal {
    jb.extractNodeFromRow(Row.fromSeq(Seq(1,2,3)))
  }

  None should equal {
    jb.extractNodeFromRow(Row.fromSeq(Seq()))
  }

  log.info("Testing function 'getOrd'\n")
   None should equal {
     jb.getOrd(Row.fromSeq(Seq(1,2,3)))
   }
  val testValues = List((42L, Some(42L)), (13, Some(13L)), ("hello", None), (1234.56, None))
  testValues.foreach { testVal =>
    val jbWithOrd = new HierarchyJoinBuilder[Row, Row, Long](null, null, null, null,
      x => testVal._1, null)
    testVal._2 should equal {
      jbWithOrd.getOrd(Row.fromSeq(Seq(x)))
    }
  }
}
// scalastyle:on magic.number 
Example 158
Source File: ExtractSQLParserSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import com.sap.spark.PlanTest
import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.parser.SapParserDialect
import org.scalatest.FunSuite

class ExtractSQLParserSuite extends FunSuite with PlanTest with Logging {

  // scalastyle:off magic.number

  val t1 = UnresolvedRelation(TableIdentifier("T1"))
  val parser = new SapParserDialect

  test("Parse EXTRACT in SELECT") {
    val result = parser.parse("SELECT a, EXTRACT(YEAR FROM a) FROM T1")
    val expected = t1.select(AliasUnresolver('a, Year('a)): _*)
    comparePlans(expected, result)
  }

  test("Parse EXTRACT in WHERE") {
    val result = parser.parse("SELECT 1 FROM T1 WHERE EXTRACT(MONTH FROM a) = 2015")
    val expected = t1.where(Month('a) === 2015).select(AliasUnresolver(1): _*)
    comparePlans(expected, result)
  }

  test("Parse EXTRACT in GROUP BY") {
    val result = parser.parse("SELECT 1 FROM T1 GROUP BY EXTRACT(DAY FROM a)")
    val expected = t1.groupBy(DayOfMonth('a))(AliasUnresolver(1): _*)
    comparePlans(expected, result)
  }

} 
Example 159
Source File: MathsSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.math3.util.FastMath
import org.apache.spark.Logging
import org.apache.spark.sql.{Row, GlobalSapSQLContext}
import org.scalatest.FunSuite

class MathsSuite
  extends FunSuite
  with GlobalSapSQLContext
  with Logging {

  // scalastyle:off magic.number

  val rowA = DoubleRow("AAA", 1.0)
  val rowB = DoubleRow("BBB", 2.0)
  val rowC = DoubleRow("CCC", 0.6)
  val rowD = DoubleRow("DDD", -1.1)
  val rowE = DoubleRow("DDD", -1.1)

  val data = Seq(rowA, rowB)

  test("ln, log, pow") {
    val rdd = sc.parallelize(data)
    val dSrc = sqlContext.createDataFrame(rdd).cache()
    dSrc.registerTempTable("src")

    val result1 = sqlContext.sql("SELECT name,d,LN(d) FROM src").collect()

    assertResult(Row(rowA.name, rowA.d, 0.0) ::
      Row(rowB.name, rowB.d, scala.math.log(2.0)) :: Nil)(result1)

    val result2 = sqlContext.sql("SELECT name,d,LOG(10, d) FROM src").collect()

    assertResult(Row(rowA.name, rowA.d, 0.0) ::
      Row(rowB.name, rowB.d, scala.math.log(rowB.d) / scala.math.log(10)) :: Nil)(result2)

    val result3 = sqlContext.sql("SELECT name,d,POWER(d,2) FROM src").collect()

    assertResult(Row(rowA.name, rowA.d, 1.0) ::
      Row(rowB.name, rowB.d, 4.0) :: Nil)(result3)
  }

  val data2 = Seq(rowC, rowD)

  test("ceil, floor, round, sign, mod") {
    val rdd = sc.parallelize(data2)
    val dSrc = sqlContext.createDataFrame(rdd).cache()
    dSrc.registerTempTable("src")

    val result1 = sqlContext.sql("SELECT name, d, CEIL(d), FLOOR(d)," +
      "ROUND(d,0), SIGN(d), MOD(d,3) FROM src").collect()

    assertResult(Row(rowC.name, rowC.d, 1.0, 0.0, 1.0, 1.0, 0.6) ::
      Row(rowD.name, rowD.d, -1.0, -2.0, -1.0, -1.0, -1.1) :: Nil)(result1)
  }

  test("cos, SIN, TAN, ACOS, ASIN, ATAN") {
    val rdd = sc.parallelize(data2)
    val dSrc = sqlContext.createDataFrame(rdd).cache()
    dSrc.registerTempTable("src")

    val result1 = sqlContext.sql("SELECT name, d, COS(d), SIN(d), TAN(d)," +
      " ACOS(COS(d)), ASIN(SIN(d)), ATAN(TAN(d)) FROM src").collect()

    assertResult(Row(rowC.name, rowC.d, FastMath.cos(rowC.d),
      FastMath.sin(rowC.d), FastMath.tan(rowC.d), 0.6, 0.6, 0.6) ::
      Row(rowD.name, rowD.d, FastMath.cos(rowD.d), FastMath.sin(rowD.d),
        FastMath.tan(rowD.d), 1.1, -1.1, -1.1) :: Nil)(result1)
  }
} 
Example 160
Source File: StringsSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.Logging
import org.apache.spark.sql.{Row, GlobalSapSQLContext}
import org.scalatest.FunSuite

class StringsSuite
  extends FunSuite
  with GlobalSapSQLContext
  with Logging {

  // scalastyle:off magic.number

  val rowA = StringRow(" AAA")
  val rowB = StringRow("BBB ")
  val rowC = StringRow(" CCC ")
  val rowD = StringRow("DDDDDDD")
  val rowE = StringRow(null)

  val dataWithDates = Seq(rowA, rowB, rowC, rowD, rowE)

  test("string manipulations") {
    val rdd = sc.parallelize(dataWithDates)
    val dSrc = sqlContext.createDataFrame(rdd).cache()
    dSrc.registerTempTable("src")

    val result1 =
      sqlContext.sql("SELECT name,TRIM(name),RTRIM(name),LTRIM(name) FROM src").collect()

    assertResult(Row(rowA.name, "AAA", " AAA", "AAA") ::
      Row(rowB.name, "BBB", "BBB", "BBB ") ::
      Row(rowC.name, "CCC", " CCC", "CCC ") ::
      Row(rowD.name, "DDDDDDD", "DDDDDDD", "DDDDDDD") ::
      Row(rowE.name, null, null, null) :: Nil)(result1)

    val result2 =
      sqlContext.sql("SELECT name,LPAD(name,6,'x'),RPAD(name,6,'xyz') FROM src").collect()

    assertResult(Row(rowA.name, "xx AAA", " AAAxy") ::
      Row(rowB.name, "xxBBB ", "BBB xy") ::
      Row(rowC.name, "x CCC ", " CCC x") ::
      Row(rowD.name, "DDDDDD", "DDDDDD") ::
      Row(rowE.name, null, null) :: Nil)(result2)

    val result3 =
      sqlContext.sql("SELECT name, LENGTH(name), LOCATE('B', name) FROM src").collect()

    assertResult(Row(rowA.name, 4, 0) ::
      Row(rowB.name, 4, 1) ::
      Row(rowC.name, 5, 0) ::
      Row(rowD.name, 7, 0) ::
      Row(rowE.name, null, null) :: Nil)(result3)

    val result4 = sqlContext.sql("SELECT name, CONCAT(name,'aa') FROM src").collect()

    assertResult(Row(rowA.name, " AAAaa") ::
      Row(rowB.name, "BBB aa") ::
      Row(rowC.name, " CCC aa") ::
      Row(rowD.name, "DDDDDDDaa") ::
      Row(rowE.name, null) ::Nil)(result4)

    val result5 =
      sqlContext.sql("SELECT name,REPLACE(name,'DD','de'),REVERSE(name) FROM src").collect()

    assertResult(Row(rowA.name, " AAA", "AAA ") ::
      Row(rowB.name, "BBB ", " BBB") ::
      Row(rowC.name, " CCC ", " CCC ") ::
      Row(rowD.name, "dededeD", "DDDDDDD") ::
      Row(rowE.name, null, null) :: Nil)(result5)
  }
} 
Example 161
Source File: SapSQLEnv.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.sap.thriftserver

import java.io.PrintStream

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.{HiveContext, SapHiveContext}
import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._
import org.apache.spark.util.Utils
import org.apache.spark.{Logging, SparkConf, SparkContext}

import scala.collection.JavaConversions._


object SapSQLEnv extends Logging {

  def init() {
    logDebug("Initializing SapSQLEnv")
    if (hiveContext == null) {
      logInfo("Creating SapSQLContext")
      val sparkConf = new SparkConf(loadDefaults = true)
      val maybeSerializer = sparkConf.getOption("spark.serializer")
      val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking")
      // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of
      // the default appName [SparkSQLCLIDriver] in cli or beeline.
      val maybeAppName = sparkConf
        .getOption("spark.app.name")
        .filterNot(_ == classOf[SparkSQLCLIDriver].getName)

      sparkConf
        .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}"))
        .set("spark.serializer",
          maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer"))
        .set("spark.kryo.referenceTracking",
          maybeKryoReferenceTracking.getOrElse("false"))

      sparkContext = new SparkContext(sparkConf)
      sparkContext.addSparkListener(new StatsReportListener())
      hiveContext = new SapHiveContext(sparkContext)

      hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8"))
      hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8"))
      hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8"))

      hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion)

      if (log.isDebugEnabled) {
        hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) =>
          logDebug(s"HiveConf var: $k=$v")
        }
      }
    }
  }
} 
Example 162
Source File: SapThriftServer.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import org.apache.commons.logging.LogFactory
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.sap.thriftserver.SapSQLEnv
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2._
import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab
import org.apache.hive.service.server.HiveServerServerOptionsProcessor

object SapThriftServer extends Logging {
  var LOG = LogFactory.getLog(classOf[SapThriftServer])


  def main(args: Array[String]) {
    val optionsProcessor = new HiveServerServerOptionsProcessor("SapThriftServer")
    if (!optionsProcessor.process(args)) {
      System.exit(-1)
    }

    logInfo("Starting SparkContext")
    SapSQLEnv.init()

    org.apache.spark.util.ShutdownHookManager.addShutdownHook { () =>
      SparkSQLEnv.stop()
      uiTab.foreach(_.detach())
    }

    try {
      val server = new HiveThriftServer2(SparkSQLEnv.hiveContext)
      server.init(SparkSQLEnv.hiveContext.hiveconf)
      server.start()
      logInfo("SapThriftServer started")
      listener = new HiveThriftServer2Listener(server, SparkSQLEnv.hiveContext.conf)
      SparkSQLEnv.sparkContext.addSparkListener(listener)
      uiTab = if (SparkSQLEnv.sparkContext.getConf.getBoolean("spark.ui.enabled", true)) {
        Some(new ThriftServerTab(SparkSQLEnv.sparkContext))
      } else {
        None
      }
    } catch {
      case e: Exception =>
        logError("Error starting SapThriftServer", e)
        System.exit(-1)
    }
  }
}

private[hive] class SapThriftServer(val hiveContext: HiveContext) extends Logging{

  def start: Unit = {
    logInfo("ThriftServer with SapSQLContext")
    logInfo("Starting SparkContext")
    HiveThriftServer2.startWithContext(hiveContext)
  }
} 
Example 163
Source File: OAuthToken.scala    From spark-power-bi   with Apache License 2.0 5 votes vote down vote up
package com.granturing.spark.powerbi

import java.util.concurrent.{ExecutionException, TimeUnit, Executors}
import com.microsoft.aad.adal4j.{AuthenticationResult, AuthenticationCallback, AuthenticationContext}
import dispatch._
import org.apache.spark.Logging
import scala.concurrent.{Await, promise}
import scala.util.{Try, Failure, Success}

private class OAuthReq(token: OAuthTokenHandler) extends (Req => Req) {

  override def apply(req: Req): Req = {
    req <:< Map("Authorization" -> s"Bearer ${token()}")
  }

}

private class OAuthTokenHandler(authConf: ClientConf, initialToken: Option[String] = None) extends Logging {

  private var _token: Option[String] = initialToken

  def apply(refresh: Boolean = false): String = {
    _token match {
      case Some(s) if !refresh => s
      case _ => {
        refreshToken match {
          case Success(s) => {
            _token = Some(s)
            s
          }
          case Failure(e) => throw e
        }
      }
    }
  }

  private def refreshToken: Try[String] = {
    log.info("refreshing OAuth token")

    val service = Executors.newFixedThreadPool(1)
    val context = new AuthenticationContext(authConf.token_uri, true, service)

    val p = promise[AuthenticationResult]
    val future = p.future

    context.acquireToken(authConf.resource, authConf.clientid, authConf.username, authConf.password, new AuthenticationCallback {
      def onSuccess(result: AuthenticationResult): Unit = {
        p.success(result)
      }

      def onFailure(ex: Throwable): Unit = {
        p.failure(ex)
      }
    })

    try {
      val result = Await.result(future, authConf.timeout)

      log.info("OAuth token refresh successful")

      Success(result.getAccessToken)
    } catch {
      case e: ExecutionException => Failure(e.getCause)
      case t: Throwable => Failure(t)
    } finally {
      service.shutdown()
    }

  }

} 
Example 164
Source File: TestUtils.scala    From hivemall-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.test

import scala.reflect.runtime.{universe => ru}

import org.apache.spark.Logging
import org.apache.spark.sql.DataFrame

object TestUtils extends Logging {

  // Run the benchmark body only when DEBUG-level logging is enabled
  def benchmark(benchName: String)(testFunc: => Unit): Unit = {
    if (log.isDebugEnabled) {
      testFunc
    }
  }

  def expectResult(res: Boolean, errMsg: String) = if (res) {
    logWarning(errMsg)
  }

  def invokeFunc(cls: Any, func: String, args: Any*): DataFrame = try {
    // Invoke a function with the given name via reflection
    val im = scala.reflect.runtime.currentMirror.reflect(cls)
    val mSym = im.symbol.typeSignature.member(ru.newTermName(func)).asMethod
    im.reflectMethod(mSym).apply(args: _*)
      .asInstanceOf[DataFrame]
  } catch {
    case e: Exception =>
      assert(false, s"Invoking ${func} failed because: ${e.getMessage}")
      null // Not executed
  }
}

// TODO: Any same function in o.a.spark.*?
class TestDoubleWrapper(d: Double) {
  // Check an equality between Double values
  def ~==(d: Double): Boolean = Math.abs(this.d - d) < 0.001
}

object TestDoubleWrapper {
  @inline implicit def toTestDoubleWrapper(d: Double) = new TestDoubleWrapper(d)
} 
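A quick illustration of these helpers, namely the approximate-equality wrapper and the log-gated benchmark block:

import org.apache.spark.test.TestUtils
import org.apache.spark.test.TestDoubleWrapper._

object TestUtilsDemo {
  def main(args: Array[String]): Unit = {
    // ~== tolerates absolute differences below 0.001
    println((0.1 + 0.2) ~== 0.3) // true
    println(1.0 ~== 1.5)         // false

    // The block only runs when DEBUG-level logging is enabled for TestUtils.
    TestUtils.benchmark("noop benchmark") {
      (1 to 1000000).sum
    }
  }
}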
Example 165
Source File: HttpInputDStreamAsync.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import com.ning.http.client.AsyncCompletionHandler
import com.ning.http.client.AsyncHttpClient
import com.ning.http.client.Response

class HttpInputDStreamAsync(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiverAsync(storageLevel, url)
  }
}

class HttpReceiverAsync(
    storageLevel: StorageLevel,
    url: String) extends Receiver[String](storageLevel) with Logging {

  var asyncHttpClient: AsyncHttpClient = _

  def onStop() {
    asyncHttpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    asyncHttpClient = new AsyncHttpClient()
    asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() {

      override def onCompleted(response: Response): Response = {
        store(response.getResponseBody)
        return response
      }

      override def onThrowable(t: Throwable) {
        restart("Error! Problems while connecting", t)
      }
    });
    logInfo("Http Connection initiated")
  }
  
}

object HttpUtilsAsync {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String): DStream[String] = {
    new HttpInputDStreamAsync(ssc, storageLevel, url)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url)
  }
} 
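A minimal driver that turns the asynchronous HTTP receiver above into a DStream; the URL and batch interval are placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apress.prospark.HttpUtilsAsync

object HttpAsyncExample {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("http-async").setMaster("local[2]"), Seconds(5))

    // A single GET is issued when the receiver starts; the response body arrives as one record.
    val body = HttpUtilsAsync.createStream(ssc, url = "http://example.com/feed.json")
    body.map(_.length).print()

    ssc.start()
    ssc.awaitTermination()
  }
}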
Example 166
Source File: HttpInputDStream.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)

    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }

  }

}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
} 
Example 168
Source File: LibLinearTraining.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.examples

import scopt.OptionParser
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import se.uu.farmbio.cp.liblinear.LIBLINEAR
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.Logging

object LibLinearTraining extends Logging {
  
  case class Params(
    trainInputPath: String = null,
    outputPath: String = null,
    calibrRatio: Double = 0.2,
    numberOfCPs: Int = 100,
    nofOutFiles: Int = 0,
    dfsBlockSize: String = "8M",
    master: String = null)
  
  def main(args: Array[String]) = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("LibLinearTraining") {
      head("LibLinearTraining: LIBLINEAR training procedure")
      opt[Double]("calibrRatio")
        .text("fraction of calibration examples")
        .action((x, c) => c.copy(calibrRatio = x))
      opt[Int]("numberOfCPs")
        .text("number of CPs to train")
        .action((x, c) => c.copy(numberOfCPs = x))
      opt[String]("master")
        .text("spark master")
        .action((x, c) => c.copy(master = x))
      opt[Int]("nofOutFiles")
        .text("Number of output files. " +
          "It can be at most the parallelism level " +
          "(default: the parallelism level)")
        .action((x, c) => c.copy(nofOutFiles = x))
      opt[String]("dfsBlockSize")
        .text("Tunes the Hadoop dfs.block.size property (default: 8M)")
        .action((x, c) => c.copy(dfsBlockSize = x))
      arg[String]("<input>")
        .required()
        .text("input path to training examples in LIBSVM format")
        .action((x, c) => c.copy(trainInputPath = x))
      arg[String]("<output>")
        .required()
        .text("output path to save CPs")
        .action((x, c) => c.copy(outputPath = x))

    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }

  }
  
  def run(params: Params) {

    //Init Spark
    val conf = new SparkConf()
      .setAppName("LibLinearTraining")
    if (params.master != null) {
      conf.setMaster(params.master)
    }
    val sc = new SparkContext(conf)
    
    //Set and log dfs.block.size
    sc.hadoopConfiguration.set("dfs.block.size", params.dfsBlockSize)
    val blockSize = sc.hadoopConfiguration.get("dfs.block.size")
    logInfo(s"dfs.block.size = $blockSize")
    
    //Load data
    //This example assumes the training set is relatively small;
    //the generated model data, however, can be large.
    val input = MLUtils.loadLibSVMFile(sc, params.trainInputPath)
    val trainingData = input.collect
    
    //Train the CPs
    val modelData = LIBLINEAR.trainAggregatedICPClassifier(
        sc, 
        trainingData, 
        params.calibrRatio, 
        params.numberOfCPs)
        
    //Save the model in a distributed fashion 
    modelData.save(params.outputPath, params.nofOutFiles)
    
    //Stop Spark
    sc.stop
    
  }

} 
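The same training can be driven programmatically through run and the Params case class defined above, which is equivalent to passing the corresponding command-line flags; the paths and master URL are placeholders:

import se.uu.farmbio.cp.examples.LibLinearTraining

object TrainLocally {
  def main(args: Array[String]): Unit = {
    LibLinearTraining.run(LibLinearTraining.Params(
      trainInputPath = "data/train.libsvm",
      outputPath = "models/aggregated-icp",
      calibrRatio = 0.2,
      numberOfCPs = 100,
      master = "local[*]"))
  }
}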
Example 169
Source File: ICP.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp

import org.apache.spark.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object ICP extends Logging {

  private def simpleSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {

    //Computing the calibration fraction using binomial upper bound
    val n = input.count
    val fraction = numOfCalibSamples.toDouble / n
    val delta = 1e-4
    val minSamplingRate = 1e-10
    val gamma = -math.log(delta) / n
    val calibFraction = math.min(1,
      math.max(minSamplingRate, fraction + gamma + math.sqrt(gamma * gamma + 2 * gamma * fraction)))

    //calibFraction is enough most of the time, but not always
    var splits = input.randomSplit(Array(calibFraction, 1 - calibFraction))
    var sample = splits(0).collect
    while (sample.length < numOfCalibSamples) {
      logWarning("Needed to re-sample calibration set due to insufficient sample size.")
      splits = input.randomSplit(Array(calibFraction, 1 - calibFraction))
      sample = splits(0).collect
    }

    val calibration = sample.take(numOfCalibSamples)
    val additional = sample.takeRight(sample.length - numOfCalibSamples)

    val sc = input.context
    (calibration, splits(1) ++ sc.parallelize(additional))

  }

  private def stratifiedSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {

    logWarning("Stratified sampling is supported only for binary classification.")
    
    //Calibration split, making sure there is some data for both classes
    val class0 = input.filter(_.label == 0.0)
    val class1 = input.filter(_.label == 1.0)
    val count0 = class0.count
    val count1 = class1.count
    val posRatio = count1.doubleValue / (count0 + count1)
    val posSize = if(numOfCalibSamples * posRatio < 19) {
      logWarning("Raising the number of positive samples to 19 (allows sig >= 0.5)")
      19
    } else {
      (numOfCalibSamples * posRatio).ceil.toInt
    }
    val negSize = numOfCalibSamples - posSize
    val (negSmpl, negTr) = ICP.simpleSplit(class0, negSize)
    val (posSmpl, posTr) = ICP.simpleSplit(class1, posSize)
    val properTraining = negTr ++ posTr
    val calibration = negSmpl ++ posSmpl
    (calibration, properTraining)

  }

  def calibrationSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int,
    stratified: Boolean = false) = {

    if (stratified) {
      logWarning("Stratified sampling needs to count the dataset; use it wisely.")
      ICP.stratifiedSplit(input, numOfCalibSamples)
    } else {
      ICP.simpleSplit(input, numOfCalibSamples)
    }

  }

  def trainClassifier[A <: UnderlyingAlgorithm](
    alg: A,
    numClasses: Int,
    calibSet: Array[LabeledPoint]): ICPClassifierModel[A] = {
    //Compute alphas for each class (Mondrian approach)
    val alphas = (0 to numClasses - 1).map { i =>
      calibSet.filter(_.label == i) //filter current label
        .map(newSmpl => alg.nonConformityMeasure(newSmpl)) //compute alpha
    }
    new ICPClassifierModelImpl(alg, alphas)
  }

} 
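A short sketch of the calibration-split API above; the data path is a placeholder, and training the actual conformal predictor additionally requires an UnderlyingAlgorithm implementation, which is out of scope here:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import se.uu.farmbio.cp.ICP

object CalibrationSplitExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("icp-split").setMaster("local[*]"))
    val data = MLUtils.loadLibSVMFile(sc, "data/train.libsvm")

    // Reserve 100 calibration examples, stratified over the two classes.
    val (calibration, properTraining) = ICP.calibrationSplit(data, 100, stratified = true)
    println(s"calibration=${calibration.length} properTraining=${properTraining.count()}")

    sc.stop()
  }
}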
Example 170
Source File: BlockTransferService.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  
  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
} 
Example 171
Source File: NettyBlockRpcServer.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConversions._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}


class NettyBlockRpcServer(
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      messageBytes: Array[Byte],
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(blocks.iterator)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)

      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata))
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(new Array[Byte](0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
} 
Example 172
Source File: SortShuffleWriter.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockManager: IndexShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        return Option(mapStatus)
      } else {
        // The map task failed, so delete our output data.
        shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId)
        return None
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      if (sorter != null) {
        sorter.stop()
        sorter = null
      }
    }
  }
} 
Example 173
Source File: MetricsConfig.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.metrics

import java.io.{FileInputStream, InputStream}
import java.util.Properties

import scala.collection.mutable
import scala.util.matching.Regex

import org.apache.spark.Logging
import org.apache.spark.util.Utils

private[spark] class MetricsConfig(val configFile: Option[String]) extends Logging {

  private val DEFAULT_PREFIX = "*"
  private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r
  private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties"

  private[metrics] val properties = new Properties()
  private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null

  private def setDefaultProperties(prop: Properties) {
    prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet")
    prop.setProperty("*.sink.servlet.path", "/metrics/json")
    prop.setProperty("master.sink.servlet.path", "/metrics/master/json")
    prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json")
  }

  def initialize() {
    // Add default properties in case there's no properties file
    setDefaultProperties(properties)

    // If spark.metrics.conf is not set, try to get file in class path
    val isOpt: Option[InputStream] = configFile.map(new FileInputStream(_)).orElse {
      try {
        Option(Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME))
      } catch {
        case e: Exception =>
          logError("Error loading default configuration file", e)
          None
      }
    }

    isOpt.foreach { is =>
      try {
        properties.load(is)
      } finally {
        is.close()
      }
    }

    propertyCategories = subProperties(properties, INSTANCE_REGEX)
    if (propertyCategories.contains(DEFAULT_PREFIX)) {
      import scala.collection.JavaConversions._

      val defaultProperty = propertyCategories(DEFAULT_PREFIX)
      for { (inst, prop) <- propertyCategories
            if (inst != DEFAULT_PREFIX)
            (k, v) <- defaultProperty
            if (prop.getProperty(k) == null) } {
        prop.setProperty(k, v)
      }
    }
  }

  def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = {
    val subProperties = new mutable.HashMap[String, Properties]
    import scala.collection.JavaConversions._
    prop.foreach { kv =>
      if (regex.findPrefixOf(kv._1).isDefined) {
        val regex(prefix, suffix) = kv._1
        subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2)
      }
    }
    subProperties
  }

  def getInstance(inst: String): Properties = {
    propertyCategories.get(inst) match {
      case Some(s) => s
      case None => propertyCategories.getOrElse(DEFAULT_PREFIX, new Properties)
    }
  }
} 
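A small sketch of how the loader above is driven. MetricsConfig is private[spark], so the demo sits in the same package; the properties path is a placeholder:

package org.apache.spark.metrics

object MetricsConfigDemo {
  def main(args: Array[String]): Unit = {
    // Pass None to fall back to metrics.properties on the classpath.
    val config = new MetricsConfig(Some("conf/metrics.properties"))
    config.initialize()

    // Per-instance properties are merged with the "*" defaults.
    val masterProps = config.getInstance("master")
    println(masterProps.getProperty("sink.servlet.path")) // /metrics/master/json by default
  }
}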
Example 174
Source File: PythonGatewayServer.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.python

import java.io.DataOutputStream
import java.net.Socket

import py4j.GatewayServer

import org.apache.spark.Logging
import org.apache.spark.util.Utils


private[spark] object PythonGatewayServer extends Logging {
  def main(args: Array[String]): Unit = Utils.tryOrExit {
    // Start a GatewayServer on an ephemeral port
    val gatewayServer: GatewayServer = new GatewayServer(null, 0)
    gatewayServer.start()
    val boundPort: Int = gatewayServer.getListeningPort
    if (boundPort == -1) {
      logError("GatewayServer failed to bind; exiting")
      System.exit(1)
    } else {
      logDebug(s"Started PythonGatewayServer on port $boundPort")
    }

    // Communicate the bound port back to the caller via the caller-specified callback port
    val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST")
    val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt
    logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort")
    val callbackSocket = new Socket(callbackHost, callbackPort)
    val dos = new DataOutputStream(callbackSocket.getOutputStream)
    dos.writeInt(boundPort)
    dos.close()
    callbackSocket.close()

    // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies:
    while (System.in.read() != -1) {
      // Do nothing
    }
    logDebug("Exiting due to broken pipe from Python driver")
    System.exit(0)
  }
} 
Example 175
Source File: TestClient.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.client

import org.apache.spark.{SecurityManager, SparkConf, Logging}
import org.apache.spark.deploy.{ApplicationDescription, Command}
import org.apache.spark.util.{AkkaUtils, Utils}

private[spark] object TestClient {

  class TestListener extends AppClientListener with Logging {
    def connected(id: String) {
      logInfo("Connected to master, got app ID " + id)
    }

    def disconnected() {
      logInfo("Disconnected from master")
      System.exit(0)
    }

    def dead(reason: String) {
      logInfo("Application died with error: " + reason)
      System.exit(0)
    }

    def executorAdded(id: String, workerId: String, hostPort: String, cores: Int, memory: Int) {}

    def executorRemoved(id: String, message: String, exitStatus: Option[Int]) {}
  }

  def main(args: Array[String]) {
    val url = args(0)
    val conf = new SparkConf
    val (actorSystem, _) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0,
      conf = conf, securityManager = new SecurityManager(conf))
    val desc = new ApplicationDescription("TestClient", Some(1), 512,
      Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
    val listener = new TestListener
    val client = new AppClient(actorSystem, Array(url), desc, listener, new SparkConf)
    client.start()
    actorSystem.awaitTermination()
  }
} 
Example 176
Source File: FileSystemPersistenceEngine.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.io._

import scala.reflect.ClassTag

import akka.serialization.Serialization

import org.apache.spark.Logging



private[spark] class FileSystemPersistenceEngine(
    val dir: String,
    val serialization: Serialization)
  extends PersistenceEngine with Logging {

  new File(dir).mkdir()

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(new File(dir + File.separator + name), obj)
  }

  override def unpersist(name: String): Unit = {
    new File(dir + File.separator + name).delete()
  }

  override def read[T: ClassTag](prefix: String) = {
    val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix))
    files.map(deserializeFromFile[T])
  }

  private def serializeIntoFile(file: File, value: AnyRef) {
    val created = file.createNewFile()
    if (!created) { throw new IllegalStateException("Could not create file: " + file) }
    val serializer = serialization.findSerializerFor(value)
    val serialized = serializer.toBinary(value)
    val out = new FileOutputStream(file)
    try {
      out.write(serialized)
    } finally {
      out.close()
    }
  }

  private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
    val fileData = new Array[Byte](file.length().asInstanceOf[Int])
    val dis = new DataInputStream(new FileInputStream(file))
    try {
      dis.readFully(fileData)
    } finally {
      dis.close()
    }
    val clazz = m.runtimeClass.asInstanceOf[Class[T]]
    val serializer = serialization.serializerFor(clazz)
    serializer.fromBinary(fileData).asInstanceOf[T]
  }

} 
Example 177
Source File: SparkCuratorUtil.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import scala.collection.JavaConversions._

import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory}
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.zookeeper.KeeperException

import org.apache.spark.{Logging, SparkConf}

object SparkCuratorUtil extends Logging {

  val ZK_CONNECTION_TIMEOUT_MILLIS = 15000
  val ZK_SESSION_TIMEOUT_MILLIS = 60000
  val RETRY_WAIT_MILLIS = 5000
  val MAX_RECONNECT_ATTEMPTS = 3

  def newClient(conf: SparkConf): CuratorFramework = {
    val ZK_URL = conf.get("spark.deploy.zookeeper.url")
    val zk = CuratorFrameworkFactory.newClient(ZK_URL,
      ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS,
      new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS))
    zk.start()
    zk
  }

  def mkdir(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) == null) {
      try {
        zk.create().creatingParentsIfNeeded().forPath(path)
      } catch {
        case nodeExist: KeeperException.NodeExistsException =>
          // do nothing, ignore node existing exception.
        case e: Exception => throw e
      }
    }
  }

  def deleteRecursive(zk: CuratorFramework, path: String) {
    if (zk.checkExists().forPath(path) != null) {
      for (child <- zk.getChildren.forPath(path)) {
        zk.delete().forPath(path + "/" + child)
      }
      zk.delete().forPath(path)
    }
  }
} 
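A brief usage sketch of the Curator helpers above; the ZooKeeper connection string is a placeholder, and spark.deploy.zookeeper.url must be set on the SparkConf exactly as newClient expects:

import org.apache.spark.SparkConf
import org.apache.spark.deploy.master.SparkCuratorUtil

object CuratorDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk-host:2181")
    val zk = SparkCuratorUtil.newClient(conf)
    try {
      SparkCuratorUtil.mkdir(zk, "/spark/demo")
      SparkCuratorUtil.deleteRecursive(zk, "/spark/demo")
    } finally {
      zk.close()
    }
  }
}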
Example 178
Source File: ZooKeeperLeaderElectionAgent.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import akka.actor.ActorRef

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.master.MasterMessages._
import org.apache.curator.framework.CuratorFramework
import org.apache.curator.framework.recipes.leader.{LeaderLatchListener, LeaderLatch}

private[spark] class ZooKeeperLeaderElectionAgent(val masterActor: LeaderElectable,
    conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging  {

  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"

  private var zk: CuratorFramework = _
  private var leaderLatch: LeaderLatch = _
  private var status = LeadershipStatus.NOT_LEADER

  start()

  def start() {
    logInfo("Starting ZooKeeper LeaderElection agent")
    zk = SparkCuratorUtil.newClient(conf)
    leaderLatch = new LeaderLatch(zk, WORKING_DIR)
    leaderLatch.addListener(this)
    leaderLatch.start()
  }

  override def stop() {
    leaderLatch.close()
    zk.close()
  }

  override def isLeader() {
    synchronized {
      // could have lost leadership by now.
      if (!leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have gained leadership")
      updateLeadershipStatus(true)
    }
  }

  override def notLeader() {
    synchronized {
      // could have gained leadership by now.
      if (leaderLatch.hasLeadership) {
        return
      }

      logInfo("We have lost leadership")
      updateLeadershipStatus(false)
    }
  }

  def updateLeadershipStatus(isLeader: Boolean) {
    if (isLeader && status == LeadershipStatus.NOT_LEADER) {
      status = LeadershipStatus.LEADER
      masterActor.electedLeader()
    } else if (!isLeader && status == LeadershipStatus.LEADER) {
      status = LeadershipStatus.NOT_LEADER
      masterActor.revokedLeadership()
    }
  }

  private object LeadershipStatus extends Enumeration {
    type LeadershipStatus = Value
    val LEADER, NOT_LEADER = Value
  }
} 
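
A minimal sketch of wiring the agent to a LeaderElectable participant, assuming a reachable ZooKeeper at the placeholder URL; because the agent is private[spark], the sketch is written as if it lived in the same package.

package org.apache.spark.deploy.master

import org.apache.spark.SparkConf

class PrintingElectable extends LeaderElectable {
  override def electedLeader(): Unit = println("gained leadership")
  override def revokedLeadership(): Unit = println("lost leadership")
}

object LeaderElectionSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "localhost:2181") // assumed ZooKeeper
    // The agent starts its LeaderLatch from the constructor, so there is no explicit start() call here.
    val agent = new ZooKeeperLeaderElectionAgent(new PrintingElectable, conf)
    sys.addShutdownHook(agent.stop())
  }
}
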
Example 179
Source File: ZooKeeperPersistenceEngine.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import akka.serialization.Serialization

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.{Logging, SparkConf}


private[spark] class ZooKeeperPersistenceEngine(conf: SparkConf, val serialization: Serialization)
  extends PersistenceEngine
  with Logging
{
  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)


  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String) = {
    val file = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix))
    file.map(deserializeFromFile[T]).flatten
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serializer = serialization.findSerializerFor(value)
    val serialized = serializer.toBinary(value)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, serialized)
  }

  def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    val clazz = m.runtimeClass.asInstanceOf[Class[T]]
    val serializer = serialization.serializerFor(clazz)
    try {
      Some(serializer.fromBinary(fileData).asInstanceOf[T])
    } catch {
      case e: Exception => {
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
      }
    }
  }
} 
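
A minimal sketch of the engine above, assuming an ActorSystem for Akka serialization and a reachable ZooKeeper at the placeholder URL; the Checkpoint payload is a stand-in for whatever serializable object the master persists, and the code is written as if it lived in the same package since the engine is private[spark].

package org.apache.spark.deploy.master

import akka.actor.ActorSystem
import akka.serialization.SerializationExtension
import org.apache.spark.SparkConf

object ZkPersistenceSketch {
  // Any java-serializable payload works with Akka's default JavaSerializer.
  case class Checkpoint(id: String, value: Int)

  def main(args: Array[String]): Unit = {
    val system = ActorSystem("sketch")
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "localhost:2181") // assumed ZooKeeper
    val engine = new ZooKeeperPersistenceEngine(conf, SerializationExtension(system))

    engine.persist("checkpoint_1", Checkpoint("a", 1))
    val restored = engine.read[Checkpoint]("checkpoint_")   // Seq(Checkpoint("a", 1))
    engine.unpersist("checkpoint_1")
    engine.close()
    system.shutdown()
  }
}
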
Example 180
Source File: WorkerWebUI.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker.ui

import java.io.File
import javax.servlet.http.HttpServletRequest

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.deploy.worker.ui.WorkerWebUI._
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._
import org.apache.spark.util.AkkaUtils


  def initialize() {
    val logPage = new LogPage(this)
    attachPage(logPage)
    attachPage(new WorkerPage(this))
    attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static"))
    attachHandler(createServletHandler("/log",
      (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr))
  }
}

private[spark] object WorkerWebUI {
  val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR
} 
Example 181
Source File: WorkerWatcher.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker

import akka.actor.{Actor, Address, AddressFromURIString}
import akka.remote.{AssociatedEvent, AssociationErrorEvent, AssociationEvent, DisassociatedEvent, RemotingLifecycleEvent}

import org.apache.spark.Logging
import org.apache.spark.deploy.DeployMessages.SendHeartbeat
import org.apache.spark.util.ActorLogReceive


private[spark] class WorkerWatcher(workerUrl: String)
  extends Actor with ActorLogReceive with Logging {

  override def preStart() {
    context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])

    logInfo(s"Connecting to worker $workerUrl")
    val worker = context.actorSelection(workerUrl)
    worker ! SendHeartbeat // need to send a message here to initiate connection
  }

  // Used to avoid shutting down JVM during tests
  private[deploy] var isShutDown = false
  private[deploy] def setTesting(testing: Boolean) = isTesting = testing
  private var isTesting = false

  // Lets us filter events only from the worker's actor system
  private val expectedHostPort = AddressFromURIString(workerUrl).hostPort
  private def isWorker(address: Address) = address.hostPort == expectedHostPort

  def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1)

  override def receiveWithLogging = {
    case AssociatedEvent(localAddress, remoteAddress, inbound) if isWorker(remoteAddress) =>
      logInfo(s"Successfully connected to $workerUrl")

    case AssociationErrorEvent(cause, localAddress, remoteAddress, inbound, _)
        if isWorker(remoteAddress) =>
      // These logs may not be seen if the worker (and associated pipe) has died
      logError(s"Could not initialize connection to worker $workerUrl. Exiting.")
      logError(s"Error was: $cause")
      exitNonZero()

    case DisassociatedEvent(localAddress, remoteAddress, inbound) if isWorker(remoteAddress) =>
      // This log message will never be seen
      logError(s"Lost connection to worker actor $workerUrl. Exiting.")
      exitNonZero()

    case e: AssociationEvent =>
      // pass through association events relating to other remote actor systems

    case e => logWarning(s"Received unexpected actor system event: $e")
  }
} 
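
A minimal sketch of installing the watcher in an actor system; the worker URL is a placeholder, and because the class is private[spark] the sketch is written as if it lived in the same package.

package org.apache.spark.deploy.worker

import akka.actor.{ActorSystem, Props}

object WorkerWatcherSketch {
  def main(args: Array[String]): Unit = {
    val workerUrl = "akka.tcp://sparkWorker@127.0.0.1:7078/user/Worker" // placeholder address
    val system = ActorSystem("watcherSketch")
    // The watcher subscribes to remoting lifecycle events in preStart and
    // exits the JVM if the association with the worker is lost.
    system.actorOf(Props(classOf[WorkerWatcher], workerUrl), name = "workerWatcher")
    system.awaitTermination()
  }
}
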
Example 182
Source File: StandaloneWorkerShuffleService.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker

import org.apache.spark.{Logging, SparkConf, SecurityManager}
import org.apache.spark.network.TransportContext
import org.apache.spark.network.netty.SparkTransportConf
import org.apache.spark.network.sasl.SaslRpcHandler
import org.apache.spark.network.server.TransportServer
import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler


  def startIfEnabled() {
    if (enabled) {
      require(server == null, "Shuffle server already started")
      logInfo(s"Starting shuffle service on port $port with useSasl = $useSasl")
      server = transportContext.createServer(port)
    }
  }

  def stop() {
    if (enabled && server != null) {
      server.close()
      server = null
    }
  }
} 
Example 183
Source File: HistoryServerArguments.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.history

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.Utils


private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging {
  private var propertiesFile: String = null

  parse(args.toList)

  private def parse(args: List[String]): Unit = {
    args match {
      case ("--dir" | "-d") :: value :: tail =>
        logWarning("Setting log directory through the command line is deprecated as of " +
          "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.")
        conf.set("spark.history.fs.logDirectory", value)
        System.setProperty("spark.history.fs.logDirectory", value)
        parse(tail)

      case ("--help" | "-h") :: tail =>
        printUsageAndExit(0)

      case ("--properties-file") :: value :: tail =>
        propertiesFile = value
        parse(tail)

      case Nil =>

      case _ =>
        printUsageAndExit(1)
    }
  }

  // This mutates the SparkConf, so all accesses to it must be made after this line
  Utils.loadDefaultSparkProperties(conf, propertiesFile)

  private def printUsageAndExit(exitCode: Int) {
    System.err.println(
      """
      |Usage: HistoryServer [options]
      |
      |Options:
      |  --properties-file FILE      Path to a custom Spark properties file.
      |                              Default is conf/spark-defaults.conf.
      |
      |Configuration options can be set by setting the corresponding JVM system property.
      |History Server options are always available; additional options depend on the provider.
      |
      |History Server options:
      |
      |  spark.history.ui.port              Port where server will listen for connections
      |                                     (default 18080)
      |  spark.history.acls.enable          Whether to enable view acls for all applications
      |                                     (default false)
      |  spark.history.provider             Name of history provider class (defaults to
      |                                     file system-based provider)
      |  spark.history.retainedApplications Max number of application UIs to keep loaded in memory
      |                                     (default 50)
      |FsHistoryProvider options:
      |
      |  spark.history.fs.logDirectory      Directory where app logs are stored
      |                                     (default: file:/tmp/spark-events)
      |  spark.history.fs.updateInterval    How often to reload log data from storage
      |                                     (in seconds, default: 10)
      |""".stripMargin)
    System.exit(exitCode)
  }

} 
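
A minimal sketch of how the parser mutates a SparkConf; the directory is a placeholder, and since the class is private[spark] the sketch assumes it is compiled under the same package.

package org.apache.spark.deploy.history

import org.apache.spark.SparkConf

object HistoryArgsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // Parses the command line (here the deprecated --dir flag) and then loads
    // spark-defaults-style properties into the same SparkConf.
    new HistoryServerArguments(conf, Array("--dir", "/tmp/spark-events"))
    println(conf.get("spark.history.fs.logDirectory"))
  }
}
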
Example 184
Source File: LocalSparkCluster.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import scala.collection.mutable.ArrayBuffer

import akka.actor.ActorSystem

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.worker.Worker
import org.apache.spark.deploy.master.Master
import org.apache.spark.util.Utils


    for (workerNum <- 1 to numWorkers) {
      val (workerSystem, _) = Worker.startSystemAndActor(localHostname, 0, 0, coresPerWorker,
        memoryPerWorker, masters, null, Some(workerNum), _conf)
      workerActorSystems += workerSystem
    }

    masters
  }

  def stop() {
    logInfo("Shutting down local Spark cluster.")
    // Stop the workers before the master so they don't get upset that it disconnected
    // TODO: In Akka 2.1.x, ActorSystem.awaitTermination hangs when you have remote actors!
    //       This is unfortunate, but for now we just comment it out.
    workerActorSystems.foreach(_.shutdown())
    // workerActorSystems.foreach(_.awaitTermination())
    masterActorSystems.foreach(_.shutdown())
    // masterActorSystems.foreach(_.awaitTermination())
    masterActorSystems.clear()
    workerActorSystems.clear()
  }
} 
Example 185
Source File: SimrSchedulerBackend.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.AkkaUtils

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = AkkaUtils.address(
      AkkaUtils.protocol(actorSystem),
      SparkEnv.driverActorSystemName,
      sc.conf.get("spark.driver.host"),
      sc.conf.get("spark.driver.port"),
      CoarseGrainedSchedulerBackend.ACTOR_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: "  + driverFilePath)
    logInfo("Writing Akka address: "  + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }

} 
Example 186
Source File: MesosTaskLaunchData.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

import org.apache.mesos.protobuf.ByteString

import org.apache.spark.Logging


private[spark] case class MesosTaskLaunchData(
  serializedTask: ByteBuffer,
  attemptNumber: Int) extends Logging {

  def toByteString: ByteString = {
    val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
    dataBuffer.putInt(attemptNumber)
    dataBuffer.put(serializedTask)
    dataBuffer.rewind
    logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
    ByteString.copyFrom(dataBuffer)
  }
}

private[spark] object MesosTaskLaunchData extends Logging {
  def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
    val byteBuffer = byteString.asReadOnlyByteBuffer()
    logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
    val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
    val serializedTask = byteBuffer.slice() // subsequence starting at the current position
    MesosTaskLaunchData(serializedTask, attemptNumber)
  }
} 
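
A minimal round-trip sketch: the attempt number and the serialized task bytes survive encoding to a ByteString and back. The payload is dummy data, and since the case class is private[spark] the snippet assumes it is compiled under the same package.

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

val payload = ByteBuffer.wrap("task-bytes".getBytes(StandardCharsets.UTF_8)) // dummy task bytes
val encoded = MesosTaskLaunchData(payload, attemptNumber = 3).toByteString
val decoded = MesosTaskLaunchData.fromByteString(encoded)
assert(decoded.attemptNumber == 3)
assert(decoded.serializedTask.remaining == "task-bytes".length)
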
Example 187
Source File: ReplayListenerBus.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io.{InputStream, IOException}

import scala.io.Source

import org.json4s.jackson.JsonMethods._

import org.apache.spark.Logging
import org.apache.spark.util.JsonProtocol


  def replay(logData: InputStream, sourceName: String): Unit = {
    var currentLine: String = null
    var lineNumber: Int = 1
    try {
      val lines = Source.fromInputStream(logData).getLines()
      lines.foreach { line =>
        currentLine = line
        postToAll(JsonProtocol.sparkEventFromJson(parse(line)))
        lineNumber += 1
      }
    } catch {
      case ioe: IOException =>
        throw ioe
      case e: Exception =>
        logError(s"Exception parsing Spark event log: $sourceName", e)
        logError(s"Malformed line #$lineNumber: $currentLine\n")
    }
  }

} 
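
A minimal sketch of replaying an event-log stream, assuming `bus` is an already-constructed ReplayListenerBus with listeners attached; the file path and application id are placeholders.

import java.io.{BufferedInputStream, FileInputStream}

val in = new BufferedInputStream(new FileInputStream("/tmp/spark-events/local-1428000000000")) // placeholder log file
try {
  bus.replay(in, sourceName = "local-1428000000000")
} finally {
  in.close()
}
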
Example 188
Source File: SparkUncaughtExceptionHandler.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import org.apache.spark.Logging


private[spark] object SparkUncaughtExceptionHandler
  extends Thread.UncaughtExceptionHandler with Logging {

  override def uncaughtException(thread: Thread, exception: Throwable) {
    try {
      logError("Uncaught exception in thread " + thread, exception)

      // We may have been called from a shutdown hook. If so, we must not call System.exit().
      // (If we do, we will deadlock.)
      if (!Utils.inShutdown()) {
        if (exception.isInstanceOf[OutOfMemoryError]) {
          System.exit(SparkExitCode.OOM)
        } else {
          System.exit(SparkExitCode.UNCAUGHT_EXCEPTION)
        }
      }
    } catch {
      case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM)
      case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE)
    }
  }

  def uncaughtException(exception: Throwable) {
    uncaughtException(Thread.currentThread(), exception)
  }
} 
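
A minimal sketch of installing the handler so that uncaught errors are logged and mapped to Spark's exit codes; since the object is private[spark], the snippet assumes it is compiled under an org.apache.spark package.

// Install as the JVM-wide default handler, as Spark daemons typically do.
Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)

// Or attach it to a single thread.
val worker = new Thread(new Runnable {
  override def run(): Unit = throw new RuntimeException("simulated failure")
})
worker.setUncaughtExceptionHandler(SparkUncaughtExceptionHandler)
worker.start() // the handler logs the error and calls System.exit(...)
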
Example 189
Source File: BlockManagerSlaveActor.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.concurrent.Future

import akka.actor.{ActorRef, Actor}

import org.apache.spark.{Logging, MapOutputTracker, SparkEnv}
import org.apache.spark.storage.BlockManagerMessages._
import org.apache.spark.util.ActorLogReceive


private[storage]
class BlockManagerSlaveActor(
    blockManager: BlockManager,
    mapOutputTracker: MapOutputTracker)
  extends Actor with ActorLogReceive with Logging {

  import context.dispatcher

  // Operations that involve removing blocks may be slow and should be done asynchronously
  override def receiveWithLogging = {
    case RemoveBlock(blockId) =>
      doAsync[Boolean]("removing block " + blockId, sender) {
        blockManager.removeBlock(blockId)
        true
      }

    case RemoveRdd(rddId) =>
      doAsync[Int]("removing RDD " + rddId, sender) {
        blockManager.removeRdd(rddId)
      }

    case RemoveShuffle(shuffleId) =>
      doAsync[Boolean]("removing shuffle " + shuffleId, sender) {
        if (mapOutputTracker != null) {
          mapOutputTracker.unregisterShuffle(shuffleId)
        }
        SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
      }

    case RemoveBroadcast(broadcastId, _) =>
      doAsync[Int]("removing broadcast " + broadcastId, sender) {
        blockManager.removeBroadcast(broadcastId, tellMaster = true)
      }

    case GetBlockStatus(blockId, _) =>
      sender ! blockManager.getStatus(blockId)

    case GetMatchingBlockIds(filter, _) =>
      sender ! blockManager.getMatchingBlockIds(filter)
  }

  private def doAsync[T](actionMessage: String, responseActor: ActorRef)(body: => T) {
    val future = Future {
      logDebug(actionMessage)
      body
    }
    future.onSuccess { case response =>
      logDebug("Done " + actionMessage + ", response is " + response)
      responseActor ! response
      logDebug("Sent response: " + response + " to " + responseActor)
    }
    future.onFailure { case t: Throwable =>
      logError("Error in " + actionMessage, t)
      responseActor ! null.asInstanceOf[T]
    }
  }
} 
Example 190
Source File: DatasourceRDD.scala    From datasource-receiver   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource]
class DatasourceRDD(
                     @transient sqlContext: SQLContext,
                     inputSentences: InputSentences,
                     datasourceParams: Map[String, String]
                   ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  
  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart)
} 
Example 191
Source File: SparkEsTransportClientManager.scala    From Spark2Elasticsearch   with Apache License 2.0 5 votes vote down vote up
package com.github.jparkie.spark.elasticsearch.transport

import com.github.jparkie.spark.elasticsearch.conf.SparkEsTransportClientConf
import org.apache.spark.Logging
import org.elasticsearch.client.Client
import org.elasticsearch.client.transport.TransportClient
import org.elasticsearch.common.settings.Settings
import org.elasticsearch.common.transport.InetSocketTransportAddress

import scala.collection.mutable

private[elasticsearch] trait SparkEsTransportClientManager extends Serializable with Logging {
  @transient
  private[transport] val internalTransportClients = mutable.HashMap.empty[SparkEsTransportClientConf, TransportClient]

  private[transport] def buildTransportSettings(clientConf: SparkEsTransportClientConf): Settings = {
    val esSettingsBuilder = Settings.builder()

    clientConf.transportSettings foreach { currentSetting =>
      esSettingsBuilder.put(currentSetting._1, currentSetting._2)
    }

    esSettingsBuilder.build()
  }

  private[transport] def buildTransportClient(clientConf: SparkEsTransportClientConf, esSettings: Settings): TransportClient = {
    import SparkEsTransportClientConf._

    val esClient = TransportClient.builder()
      .settings(esSettings)
      .build()

    getTransportAddresses(clientConf.transportAddresses, clientConf.transportPort) foreach { inetSocketAddress =>
      esClient.addTransportAddresses(new InetSocketTransportAddress(inetSocketAddress))
    }

    sys.addShutdownHook {
      logInfo("Closed Elasticsearch Transport Client.")

      esClient.close()
    }

    logInfo(s"Connected to the following Elasticsearch nodes: ${esClient.connectedNodes()}.")

    esClient
  }

  
  def closeTransportClient(clientConf: SparkEsTransportClientConf): Unit = synchronized {
    internalTransportClients.remove(clientConf) match {
      case Some(transportClient) =>
        transportClient.close()
      case None =>
        logError(s"No TransportClient for $clientConf.")
    }
  }
}

object SparkEsTransportClientManager extends SparkEsTransportClientManager 
Example 192
Source File: MessageDelimiter.scala    From spark-cep   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.streaming.sources

import org.apache.spark.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal}
import org.apache.spark.sql.types.StructType

class MessageDelimiter extends MessageToRowConverter with Logging {
  val delimiter = " "

  def toRow(msg: String, schema: StructType): InternalRow = {
    val splitted = msg.split(delimiter).map(Literal(_))
    val casted = splitted.indices.map(i => Cast(splitted(i), schema(i).dataType).eval(EmptyRow))
    InternalRow.fromSeq(casted)
  }

  def toMessage(row: Row): String = row.mkString(delimiter)
}

trait MessageToRowConverter extends Serializable {
  def toRow(message: String, schema: StructType): InternalRow

  def toMessage(row: Row): String
} 
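
A minimal sketch of the converter above: a space-delimited message is cast field by field against a schema, and toMessage goes the other way from an external Row.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val converter = new MessageDelimiter
val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)))

val internal = converter.toRow("alice 42", schema)  // InternalRow(UTF8String("alice"), 42)
val message = converter.toMessage(Row("alice", 42)) // "alice 42"
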
Example 193
Source File: ApspResult.scala    From spark-all-pairs-shortest-path   with Apache License 2.0 5 votes vote down vote up
import java.io.Serializable
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.BlockMatrix
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel



class ApspResult (
                 var size: Long,
                 var distMatrix: BlockMatrix)
  extends Serializable with Logging{

  validateResult(distMatrix)

  private def validateResult(result: BlockMatrix): Unit = {
    require(result.numRows == result.numCols,
      "The shortest distance matrix is not square.")
    require(size == result.numRows,
      s"The size of the shortest distance matrix does not match $size.")
    if (result.blocks.getStorageLevel == StorageLevel.NONE) {
      logWarning("The APSP result is not cached. Lookup could be slow")
    }
  }

  def lookupDist(srcId: Long, dstId: Long): Double = {
    val sizePerBlock = distMatrix.rowsPerBlock
    val rowBlockId = (srcId/sizePerBlock).toInt
    val colBlockId = (dstId/sizePerBlock).toInt
    val block = distMatrix.blocks.filter { case ((i, j), _) => (i == rowBlockId) && (j == colBlockId) }
      .first._2
    block.toArray((dstId % sizePerBlock).toInt * block.numRows + (srcId % sizePerBlock).toInt)
  }

  def toLocal(): Matrix = {
    distMatrix.toLocalMatrix()
  }
} 
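
A minimal sketch of querying the result, assuming `n` and `distMatrix` are the size and the cached BlockMatrix of shortest-path distances produced by the all-pairs computation elsewhere in the project.

val result = new ApspResult(n, distMatrix)
val d = result.lookupDist(srcId = 0L, dstId = 3L) // shortest distance from vertex 0 to vertex 3
val local = result.toLocal()                      // collect to a local Matrix (small graphs only)
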
Example 194
Source File: SessionStats.scala    From twitter-stream-ml   with GNU General Public License v3.0 5 votes vote down vote up
package com.giorgioinf.twtml.spark

import com.giorgioinf.twtml.web.WebClient
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.viz.lightning.{Lightning,Visualization}
import scala.util.Try

class SessionStats(conf:ConfArguments) extends Logging {

  def lgn = Lightning(conf.lightning)
  def web = WebClient(conf.twtweb)
  var viz:Visualization = _

  // blue
  val realColorDet = Array(173.0, 216.0, 230.0)
  val realColor = Array(30.0, 144.0, 255.0)
  // yellow
  val predColorDet = Array(238.0, 232.0, 170.0)
  val predColor = Array(255.0, 215.0, 0.0)

  def update(count:Long, batch:Long, mse:Double,
      realStdev:Double, predStdev:Double,
      real:Array[Double], pred:Array[Double]) {

    val realStdevArr = Array.fill(batch.toInt)(realStdev)
    val predStdevArr = Array.fill(batch.toInt)(predStdev)

    Try(web.stats(count, batch, mse.toLong, realStdev.toLong, predStdev.toLong))

    Try(lgn.lineStreaming(
      series = Array(real, pred, realStdevArr, predStdevArr),
      viz = viz))
  }

  def open():this.type = {

    log.info("Initializing plot on lightning server: {}", conf.lightning)

    // lgn.createSession(conf.appName)

    // if (lgn.session.nonEmpty) {
    //   log.info("lightning server session: {}/sessions/{}{}", conf.lightning, lgn.session, "")
    // } else {
    //   log.warn("lightning server session is empty")
    // }

    // plot new graph
    viz = lgn.lineStreaming(
        series = Array.fill(4)(Array(0.0)),
        size = Array(1.0, 1.0, 2.0, 2.0),
        color = Array(realColorDet, predColorDet, realColor, predColor))

    log.info("lightning server session: \n  {}/sessions/{}\n  {}/visualizations/{}/pym",
      conf.lightning, viz.lgn.session, conf.lightning, viz.id)

    log.info("Initializing config on web server: {}", conf.twtweb)

    // send config to web server
    Try(web.config(viz.lgn.session, lgn.host, List(viz.id)))
    this
  }
} 
Example 195
Source File: LinearRegression.scala    From twitter-stream-ml   with GNU General Public License v3.0 5 votes vote down vote up
package com.giorgioinf.twtml.spark

import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter.TwitterUtils

object LinearRegression extends Logging {

  def main(args: Array[String]) {

    log.info("Parsing applications arguments")

    val conf = new ConfArguments()
      .setAppName("twitter-stream-ml-linear-regression")
      .parse(args.toList)

    log.info("Initializing session stats...")

    val session = new SessionStats(conf).open

    log.info("Initializing Spark Machine Learning Model...")

    MllibHelper.reset(conf)

    val model = new StreamingLinearRegressionWithSGD()
      .setNumIterations(conf.numIterations)
      .setStepSize(conf.stepSize)
      .setMiniBatchFraction(conf.miniBatchFraction)
      .setInitialWeights(Vectors.zeros(MllibHelper.numFeatures))

    log.info("Initializing Spark Context...")

    val sc = new SparkContext(conf.sparkConf)

    log.info("Initializing Streaming Spark Context... {} sec/batch", conf.seconds)

    val ssc = new StreamingContext(sc, Seconds(conf.seconds))

    log.info("Initializing Twitter stream...")

    val stream = TwitterUtils.createStream(ssc, None)
      .filter(MllibHelper.filtrate)
      .map(MllibHelper.featurize)
      .cache()

    log.info("Initializing prediction model...")

    val count = sc.accumulator(0L, "count")

    stream.foreachRDD({ rdd =>
      if (rdd.isEmpty) log.debug("batch: 0")
      else {
        val realPred = rdd.map{ lb =>
          (lb.label, Utils.round(model.latestModel.predict(lb.features)))
        }
        val batch = rdd.count
        count += batch
        val real = realPred.map(_._1)
        val pred = realPred.map(_._2)
        val realStdev = Utils.round(real.stdev)
        val predStdev = Utils.round(pred.stdev)
        val mse = Utils.round(realPred.map{case(v, p) => math.pow((v - p), 2)}.mean())

        if (log.isDebugEnabled) {
          log.debug("count: {}", count)
          // batch, mse (training mean squared error)
          log.debug("batch: {},  mse: {}", batch, mse)
          log.debug("stdev (real, pred): ({}, {})", realStdev.toLong,
            predStdev.toLong)
          log.debug("value (real, pred): {} ...", realPred.take(10).toArray)
        }

        session.update(count.value, batch, mse, realStdev, predStdev,
          real.toArray, pred.toArray);

      }

    })

    log.info("Initializing training model...")

    // training after prediction
    model.trainOn(stream)

    // Start the streaming computation
    ssc.start()
    log.info("Initialization complete.")
    ssc.awaitTermination()
  }

} 
Example 196
Source File: MllibHelper.scala    From twitter-stream-ml   with GNU General Public License v3.0 5 votes vote down vote up
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  def reset(conf:ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    // Reassign the derived object-level fields so later featurize calls pick up the new sizes
    // (declaring them as local `var`s here would only shadow the fields above).
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  
  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams over the accent-stripped text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features )
  }

  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
} 
Example 197
Source File: LogUtils.scala    From Spark-MLlib-Twitter-Sentiment-Analysis   with Apache License 2.0 5 votes vote down vote up
package org.p7h.spark.sentiment.utils

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{Logging, SparkContext}


object LogUtils extends Logging {

  def setLogLevels(sparkContext: SparkContext) {

    sparkContext.setLogLevel(Level.WARN.toString)
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      logInfo(
        """Setting log level to [WARN] for streaming executions.
          |To override add a custom log4j.properties to the classpath.""".stripMargin)
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
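
A minimal sketch: quieting log output before starting the streaming job; the local master URL is an assumption.

import org.apache.spark.{SparkConf, SparkContext}
import org.p7h.spark.sentiment.utils.LogUtils

val sc = new SparkContext(new SparkConf().setAppName("sentiment-analysis").setMaster("local[2]"))
LogUtils.setLogLevels(sc) // sets the context log level and the root log4j level to WARN
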
Example 198
Source File: CustomReceiver.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import java.io.{InputStreamReader, BufferedReader, InputStream}
import java.net.Socket

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver


  private def receive() {
   var socket: Socket = null
   var userInput: String = null
   try {
     logInfo("Connecting to " + host + ":" + port)
     socket = new Socket(host, port)
     logInfo("Connected to " + host + ":" + port)
     val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8"))
     userInput = reader.readLine()
     while(!isStopped && userInput != null) {
       store(userInput)
       userInput = reader.readLine()
     }
     reader.close()
     socket.close()
     logInfo("Stopped receiving")
     restart("Trying to connect again")
   } catch {
     case e: java.net.ConnectException =>
       restart("Error connecting to " + host + ":" + port, e)
     case t: Throwable =>
       restart("Error receiving data", t)
   }
  }
} 
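
A minimal sketch of plugging the receiver into a streaming job; the host/port constructor is an assumption based on how `host` and `port` are used in receive(), and the endpoint is a placeholder (for example `nc -lk 9999`).

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(
  new SparkConf().setAppName("custom-receiver").setMaster("local[2]"), Seconds(1))
val lines = ssc.receiverStream(new CustomReceiver("localhost", 9999)) // assumed constructor
lines.print()
ssc.start()
ssc.awaitTermination()
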
Example 199
Source File: StreamingExamples.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 200
Source File: GraphLoader.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.graphx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}


  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
    : Graph[Int, Int] =
  {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile

}
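
A minimal usage sketch for edgeListFile; the edge-list path is a placeholder and is expected to contain whitespace-separated source/destination vertex id pairs, with '#' comment lines skipped.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphLoader

val sc = new SparkContext(new SparkConf().setAppName("graph-loader-sketch").setMaster("local[*]"))
val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", canonicalOrientation = true) // assumed path
println(s"vertices: ${graph.vertices.count()}, edges: ${graph.edges.count()}")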