org.apache.log4j.Logger Scala Examples

The following examples show how to use org.apache.log4j.Logger. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
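Before diving into the project examples, here is a minimal, self-contained sketch of the Log4j 1.x calls most of them rely on: obtaining a per-class logger, adjusting log levels, and emitting messages. The object name and messages below are illustrative only and are not taken from any of the projects.

import org.apache.log4j.{BasicConfigurator, Level, Logger}

object LoggerBasics {
  // One logger per class/object is the usual pattern; getLogger caches by name.
  private val logger = Logger.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // Install a simple console appender so output appears without a log4j.properties file.
    BasicConfigurator.configure()

    // Quiet noisy third-party namespaces, as several of the Spark examples below do.
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getRootLogger.setLevel(Level.INFO)

    logger.info("application started")
    if (logger.isDebugEnabled) logger.debug("detailed state: " + 42)
    logger.warn("something looks off")
  }
}
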
Example 1
Source File: WortschatzParser.scala    From dbpedia-spotlight-model    with Apache License 2.0    7 votes
package org.dbpedia.spotlight.io

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger

import scala.io.Source



object WortschatzParser {

    val LOG = Logger.getLogger(this.getClass)

    def parse(filename: String) : CompactHashSet[String] = {
        parse(filename, count => true);
    }

    def parse(filename: String, minimumCount: Int) : CompactHashSet[String] = {
        parse(filename, count => (count > minimumCount) )
    }

    def parse(filename: String, minimumCount: Int, maximumCount: Int) : CompactHashSet[String] = {
        parse(filename, count => (count > minimumCount) && (count < maximumCount))
    }

    def parse(filename: String, condition: Int => Boolean) : CompactHashSet[String] = {
        LOG.info(" parsing common words file ")
        // get lines, split in three fields, get the middle one (word)
        val commonWords = new CompactHashSet[String]()

        Source.fromFile(filename, "iso-8859-1").getLines.foreach { line =>
            if (line.trim() != "") {
                val fields = line.split("\\s")
                if (condition(fields(2).toInt)) commonWords.add(fields(1))
            }
        }
        commonWords
    }
} 
Example 2
Source File: DenseKMeans.scala    From drizzle-spark    with Apache License 2.0    6 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 3
Source File: HttpUtil.scala    From sparta    with Apache License 2.0    6 votes
package com.stratio.benchmark.generator.utils

import org.apache.http.HttpStatus
import org.apache.http.client.methods.{HttpDelete, HttpGet, HttpPost, HttpPut}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.http.util.EntityUtils
import org.apache.log4j.Logger
import org.json4s.DefaultFormats
import org.json4s.native.JsonMethods._

import scala.io.Source

trait HttpUtil   {

  private val logger = Logger.getLogger(this.getClass)

  
  def createPolicy(policyContent: String, endpoint: String)(implicit defaultFormats: DefaultFormats): String = {

    val policyName = (parse(policyContent) \ "name").extract[String]

    // If a policy with this name already exists when the benchmark starts, stop and delete it first.
    getPolicyId(policyName, endpoint) match {
      case Some(id) =>
        stopPolicy(id, endpoint)
        deletePolicy(id, endpoint)
      case None => logger.debug(s"No policy with name $policyName exists in Sparta yet.")
    }

    val client = HttpClientBuilder.create().build()
    val post = new HttpPost(s"$endpoint/policyContext")
    post.setHeader("Content-type", "application/json")
    post.setEntity(new StringEntity(policyContent))
    val response = client.execute(post)

    if (response.getStatusLine.getStatusCode != HttpStatus.SC_OK)
      throw new IllegalStateException(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}")
    else {
      val entity = response.getEntity
      val policyId = (parse(EntityUtils.toString(entity)) \ "policyId").extract[String]
      policyId
    }
  }

  def getPolicyId(name: String, endpoint: String)(implicit defaultFormats: DefaultFormats): Option[String] = {
    val client = HttpClientBuilder.create().build()
    val get = new HttpGet(s"$endpoint/policy/findByName/$name")

    val response = client.execute(get)

    response.getStatusLine.getStatusCode match {
      case HttpStatus.SC_OK =>
        Option((parse(EntityUtils.toString(response.getEntity)) \ "id").extract[String])
      case _ => None
    }
  }

  def stopPolicy(id: String, endpoint: String): Unit = {
    val client = HttpClientBuilder.create().build()
    val put = new HttpPut(s"$endpoint/policyContext")
    put.setHeader("Content-Type", "application/json")
    val entity = new StringEntity(s"""{"id":"$id", "status":"Stopping"}""")
    put.setEntity(entity)
    val response = client.execute(put)

    if(response.getStatusLine.getStatusCode != HttpStatus.SC_CREATED) {
      logger.info(Source.fromInputStream(response.getEntity.getContent).mkString(""))
      logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}")
    }
  }

  def deletePolicy(id: String, endpoint: String): Unit = {
    val client = HttpClientBuilder.create().build()
    val delete = new HttpDelete(s"$endpoint/policy/$id")
    val response = client.execute(delete)

    if(response.getStatusLine.getStatusCode != HttpStatus.SC_OK)
      logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}")
  }
} 
Example 4
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
} 
Example 5
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }

} 
Example 6
Source File: JDBCSink.scala    From BigData-News    with Apache License 2.0    5 votes
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
  * ForeachWriter that writes data from Structured Streaming into MySQL.
  */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement();
    print("open")
    return true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"
    println("process step two")
    try {
      // Check whether a record with this title already exists
      var resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }

    } catch {
      // Cases ordered from most specific to most general so none are unreachable.
      case ex: SQLException =>
        println("SQLException")
      case ex: RuntimeException =>
        println("RuntimeException")
      case ex: Exception =>
        println("Exception")
      case ex: Throwable =>
        println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
} 
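For context, here is a hedged sketch of how a ForeachWriter such as the JDBCSink above is typically attached to a Structured Streaming query. The upstream aggregation producing the titleName and count columns and the connection settings are assumptions for illustration, not part of the original source.

// Sketch only: assumes `counts` is a streaming DataFrame with columns titleName and count.
val query = counts
  .writeStream
  .outputMode("update")
  .foreach(new JDBCSink("jdbc:mysql://localhost:3306/test", "user", "password"))
  .start()

query.awaitTermination()
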
Example 7
Source File: MySqlPool.scala    From BigData-News    with Apache License 2.0    5 votes
package com.vita.spark

import java.sql.{Connection, DriverManager}
import java.util

import org.apache.log4j.{LogManager, Logger}


/**
  * A simple pool from which MySQL JDBC connections are obtained.
  */
class MySqlPool(url: String, user: String, pwd: String) extends Serializable {
  // Maximum number of connections the pool may hold
  private val max = 3

  // Number of connections created per refill
  private val connectionNum = 1

  // Number of connections the pool has created so far
  private var conNum = 0

  private val pool = new util.LinkedList[Connection]() // the pool itself

  val LOGGER :Logger = LogManager.getLogger("vita")

  // Obtain a connection from the pool
  def getJdbcConn(): Connection = {
    LOGGER.info("getJdbcConn")
    // Synchronized block (AnyRef is the base class of all reference types, AnyVal of all value types)
    AnyRef.synchronized({
      if (pool.isEmpty) {
        // Make sure the JDBC driver is loaded
        preGetConn()
        for (i <- 1 to connectionNum) {
          val conn = DriverManager.getConnection(url, user, pwd)
          pool.push(conn)
          conNum += 1
        }
      }
      pool.poll()
    })
  }

  // Return a connection to the pool
  def releaseConn(conn: Connection): Unit = {
    pool.push(conn)
  }

  // Load the JDBC driver; wait if the pool has already created its maximum number of connections
  private def preGetConn(): Unit = {
    if (conNum >= max && pool.isEmpty) {
      LOGGER.info("JDBC pool has no free connection now, please wait a moment!")
      Thread.sleep(2000)
      preGetConn()
    } else {
      Class.forName("com.mysql.jdbc.Driver")
    }
  }
  }
} 
Example 8
Source File: StructuredStreamingOffset.scala    From BigData-News    with Apache License 2.0    5 votes
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger


object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  //topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    //init
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    // read the previously saved offset from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source: read from Kafka starting at the saved offset
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]

    val content = lines.map(x => readLogs(x._1, x._2.toString))

    val count = content.toDF("context", "offset")

    // sink: use a foreach writer to record the offset in Redis
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
Example 9
Source File: LinearRegression.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils

spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 10
Source File: BinaryClassification.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

spark-examples-*.jar \
          |  --algorithm LR --regType L2 --regParam 1.0 \
          |  data/mllib/sample_binary_classification_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"BinaryClassification with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val model = params.algorithm match {
      case LR =>
        val algorithm = new LogisticRegressionWithLBFGS()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
      case SVM =>
        val algorithm = new SVMWithSGD()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setStepSize(params.stepSize)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
    }

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val metrics = new BinaryClassificationMetrics(predictionAndLabel)

    println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.")
    println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 11
Source File: SparseNaiveBayes.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 12
Source File: StreamingExamples.scala    From drizzle-spark    with Apache License 2.0    5 votes
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
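A minimal sketch of how this helper is typically invoked at the top of a streaming example's main method, before any Spark logging happens. The application name and batch interval below are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyStreamingApp {
  def main(args: Array[String]): Unit = {
    StreamingExamples.setStreamingLogLevels() // quiet INFO output before anything else logs
    val conf = new SparkConf().setAppName("MyStreamingApp")
    val ssc = new StreamingContext(conf, Seconds(1))
    // ... build the DStream pipeline here ...
    ssc.start()
    ssc.awaitTermination()
  }
}
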
Example 13
Source File: YarnScheduler.scala    From drizzle-spark    with Apache License 2.0    5 votes
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 14
Source File: DLClassifierLeNet.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.MLPipeline

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _}
import com.intel.analytics.bigdl.dlframes.DLClassifier
import com.intel.analytics.bigdl.models.lenet.LeNet5
import com.intel.analytics.bigdl.models.lenet.Utils._
import com.intel.analytics.bigdl.nn.ClassNLLCriterion
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext


object DLClassifierLeNet {

  LoggerFilter.redirectSparkInfoLogs()

  def main(args: Array[String]): Unit = {
    val inputs = Array[String]("Feature data", "Label data")
    trainParser.parse(args, new TrainParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("MLPipeline Example")
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      val sqLContext = SQLContext.getOrCreate(sc)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val trainSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1)

      val trainingRDD : RDD[Data[Float]] = trainSet.
        asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => {
          val feature = batch.getInput().asInstanceOf[Tensor[Float]]
          val label = batch.getTarget().asInstanceOf[Tensor[Float]]
          Data[Float](feature.storage().array(), label.storage().array())
        })
      val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*)

      val model = LeNet5(classNum = 10)
      val criterion = ClassNLLCriterion[Float]()
      val featureSize = Array(28, 28)
      val estimator = new DLClassifier[Float](model, criterion, featureSize)
        .setFeaturesCol(inputs(0))
        .setLabelCol(inputs(1))
        .setBatchSize(param.batchSize)
        .setMaxEpoch(param.maxEpoch)
      val transformer = estimator.fit(trainingDF)

      val validationSet = DataSet.array(load(validationData, validationLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1)

      val validationRDD: RDD[Data[Float]] = validationSet.
        asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch =>
          val feature = batch.getInput().asInstanceOf[Tensor[Float]]
          val label = batch.getTarget().asInstanceOf[Tensor[Float]]
          Data[Float](feature.storage().array(), label.storage().array())
        }
      val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*)
      val transformed = transformer.transform(validationDF)
      transformed.show()
      sc.stop()
    })
  }
}

private case class Data[T](featureData : Array[T], labelData : Array[T]) 
Example 15
Source File: ImagePredictor.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.imageclassification

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.dlframes.DLClassifierModel
import com.intel.analytics.bigdl.example.imageclassification.MlUtils._
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext


object ImagePredictor {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO)

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).map(param => {
      val conf = Engine.createSparkConf()
      conf.setAppName("Predict with trained model")
      val sc = new SparkContext(conf)
      Engine.init
      val sqlContext = new SQLContext(sc)

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val model = loadModel(param)
      val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize))
        .setBatchSize(param.batchSize)
        .setFeaturesCol("features")
        .setPredictionCol("predict")

      val valRDD = if (param.isHdfs) {
        // load image set from hdfs
        imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true)
      } else {
        // load image set from local
        val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false)
        sc.parallelize(imagesLoad(paths, 256), partitionNum)
      }

      val transf = RowToByteRecords() ->
          BytesToBGRImg() ->
          BGRImgCropper(imageSize, imageSize) ->
          BGRImgNormalizer(testMean, testStd) ->
          BGRImgToImageVector()

      val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf)

      valTrans.transform(valDF)
          .select("imageName", "predict")
          .collect()
          .take(param.showNum)
          .foreach(println)
      sc.stop()
    })
  }
} 
Example 16
Source File: RowToByteRecords.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.imageclassification

import com.intel.analytics.bigdl.dataset.{ByteRecord, Transformer}
import org.apache.log4j.Logger
import org.apache.spark.sql.Row

import scala.collection.Iterator

object RowToByteRecords {
  val logger = Logger.getLogger(getClass)

  def apply(colName: String = "data"): RowToByteRecords = {
    new RowToByteRecords(colName)
  }
}


class RowToByteRecords(colName: String)
  extends Transformer[Row, ByteRecord] {

  override def apply(prev: Iterator[Row]): Iterator[ByteRecord] = {
    prev.map(
      img => {
        ByteRecord(img.getAs[Array[Byte]](colName), -1.0f)
      }
    )
  }
} 
Example 17
Source File: ImageNetInference.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext


object ImageNetInference {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger: Logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Test model on ImageNet2012 with Int8")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)

      val model = Module.loadModule[Float](param.model).quantize()
      model.evaluate()

      val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float],
        new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
} 
Example 18
Source File: GenerateInt8Scales.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch}
import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.{Graph, Module}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


object GenerateInt8Scales {
  val logger: Logger = Logger.getLogger(getClass)
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def genereateInt8Scales(model: Graph[Float], modelName: String,
    evaluationSet: RDD[MiniBatch[Float]]): Unit = {
    model.evaluate()

    model.setInputDimMask(0, true)
    model.setOutputDimMask(0, true)
    model.setWeightDimMask(1, true)

    logger.info(s"Generate the scales for $modelName ...")
    val samples = evaluationSet
      .repartition(1) // repartition (shuffle) will have better accuracy
      .take(1) // only split one batch to sample
      .map(_.getInput().toTensor[Float])

    samples.foreach { sample =>
      model.forward(sample)
      model.calcScales(sample)
    }

    // we should clean the state, such as output
    model.clearState()

    logger.info(s"Generate the scales for $modelName done.")
  }

  def saveQuantizedModel(model: Graph[Float], modelName: String): Unit = {
    val suffix = ".bigdl"
    val prefix = modelName.stripSuffix(suffix)
    val name = prefix.concat(".quantized").concat(suffix)
    logger.info(s"Save the quantized model $name ...")
    // it will force overWrite the existed model file
    model.saveModule(name, overWrite = true)
    logger.info(s"Save the quantized model $name done.")
  }

  def main(args: Array[String]): Unit = {
    genInt8ScalesParser.parse(args, GenInt8ScalesParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Quantize the model")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber()
      val imageFrame = DataSet.SeqFileFolder.filesToImageFrame(param.folder, sc, 1000,
        partitionNum = Option(partitionNum))

      // the transformer is the same as the one used for validation during training
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)
      // Currently, only graph models are supported, so we convert with `toGraph`;
      // if the model is already a graph, this step is unnecessary.
      val model = Module.loadModule[Float](param.model).toGraph()
      genereateInt8Scales(model, param.model, evaluationSet)
      saveQuantizedModel(model, param.model)
    }
  }
} 
Example 19
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.{DataSet, SampleToBatch}
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val evaluationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) ->
        GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToSample() -> SampleToBatch(
        batchSize = param.batchSize, None, None, None,
        partitionNum = Some(1))

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet.toLocal(),
        Array(new Top1Accuracy[Float].asInstanceOf[ValidationMethod[Float]]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
    }
  }
} 
Example 20
Source File: Predict.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.dataset.Sample
import com.intel.analytics.bigdl.optim.LocalPredictor
import org.apache.log4j.{Level, Logger}

import scala.collection.mutable.ArrayBuffer

object Predict {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).foreach { param =>

      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", (param.coreNumber.toString))
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val rawData = load(validationData, validationLabel)
      val iter = rawData.iterator
      val sampleIter = GreyImgToSample()(
          GreyImgNormalizer(trainMean, trainStd)(
          BytesToGreyImg(28, 28)(iter)))
      var samplesBuffer = ArrayBuffer[Sample[Float]]()
      while (sampleIter.hasNext) {
        val elem = sampleIter.next().clone()
        samplesBuffer += elem
      }
      val samples = samplesBuffer.toArray

      val model = Module.load[Float](param.model)
      val localPredictor = LocalPredictor(model)
      val result = localPredictor.predict(samples)
      val result_class = localPredictor.predictClass(samples)
      result_class.foreach(r => println(s"${r}"))
    }
  }
} 
Example 21
Source File: Train.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import com.intel.analytics.bigdl.models.lenet.LeNet5
import org.apache.log4j.{Level, Logger}


object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {

      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        LeNet5(classNum = 10)
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate,
          learningRateDecay = param.learningRateDecay)
      }

      val trainSet = DataSet.array(load(trainData, trainLabel)) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(
        param.batchSize)

      val optimizer = Optimizer(
        model = model,
        dataset = trainSet,
        criterion = ClassNLLCriterion[Float]())
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      if(param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      val validationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(
        param.batchSize)

      optimizer
        .setValidation(
          trigger = Trigger.everyEpoch,
          dataset = validationSet,
          vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()
    })
  }
} 
Example 22
Source File: Utils.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.dataset

import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.Logger

object Utils {
  private val logger = Logger.getLogger(getClass)

  def getBatchSize(batchSize : Int, totalPartition: Option[Int] = None): Int = {
    val nodeNumber = Engine.nodeNumber()
    val partitionNum = totalPartition.getOrElse(nodeNumber)
    logger.debug(s"partition number: $partitionNum, node number: $nodeNumber")

    require(partitionNum > 0,
      s"Utils.getBatchSize: partitionNum should be larger than 0, but get $partitionNum")
    require(batchSize % partitionNum == 0, s"Utils.getBatchSize: total batch size $batchSize " +
      s"should be divided by partitionNum ${partitionNum}")

    val batchPerUnit = batchSize / partitionNum
    logger.debug(s"Batch per unit: $batchPerUnit")
    batchPerUnit
  }
} 
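A brief, hedged illustration of what getBatchSize computes; the numbers are arbitrary, and Engine must already be initialized for the call to succeed.

// Divide a global batch size evenly across partitions (e.g. one partition per node).
val perPartition = Utils.getBatchSize(batchSize = 128, totalPartition = Some(4)) // 32
// Utils.getBatchSize(100, Some(8)) would fail the divisibility require() above.
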
Example 23
Source File: BGRImgToImageVector.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.dataset.image

import com.intel.analytics.bigdl.dataset.Transformer
import org.apache.log4j.Logger
import org.apache.spark.mllib.linalg.DenseVector

import scala.collection.Iterator

object BGRImgToImageVector {
  val logger = Logger.getLogger(getClass)

  def apply(): BGRImgToImageVector = {
    new BGRImgToImageVector()
  }
}


class BGRImgToImageVector()
  extends Transformer[LabeledBGRImage, DenseVector] {

  private var featureData: Array[Float] = null

  override def apply(prev: Iterator[LabeledBGRImage]): Iterator[DenseVector] = {
    prev.map(
      img => {
        if (null == featureData) {
          featureData = new Array[Float](3 * img.height() * img.width())
        }
        img.copyTo(featureData, 0, true)
        new DenseVector(featureData.map(_.toDouble))
      }
    )
  }
} 
Example 24
Source File: LocalImageFiles.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.dataset.image

import java.awt.color.ColorSpace
import java.nio.file.{Files, Path}

import org.apache.log4j.Logger

object LocalImageFiles {
  Class.forName("javax.imageio.ImageIO")
  Class.forName("java.awt.color.ICC_ColorSpace")
  // Class.forName("sun.java2d.cmm.lcms.LCMS")
  ColorSpace.getInstance(ColorSpace.CS_sRGB).toRGB(Array[Float](0, 0, 0))

  val logger = Logger.getLogger(getClass)

  
  private[bigdl] def readPaths(path: Path, hasLabel: Boolean = true)
  : Array[LocalLabeledImagePath] = {
    if (hasLabel) readPathsWithLabel(path) else readPathsNoLabel(path)
  }
} 
Example 25
Source File: FeatureTransformer.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.transform.vision.image

import com.intel.analytics.bigdl.dataset.{ChainedTransformer, Transformer}
import com.intel.analytics.bigdl.opencv.OpenCV
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger


class ChainedFeatureTransformer(first: FeatureTransformer, last: FeatureTransformer) extends
  FeatureTransformer {

  override def transform(prev: ImageFeature): ImageFeature = {
    last.transform(first.transform(prev))
  }

  override def enableIgnoreException(): this.type = {
    first.enableIgnoreException()
    last.enableIgnoreException()
    this
  }
} 
Example 26
Source File: ChannelScaledNormalizer.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.transform.vision.image.augmentation

import com.intel.analytics.bigdl.dataset.image.LabeledBGRImage
import com.intel.analytics.bigdl.dataset.{LocalDataSet, Transformer}
import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature}
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger

import scala.collection.Iterator

object ChannelScaledNormalizer {

  def apply(meanR: Int, meanG: Int, meanB: Int, scale: Double): ChannelScaledNormalizer = {
    new ChannelScaledNormalizer(meanR, meanG, meanB, scale)
  }
}



class ChannelScaledNormalizer(meanR: Int, meanG: Int, meanB: Int, scale: Double)
  extends FeatureTransformer {

  override protected def transformMat(feature: ImageFeature): Unit = {
    val mat = feature.opencvMat()
    val toFloats = OpenCVMat.toFloatPixels(mat)
    val content = toFloats._1
    require(content.length % 3 == 0, "Content should be multiple of 3 channels")
    var i = 0
    val frameLength = content.length / 3
    val height = toFloats._2
    val width = toFloats._3
    val bufferContent = new Array[Float](width * height * 3)

    val channels = 3
    val mean = Array(meanR, meanG, meanB)
    var c = 0
    while (c < channels) {
      i = 0
      while (i < frameLength) {
        val data_index = c * frameLength + i
        bufferContent(data_index) = ((content(data_index) - mean(c)) * scale).toFloat
        i += 1
      }
      c += 1
    }
    if (mat != null) {
      mat.release()
    }
    val newMat = OpenCVMat.fromFloats(bufferContent, height, width)
    feature(ImageFeature.mat) = newMat
  }

} 
Example 27
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.Module
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.models.resnet.Utils._
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod, ValidationResult}
import com.intel.analytics.bigdl.dataset.image.{BGRImgNormalizer, BGRImgToSample, BytesToBGRImg}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test ResNet on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)

      Engine.init
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()

      val rddData = sc.parallelize(loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() -> BGRImgNormalizer(Cifar10DataSet.trainMean,
          Cifar10DataSet.trainStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      println(model)
      val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]),
        Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
} 
Example 28
Source File: TestImageNet.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.CropCenter
import com.intel.analytics.bigdl.models.resnet.ResNet.DatasetType
import com.intel.analytics.bigdl.nn.{Module, StaticGraph}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.transform.vision.image.{ImageFeature, MTImageFeatureToBatch, MatToTensor, PixelBytesToMat}
import com.intel.analytics.bigdl.transform.vision.image.augmentation.{ChannelScaledNormalizer, RandomCropper, RandomResize}
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext


object TestImageNet {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)
  val logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Test model on ImageNet2012")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val model = Module.loadModule[Float](param.model)
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
} 
Example 29
Source File: TrainCIFAR10.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Module}
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.models.resnet.ResNet.{DatasetType, ShortcutType}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._

object TrainCIFAR10 {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def cifar10Decay(epoch: Int): Double =
    if (epoch >= 122) 2.0 else if (epoch >= 81) 1.0 else 0.0

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train ResNet on Cifar10")
        // Will throw exception without this config when has only one executor
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val batchSize = param.batchSize
      val (imageSize, lrSchedule, maxEpoch, dataSet) =
        (32, DatasetType.CIFAR10, param.nepochs, Cifar10DataSet)

      val trainDataSet = dataSet.trainDataSet(param.folder, sc, imageSize, batchSize)

      val validateSet = dataSet.valDataSet(param.folder, sc, imageSize, batchSize)

      val shortcut: ShortcutType = param.shortcutType match {
        case "A" => ShortcutType.A
        case "B" => ShortcutType.B
        case _ => ShortcutType.C
      }

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        val curModel = if (param.graphModel) {
          ResNet.graph(param.classes,
            T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet))
        } else {
          ResNet(param.classes,
            T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet))
        }
        if (param.optnet) {
          ResNet.shareGradInput(curModel)
        }
        ResNet.modelInit(curModel)
        curModel
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = param.momentum, dampening = param.dampening,
          nesterov = param.nesterov, learningRateSchedule = SGD.EpochDecay(cifar10Decay))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new CrossEntropyCriterion[Float]()
      )
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      optimizer
        .setOptimMethod(optimMethod)
        .setValidation(Trigger.everyEpoch,
          validateSet, Array(new Top1Accuracy[Float]))
        .setEndWhen(Trigger.maxEpoch(maxEpoch))
        .optimize()
      sc.stop()

    })
  }
} 
Example 30
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.inception

import com.intel.analytics.bigdl.dataset.{ByteRecord, DataSet}
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Top5Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.hadoop.io.Text
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Options._

  val imageSize = 224

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val batchSize = param.batchSize.getOrElse(128)
      val conf = Engine.createSparkConf().setAppName("Test Inception on ImageNet")
      val sc = new SparkContext(conf)
      Engine.init

      // We set the partition number to node * core; you can also assign a different partitionNum
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rawData = sc.sequenceFile(param.folder, classOf[Text], classOf[Text], partitionNum)
        .map(image => {
          ByteRecord(image._2.copyBytes(), DataSet.SeqFileFolder.readLabel(image._1).toFloat)
        }).coalesce(partitionNum, true)

      val rddData = DataSet.SeqFileFolder.filesToRdd(param.folder, sc, 1000)
      val transformer = BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize, CropCenter) ->
        HFlip(0.5) -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]), param.batchSize)

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
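
Most of the driver programs in these examples quiet the framework loggers the same way before any job runs. A minimal standalone sketch of that pattern (the object name is made up for illustration):

import org.apache.log4j.{Level, Logger}

object QuietLogging {
  def main(args: Array[String]): Unit = {
    // Raise the threshold of the noisiest frameworks to ERROR so only this
    // application's own INFO messages remain visible.
    Seq("org", "akka", "breeze").foreach(name => Logger.getLogger(name).setLevel(Level.ERROR))

    val log = Logger.getLogger(getClass)
    log.info("application-level message, still visible")
  }
}
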
Example 31
Source File: Test.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.models.lenet

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.Top1Accuracy
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Lenet on MNIST")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(load(validationData, validationLabel), partitionNum)
      val transformer =
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
Example 32
Source File: Train.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.models.lenet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, CrossEntropyCriterion, Module}
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Train Lenet on MNIST")
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) {
          LeNet5.graph(classNum = 10)
        } else {
          Engine.getEngineType() match {
            case MklBlas => LeNet5(10)
            case MklDnn => LeNet5.dnnGraph(param.batchSize / Engine.nodeNumber(), 10)
          }
        }
      }
      val criterion = Engine.getEngineType() match {
        case MklBlas => ClassNLLCriterion()
        case MklDnn => CrossEntropyCriterion()
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate,
          learningRateDecay = param.learningRateDecay)
      }

      val trainSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(
        param.batchSize)

      val optimizer = Optimizer(
        model = model,
        dataset = trainSet,
        criterion = criterion)
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      if(param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      val validationSet = DataSet.array(load(validationData, validationLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(
        param.batchSize)

      optimizer
        .setValidation(
          trigger = Trigger.everyEpoch,
          dataset = validationSet,
          vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
} 
Example 33
Source File: Train.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.models.autoencoder

import java.nio.file.Paths

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch, Transformer}
import com.intel.analytics.bigdl.nn.{MSECriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, OptimizerV1, OptimizerV2, T, Table}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

import scala.reflect.ClassTag

object toAutoencoderBatch {
  def apply(): toAutoencoderBatch[Float] = new toAutoencoderBatch[Float]()
}

class toAutoencoderBatch[T: ClassTag](implicit ev: TensorNumeric[T]
      )extends Transformer[MiniBatch[T], MiniBatch[T]] {
  override def apply(prev: Iterator[MiniBatch[T]]): Iterator[MiniBatch[T]] = {
    prev.map(batch => {
      MiniBatch(batch.getInput().toTensor[T], batch.getInput().toTensor[T])
    })
  }
}

object Train {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Autoencoder on MNIST")

      val sc = new SparkContext(conf)
      Engine.init

      val trainData = Paths.get(param.folder, "/train-images-idx3-ubyte")
      val trainLabel = Paths.get(param.folder, "/train-labels-idx1-ubyte")

      val trainDataSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToBatch(param.batchSize) -> toAutoencoderBatch()

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) Autoencoder.graph(classNum = 32) else Autoencoder(classNum = 32)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new Adagrad[Float](learningRate = 0.01, learningRateDecay = 0.0, weightDecay = 0.0005)
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new MSECriterion[Float]()
      )

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      optimizer
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()
      sc.stop()
    })
  }
} 
Example 34
Source File: Test.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.models.vgg

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.models.lenet.Utils._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Vgg on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(Utils.loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
Example 35
Source File: Train.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.models.vgg

import java.text.SimpleDateFormat
import java.util.Date

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Vgg on Cifar10")
        // Will throw an exception without this config when there is only one executor
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val trainDataSet = DataSet.array(Utils.loadTrain(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(trainMean, trainStd) ->
        BGRImgToBatch(param.batchSize)

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) VggForCifar10.graph(classNum = 10) else VggForCifar10(classNum = 10)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = 0.9, dampening = 0.0, nesterov = false,
          learningRateSchedule = SGD.EpochStep(25, 0.5))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new ClassNLLCriterion[Float]()
      )

      val validateSet = DataSet.array(Utils.loadTest(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) ->
        BGRImgToBatch(param.batchSize)

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      if (param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      if (param.summaryPath.isDefined) {
        val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val timeStamp = sdf.format(new Date())
        val trainSummry = new TrainSummary(param.summaryPath.get,
          s"vgg-on-cifar10-train-$timeStamp")
        optimizer.setTrainSummary(trainSummry)
        val validationSummary = new ValidationSummary(param.summaryPath.get,
          s"vgg-on-cifar10-val-$timeStamp")
        optimizer.setValidationSummary(validationSummary)
      }

      optimizer
        .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
} 
Example 36
Source File: Validator.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.dataset.{DistributedDataSet, LocalDataSet, MiniBatch}
import org.apache.log4j.Logger


abstract class Validator[T, D](
  model: Module[T],
  dataSet: DataSet[D]
) {
  def test(vMethods: Array[ValidationMethod[T]]): Array[(ValidationResult, ValidationMethod[T])]
}
@deprecated(
  "Validator(model, dataset) is deprecated. Please use model.evaluate instead",
  "0.2.0")
object Validator {
  private val logger = Logger.getLogger(getClass)

  def apply[T, D](model: Module[T], dataset: DataSet[D]): Validator[T, D] = {
    logger.warn("Validator(model, dataset) is deprecated. Please use model.evaluate instead")
    dataset match {
      case d: DistributedDataSet[_] =>
        new DistriValidator[T](
          model = model,
          dataSet = d.asInstanceOf[DistributedDataSet[MiniBatch[T]]]
        ).asInstanceOf[Validator[T, D]]
      case d: LocalDataSet[_] =>
        new LocalValidator[T](
          model = model,
          dataSet = d.asInstanceOf[LocalDataSet[MiniBatch[T]]]
        ).asInstanceOf[Validator[T, D]]
      case _ =>
        throw new UnsupportedOperationException
    }
  }
} 
Example 37
Source File: DistriValidator.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch}
import com.intel.analytics.bigdl.optim.DistriValidator._
import com.intel.analytics.bigdl.utils.{Engine, MklBlas}
import org.apache.log4j.Logger

object DistriValidator {
  val logger = Logger.getLogger(this.getClass)
}

class DistriValidator[T] private[optim](
  model: Module[T],
  dataSet: DistributedDataSet[MiniBatch[T]]
) extends Validator[T, MiniBatch[T]](model, dataSet) {
  override def test(vMethods: Array[ValidationMethod[T]])
  : Array[(ValidationResult, ValidationMethod[T])] = {

    val rdd = dataSet.data(train = false)
    val broadcastModel = rdd.sparkContext.broadcast((model.evaluate(), vMethods))
    val _subModelNumber = Engine.getEngineType match {
      case MklBlas => Engine.coreNumber()
      case _ => throw new IllegalArgumentException
    }
    val nExecutor = Engine.nodeNumber()
    val executorCores = Engine.coreNumber()
    rdd.mapPartitions(dataIter => {
      Engine.setNodeAndCore(nExecutor, executorCores)
      val localModel = broadcastModel.value._1
      val localMethod = broadcastModel.value._2
      logger.info("model thread pool size is " + Engine.model.getPoolSize)
      val workingModels = (1 to _subModelNumber)
        .map(_ => localModel.cloneModule().evaluate()).toArray
      val vMethodsArr = (1 to _subModelNumber).map(i => localMethod.map(_.clone())).toArray
      dataIter.map(batch => {
        val stackSize = batch.size() / _subModelNumber
        val extraSize = batch.size() % _subModelNumber
        val parallelism = if (stackSize == 0) extraSize else _subModelNumber
        Engine.default.invokeAndWait(
          (0 until parallelism).map(b =>
            () => {
              val offset = b * stackSize + math.min(b, extraSize) + 1
              val length = stackSize + (if (b < extraSize) 1 else 0)
              val currentMiniBatch = batch.slice(offset, length)
              val input = currentMiniBatch.getInput()
              val target = currentMiniBatch.getTarget()
              val output = workingModels(b).forward(input)
              val validatMethods = vMethodsArr(b)
              validatMethods.map(validation => {
                validation(output, target)
              })
            }
          )
        ).reduce((left, right) => {
          left.zip(right).map { case (l, r) =>
            l + r
          }
        })
      })
    }).reduce((left, right) => {
      left.zip(right).map { case (l, r) =>
        l + r
      }
    }).zip(vMethods)
  }
} 
Example 38
Source File: LocalValidator.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl.dataset.{LocalDataSet, MiniBatch}
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.utils.{Engine, MklBlas}
import org.apache.log4j.Logger

object LocalValidator {
  val logger = Logger.getLogger(getClass)
}


class LocalValidator[T] private[optim](model: Module[T], dataSet: LocalDataSet[MiniBatch[T]])
  extends Validator[T, MiniBatch[T]](model, dataSet) {

  val logger = LocalValidator.logger
  private val coreNumber = Engine.coreNumber()

  private val subModelNumber = Engine.getEngineType match {
    case MklBlas => coreNumber
    case _ => throw new IllegalArgumentException
  }

  private val workingModels = (1 to subModelNumber).map(_ => model.cloneModule().evaluate()).toArray

  override def test(vMethods: Array[ValidationMethod[T]])
  : Array[(ValidationResult, ValidationMethod[T])] = {
    val dataIter = dataSet.data (train = false)
    var count = 0
    val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray
    logger.info("model thread pool size is " + Engine.model.getPoolSize)
    dataIter.map(batch => {
      val stackSize = batch.size() / subModelNumber
      val extraSize = batch.size() % subModelNumber
      val parallelism = if (stackSize == 0) extraSize else subModelNumber
      val start = System.nanoTime()
      val result = Engine.default.invokeAndWait(
        (0 until parallelism).map(b =>
          () => {
            val offset = b * stackSize + math.min(b, extraSize) + 1
            val length = stackSize + (if (b < extraSize) 1 else 0)
            val currentMiniBatch = batch.slice(offset, length)
            val input = currentMiniBatch.getInput()
            val target = currentMiniBatch.getTarget()
            val output = workingModels(b).forward(input)
            val validatMethods = vMethodsArr(b)
            validatMethods.map(validation => {
              validation(output.asInstanceOf[Tensor[T]], target)
            })
          }
        )
      ).reduce((left, right) => {
        left.zip(right).map { case (l, r) =>
          l + r
        }
      })
      count += batch.size()
      logger.info(s"[Validation] $count/${dataSet.size()} Throughput is ${
        batch.size() / ((System.nanoTime() - start) / 1e9)
      } record / sec")
      result
    }).reduce((left, right) => {
      left.zip(right).map { case (l, r) =>
        l + r
      }
    }).zip(vMethods)
  }
} 
Example 39
Source File: BigDLSpecHelper.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.utils

import java.io.{File => JFile}

import org.apache.log4j.Logger
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}

import scala.collection.mutable.ArrayBuffer

abstract class BigDLSpecHelper extends FlatSpec with Matchers with BeforeAndAfter {
  protected val logger = Logger.getLogger(getClass)

  private val tmpFiles : ArrayBuffer[JFile] = new ArrayBuffer[JFile]()

  protected def createTmpFile(): JFile = {
    val file = java.io.File.createTempFile("UnitTest", "BigDLSpecBase")
    logger.info(s"created file $file")
    tmpFiles.append(file)
    file
  }

  protected def getFileFolder(path: String): String = {
    path.substring(0, path.lastIndexOf(JFile.separator))
  }

  protected def getFileName(path: String): String = {
    path.substring(path.lastIndexOf(JFile.separator) + 1)
  }

  def doAfter(): Unit = {}

  def doBefore(): Unit = {}

  before {
    doBefore()
  }

  after {
    doAfter()
    tmpFiles.foreach(f => {
      if (f.exists()) {
        require(f.isFile, "cannot clean folder")
        f.delete()
        logger.info(s"deleted file $f")
      }
    })
  }
} 
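
A spec built on this helper gets the per-class logger and temp-file cleanup for free. A minimal sketch (the spec name and assertion are made up; it assumes the spec sits next to BigDLSpecHelper on the test classpath or imports it from com.intel.analytics.bigdl.utils):

class TmpFileSpec extends BigDLSpecHelper {
  "createTmpFile" should "hand out a file that is cleaned up after the test" in {
    val file = createTmpFile()
    logger.info(s"writing scratch data to $file")
    file.exists() should be(true)
  }
}
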
Example 40
Source File: RefLocalOptimizer.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl.DataSet
import com.intel.analytics.bigdl.dataset.MiniBatch
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.optim.DistriOptimizer.getClass
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import org.apache.log4j.Logger

import scala.reflect.ClassTag


class RefLocalOptimizer[T: ClassTag](
  model: Module[T],
  dataset: DataSet[MiniBatch[T]],
  criterion: Criterion[T]
)(implicit ev: TensorNumeric[T]) extends Optimizer[T, MiniBatch[T]](model, dataset, criterion) {

  val logger: Logger = Logger.getLogger(getClass)

  val (w, g) = model.getParameters()

  override def optimize(): Module[T] = {
    val data = dataset.toLocal().data(train = true)
    var count = 0
    state("epoch") = state.get[Int]("epoch").getOrElse(1)
    state("neval") = state.get[Int]("neval").getOrElse(1)
    while (!endWhen(state)) {
      val batch = data.next()
      val input = batch.getInput
      val target = batch.getTarget
      model.training()
      model.zeroGradParameters()
      val output = model.forward(input).asInstanceOf[Tensor[T]]
      val loss = criterion.forward(output, target)
      model.backward(input, criterion.backward(output, target))
      optimMethods.head._2.optimize(_ => (loss, g), w, state)
      count += batch.size()
      state("neval") = state[Int]("neval") + 1
      logger.info(s"loss is $loss")
      if (count >= dataset.size()) {
        state("epoch") = state[Int]("epoch") + 1
        count = 0
      }
    }

    model
  }
} 
Example 41
Source File: ParallelOptimizerSpec.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, MSECriterion}
import com.intel.analytics.bigdl.optim.DistriOptimizerSpecModel.mse
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.utils.{Engine, T}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}

@com.intel.analytics.bigdl.tags.Serial
class ParallelOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)

  private var sc: SparkContext = _

  before {
    val conf = Engine.createSparkConf()
      .setMaster("local[1]").setAppName("ParallelOptimizerSpec")
    sc = new SparkContext(conf)
    Engine.init
    Engine.setCoreNumber(1)
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }

  "Train with parallel" should "work properly" in {
    val input = Tensor[Float](1, 10).fill(1.0f)
    val target = Tensor[Float](1).fill(1.0f)
    val miniBatch = MiniBatch(input, target)
    val model = Linear[Float](10, 2)
    model.getParameters()._1.fill(1.0f)
    val optimMethod = new SGD[Float]()

    val dataSet = DataSet.array(Array(miniBatch), sc)

    val optimizer = new DistriOptimizer[Float](model, dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    optimizer.optimize()

  }

  "Train with parallel" should "have same results as DistriOptimizer" in {

    val input = Tensor[Float](1, 10).fill(1.0f)
    val target = Tensor[Float](1).fill(1.0f)
    val miniBatch = MiniBatch(input, target)
    val model1 = Linear[Float](10, 2)
    model1.getParameters()._1.fill(1.0f)

    val model2 = Linear[Float](10, 2)
    model2.getParameters()._1.fill(1.0f)

    val dataSet = DataSet.array(Array(miniBatch), sc)

    val parallelOptimizer = new DistriOptimizer[Float](model1,
      dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    parallelOptimizer.optimize

    val distriOptimizer = new DistriOptimizer[Float](model2,
      dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    distriOptimizer.optimize

    model1.getParameters()._1 should be (model2.getParameters()._1)

  }

} 
Example 42
Source File: SFObjectWriter.scala    From spark-salesforce   with Apache License 2.0 5 votes vote down vote up
package com.springml.spark.salesforce

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SaveMode}
import com.springml.salesforce.wave.api.APIFactory
import com.springml.salesforce.wave.api.BulkAPI
import com.springml.salesforce.wave.util.WaveAPIConstants
import com.springml.salesforce.wave.model.JobInfo



class SFObjectWriter (
    val username: String,
    val password: String,
    val login: String,
    val version: String,
    val sfObject: String,
    val mode: SaveMode,
    val upsert: Boolean,
    val externalIdFieldName: String,
    val csvHeader: String
    ) extends Serializable {

  @transient val logger = Logger.getLogger(classOf[SFObjectWriter])

  def writeData(rdd: RDD[Row]): Boolean = {
    val csvRDD = rdd.map(row => row.toSeq.map(value => Utils.rowValue(value)).mkString(","))

    val jobInfo = new JobInfo(WaveAPIConstants.STR_CSV, sfObject, operation(mode, upsert))
    jobInfo.setExternalIdFieldName(externalIdFieldName)

    val jobId = bulkAPI.createJob(jobInfo).getId

    csvRDD.mapPartitionsWithIndex {
      case (index, iterator) => {
        val records = iterator.toArray.mkString("\n")
        var batchInfoId : String = null
        if (records != null && !records.isEmpty()) {
          val data = csvHeader + "\n" + records
          val batchInfo = bulkAPI.addBatch(jobId, data)
          batchInfoId = batchInfo.getId
        }

        val success = (batchInfoId != null)
        // Job status will be checked after completing all batches
        List(success).iterator
      }
    }.reduce((a, b) => a & b)

    bulkAPI.closeJob(jobId)
    var i = 1
    while (i < 999999) {
      if (bulkAPI.isCompleted(jobId)) {
        logger.info("Job completed")
        return true
      }

      logger.info("Job not completed, waiting...")
      Thread.sleep(200)
      i = i + 1
    }

    print("Returning false...")
    logger.info("Job not completed. Timeout..." )
    false
  }

  // Create new instance of BulkAPI every time because Spark workers cannot serialize the object
  private def bulkAPI(): BulkAPI = {
    APIFactory.getInstance().bulkAPI(username, password, login, version)
  }

  private def operation(mode: SaveMode, upsert: Boolean): String = {
    if (upsert) {
      "upsert"
    } else if (mode != null && SaveMode.Overwrite.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_UPDATE
    } else if (mode != null && SaveMode.Append.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_INSERT
    } else {
      logger.warn("SaveMode " + mode + " Not supported. Using 'insert' operation")
      WaveAPIConstants.STR_INSERT
    }
  }

} 
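
Note the @transient modifier on the logger above: log4j Loggers are not serializable, so classes that travel inside Spark closures either mark the field @transient or re-resolve it lazily on the executor. A small sketch of the lazy variant (the class name is made up for illustration):

import org.apache.log4j.Logger

class TaskSideLogging extends Serializable {
  // Re-created after deserialization on each executor instead of being shipped with the closure.
  @transient private lazy val logger = Logger.getLogger(classOf[TaskSideLogging])

  def work(x: Int): Int = {
    logger.debug(s"processing $x")
    x * 2
  }
}
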
Example 43
Source File: TopWORDSApp.scala    From topwords   with GNU General Public License v3.0 5 votes vote down vote up
package io.github.qf6101.topwords

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession


object TopWORDSApp extends Serializable {
  @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString)

  def main(args: Array[String]) {
    // setup spark session
    val spark = SparkSession.builder().getOrCreate()
    try {
      TopWORDSParser.parse(args).foreach { args =>
        // remove output location files if exist
        val files = FileSystem.get(spark.sparkContext.hadoopConfiguration)
        if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true)
        // read input corpus
        val corpus = if (args.numPartitions > 0)
          spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions)
        else spark.sparkContext.textFile(args.inputLoc)
        LOGGER.info("Number of lines of input corpus: " + corpus.count())
        // run TopWORDS with the parsed arguments
        new TopWORDS(
          tauL = args.tauL,
          tauF = args.tauF,
          textLenThld = args.textLenThld,
          useProbThld = args.useProbThld,
          numIterations = args.numIterations,
          convergeTol = args.convergeTol,
          wordBoundaryThld = args.wordBoundaryThld)
          .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts")
      }
      //exit normally
      LOGGER.info("Running TopWORDS successfully!")
      if (spark.sparkContext.master.contains("local")) sys.exit(0)
    } catch {
      case ex: Throwable =>
        LOGGER.error("Running TopWORDS fail!", ex)
        //signal to external process
        if (spark.sparkContext.master.contains("local")) sys.exit(1)
    } finally spark.stop()
  }
} 
Example 44
Source File: SparkFunSuite.scala    From spark-alchemy   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

// scalastyle:off
import java.io.File

import scala.annotation.tailrec
import org.apache.log4j.{Appender, Level, Logger}
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.Tests.IS_TESTING
import org.apache.spark.util.{AccumulatorContext, Utils}

// Class declaration reconstructed (assumed) from the imports above; the original
// file's scaladoc and header were dropped when the example was extracted.
abstract class SparkFunSuite
  extends FunSuite
  with BeforeAndAfterAll
  with Logging {
  protected def withLogAppender(
    appender: Appender,
    loggerName: Option[String] = None,
    level: Option[Level] = None)(
    f: => Unit): Unit = {
    val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger)
    val restoreLevel = logger.getLevel
    logger.addAppender(appender)
    if (level.isDefined) {
      logger.setLevel(level.get)
    }
    try f finally {
      logger.removeAppender(appender)
      if (level.isDefined) {
        logger.setLevel(restoreLevel)
      }
    }
  }
} 
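
withLogAppender is typically paired with an appender that simply buffers events so the test can assert on them afterwards. A minimal sketch under that assumption (the appender and suite names are made up):

import org.apache.log4j.{AppenderSkeleton, Level, Logger}
import org.apache.log4j.spi.LoggingEvent

import scala.collection.mutable.ArrayBuffer

// Buffers every event it receives so assertions can inspect them later.
class BufferingAppender extends AppenderSkeleton {
  val events = new ArrayBuffer[LoggingEvent]()
  override def append(event: LoggingEvent): Unit = events.synchronized { events += event }
  override def close(): Unit = {}
  override def requiresLayout(): Boolean = false
}

class LogCaptureSuite extends SparkFunSuite {
  test("warnings are captured while the appender is attached") {
    val appender = new BufferingAppender
    withLogAppender(appender, level = Some(Level.WARN)) {
      Logger.getRootLogger.warn("something worth capturing")
    }
    assert(appender.events.exists(_.getRenderedMessage.contains("something worth capturing")))
  }
}
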
Example 45
Source File: DenseKMeans.scala    From AI   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)

  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // Return the K-means cost (sum of squared distances of points to their nearest center) for this dataset
    val cost = model.computeCost(examples)
    // Get the k cluster centers
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two  =  centerPoint(1)
    println(s"centerPoint=$one,$two.")
    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 46
Source File: NonCommonWordSelector.scala    From dbpedia-spotlight-model   with Apache License 2.0 5 votes vote down vote up
package org.dbpedia.spotlight.spot

import java.io._

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger
import org.dbpedia.spotlight.io.WortschatzParser
import org.dbpedia.spotlight.model.SurfaceFormOccurrence

import scala.collection.JavaConversions._

// Reconstructed (assumed): the class header was lost in extraction; a common-word
// selector wraps the parsed word set and exposes an isCommonWord check.
class NonCommonWordSelector(val commonWords: CompactHashSet[String]) {

    private val LOG = Logger.getLogger(this.getClass)

    def this(filename: String) = this(WortschatzParser.parse(filename))

    def isCommonWord(occ: SurfaceFormOccurrence): Boolean =
        commonWords.contains(occ.surfaceForm.name)
    def select(occs: java.util.List[SurfaceFormOccurrence]) : java.util.List[SurfaceFormOccurrence] = {
        occs.filter( o => !isCommonWord(o) );
    }

    def main(args: Array[String]) {
        def usage = println(" Usage: scala -cp $CP NonCommonWordSelector words.txt ")
        args(0) match {
            case file: String => {
                new NonCommonWordSelector(file)
            }
            case _ => usage
        }
    }

} 
Example 47
Source File: SparkSqlUtils.scala    From HadoopLearning   with MIT License 5 votes vote down vote up
package com.c503.utils

import java.io.{BufferedInputStream, BufferedReader, FileInputStream, InputStreamReader}
import java.nio.file.Path

import com.google.common.io.Resources
import org.apache.log4j.{Level, Logger}
import org.apache.mesos.Protos.Resource
import org.apache.spark.sql.SparkSession

import scala.io.Source

// Reconstructed (assumed): the enclosing object and its resource-path helper were
// lost in extraction; getPathByName resolves a file that sits on the classpath.
object SparkSqlUtils {

  def getPathByName(name: String): String = {
    Resources.getResource(name).getPath
  }
  def readSqlByPath(sqlPath: String) = {
    val buf = new StringBuilder
    val path = this.getPathByName(sqlPath)
    val file = Source.fromFile(path)
    for (line <- file.getLines) {
      buf ++= line + "\n"
    }
    file.close
    buf.toString()
  }


} 
Example 48
Source File: Streaming.scala    From scala-spark-cab-rides-predictions   with MIT License 5 votes vote down vote up
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils
import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter
import com.amazonaws.services.kinesis.model.Record
import com.google.gson.Gson
import org.apache.spark.sql._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}

object Trials extends App {

  import org.apache.log4j.{Level, Logger}

  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  //session setup
  System.setProperty("hadoop.home.dir", "C:\\winutils")
  val sparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("test")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val ssc = new StreamingContext(sc, Seconds(10))
  val sqlContext = sparkSession.sqlContext

  //creates an array of strings from raw byte array
  def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",")

  //converts records to map of key value pair and then json
  def recordHandler = (record: Record) => {
    val gson = new Gson
    val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject
    val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage)
    gson.toJson(map)
  }

  case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String)

  val stream_cab = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("cab_rides")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)


  val stream_weather = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("weather")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)


  //creating dataframe, can be stored as temp view
  val cabSchema = Encoders.product[CabPrice].schema
  stream_cab.foreachRDD(rdd => {
    import sqlContext.implicits._
    //val xx: Dataset[String] = rdd.toDS()

    val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS())
    df.show()

  })
  ssc.start()
  ssc.awaitTermination()

} 
Example 49
Source File: MetricsTest.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.metrics

import org.apache.log4j.Logger
import org.scalatest.FunSuite

class MetricsTest extends FunSuite {
  implicit val logger = Logger.getLogger(this.getClass)

  test("Write a String metric") {
    Metrics.gauge("foo", "bar")
  }

  test("Write a Long metric") {
    Metrics.gauge("long_measure_i", 10)
  }

  test("Time a function") {
    val result = Metrics.time("timed_function") {
      Thread.sleep(10)
      "ok"
    }

    assertResult("ok")(result)
  }

  test("Write an Int gauge") {
    Metrics.gauge("int_measure_l", 10L)
  }

  test("Write an Double gauge") {
    Metrics.gauge("double_measure_l", 10.0F)
  }

  test("Write an Float gauge") {
    Metrics.gauge("float_measure_l", 10.0D)
  }

} 
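
The implicit Logger in the test above is the pattern these helpers rely on: the call site declares the logger once and every helper picks it up implicitly. A small sketch in the same spirit (TimedLogging and its method are made up, not part of the pulse library):

import org.apache.log4j.Logger

object TimedLogging {
  // Runs the body, then logs how long it took, using whatever Logger is implicitly in scope.
  def time[A](label: String)(body: => A)(implicit logger: Logger): A = {
    val start = System.nanoTime()
    try body
    finally logger.info(s"$label took ${(System.nanoTime() - start) / 1e6} ms")
  }
}

object TimedLoggingExample {
  implicit val logger: Logger = Logger.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val answer = TimedLogging.time("sum 1..10000") { (1 to 10000).sum }
    logger.info(s"answer = $answer")
  }
}
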
Example 50
Source File: StandaloneAppExample.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.example

import javax.naming.NamingException
import org.apache.log4j.{ Logger, MDC, NDC }

object StandaloneAppExample {
  private val log = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    MDC.put("hostname", java.net.InetAddress.getLocalHost().getHostName())
    NDC.push("ndc message")
    val numEvents   = args(0).toInt
    val sleepMillis = args(1).toInt
    0 to numEvents map { num =>
      val uuid = java.util.UUID.randomUUID.toString()
      log.info(s"info message $uuid")
      Thread.sleep(sleepMillis)
      if (num % 300 == 0) {
        log.error(s"error happened $uuid", new Exception())
      }
    }
    // Throw an exception so when we turn on `log4j.debug=true` we can see where events stop posting to the log collector
    // it's a <code>NamingException</code> so we can catch it in the test for this class
    throw new NamingException("exiting")

  }

} 
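
The MDC and NDC values pushed above only show up if the layout asks for them: %X{hostname} renders the MDC entry and %x renders the NDC stack. A minimal programmatic sketch of such a layout (the object name and pattern string are illustrative):

import org.apache.log4j.{ConsoleAppender, Level, Logger, MDC, NDC, PatternLayout}

object MdcLayoutExample {
  def main(args: Array[String]): Unit = {
    // %X{hostname} prints the MDC value stored under "hostname"; %x prints the NDC stack.
    val layout = new PatternLayout("%d{ISO8601} %-5p %X{hostname} [%x] %c - %m%n")
    val root = Logger.getRootLogger
    root.removeAllAppenders()
    root.addAppender(new ConsoleAppender(layout))
    root.setLevel(Level.INFO)

    MDC.put("hostname", java.net.InetAddress.getLocalHost.getHostName)
    NDC.push("demo")
    Logger.getLogger(getClass).info("message carrying MDC and NDC context")
    NDC.pop()
  }
}
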
Example 51
Source File: SocialGraphJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pregel.jobs.social

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object SocialGraphJob {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "GraphX")

    val graph = new SocialGraph(sc)

    println("Top 10 most-connected users:")
    graph.getMostConnectedUsers(10) foreach println

    println("Computing degrees of separation for user Arch")
    graph.degreeOfSeparationSingleUser(5306) foreach println

    println("Computing degrees of separation for user Arch and Fred")
    graph.degreeOfSeparationTwoUser(5306, 14) foreach println

    println("Connected component")
    graph.connectedComponentGroupedByUsers
      .sortBy ( {case (_, lowestVertexId) => lowestVertexId},
        ascending = false).take(10) foreach println

    sc.stop()
  }
} 
Example 52
Source File: ShortestPathProblemJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pregel.jobs.ssp

import com.github.graphx.pregel.ssp.ShortestPathProblem
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexId

object ShortestPathProblemJob extends App {
  Logger.getLogger("org").setLevel(Level.ERROR)
  val sc = new SparkContext("local[*]", "ShortestPathProblemDemo")
  val ssp = new ShortestPathProblem(sc)

  val sourceIdForTest: VertexId = 3
  val sourceIdForRandom: VertexId = 75

  val testGraph = ssp.testGraph
  val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest)
  println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" +
    s"Distances on the test graph $resultOnTestGraph\n")

  val randomGraph = ssp.randomGraph
  val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom)
  println(s"Distances on the random graph $resultOnRandomGraph\n")
} 
Example 53
Source File: SocialPageRankJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pagerank

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexRDD

object SocialPageRankJob {

  // Reconstructed (assumed): this method was lost in extraction; it runs dynamic
  // PageRank until it converges within the given tolerance.
  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tolerance).vertices

  def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.staticPageRank(numIter = 20).vertices

  def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = {
    socialGraph.verts.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy({ case (_, rank) => rank }, ascending = false).take(10)
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "PageRank")

    val socialGraph: SocialGraph = new SocialGraph(sc)
    val TOLERANCE: Double = 0.0001

    import scala.compat.Platform.{EOL => D}
    val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D)
    val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D)

    println(s"Top 10 users in network counted with TOLERANCE until convergence $TOLERANCE - $D $topUsersDynamically")
    println(s"Top 10 users in the network counted iteratively - $D $topUsersIterative")

    sc.stop()
  }
} 
Example 54
Source File: CheckDirEndPointImpl.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import com.google.protobuf.{RpcCallback, RpcController, Service}
import org.apache.hadoop.hbase.coprocessor._
import org.apache.hadoop.hbase.{Coprocessor, CoprocessorEnvironment}
import org.apache.log4j.Logger


class CheckDirEndPointImpl
  extends CheckDirProtos.CheckDirService with Coprocessor with CoprocessorService {

  private lazy val logger = Logger.getLogger(getClass.getName)
  private var env: RegionCoprocessorEnvironment = null

  override def start(env: CoprocessorEnvironment) = {
    env match {
      case e: RegionCoprocessorEnvironment => this.env = e
      case _ => throw new CoprocessorException("Must be loaded on a table region!")
    }
  }

  override def stop(env: CoprocessorEnvironment) = {}

  override def getService: Service = this

  override def getCheckResult(controller: RpcController,
                              request: CheckDirProtos.CheckRequest,
                              done: RpcCallback[CheckDirProtos.CheckResponse]) = {
    val isDir = new java.io.File(".").isDirectory
    if (!isDir) {
      logger.warn(
        """Current directory is not accessible,
          |please add 'cd ~' before start regionserver in your regionserver start script.""")
    }
    val response: CheckDirProtos.CheckResponse = {
      CheckDirProtos.CheckResponse.newBuilder().setAccessible(isDir).build()
    }
    done.run(response)
  }
} 
Example 55
Source File: AkkaUtils.scala    From DataXServer   with Apache License 2.0 5 votes vote down vote up
package org.tianlangstudio.data.hamal.yarn.util

import akka.actor.{ActorSystem, ExtendedActorSystem}
import com.typesafe.config.ConfigFactory
import org.apache.log4j.{Level, Logger}
import org.tianlangstudio.data.hamal.core.{Constants, HamalConf}
import org.tianlangstudio.data.hamal.core.HamalConf

// Reconstructed (assumed): the enclosing object and the frame-size cap were lost in
// extraction; the cap mirrors the limit Spark's AkkaUtils enforces.
object AkkaUtils {

  val AKKA_MAX_FRAME_SIZE_IN_MB: Int = Int.MaxValue / 1024 / 1024
  def maxFrameSizeBytes(conf: HamalConf): Int = {
    val frameSizeInMB = conf.getInt("datax.akka.frameSize", 128)
    if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) {
      throw new IllegalArgumentException(
        s"spark.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB")
    }
    frameSizeInMB * 1024 * 1024
  }


  def protocol(actorSystem: ActorSystem): String = {
    val akkaConf = actorSystem.settings.config
    val sslProp = "akka.remote.netty.tcp.enable-ssl"
    protocol(akkaConf.hasPath(sslProp) && akkaConf.getBoolean(sslProp))
  }

  def protocol(ssl: Boolean = false): String = {
    if (ssl) {
      "akka.ssl.tcp"
    } else {
      "akka.tcp"
    }
  }

  def address(
      protocol: String,
      systemName: String,
      host: String,
      port: Int,
      actorName: String): String = {

        address(protocol,
          systemName,
          s"$host:$port",
          actorName
        )
  }
  def address(
               protocol: String,
               systemName: String,
               hostPort: String,
               actorName: String): String = {
    s"$protocol://$systemName@$hostPort/user/$actorName"
  }
} 
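
The address helpers above just assemble the standard Akka actor path from its parts. A usage sketch (system, host, and actor names are made up; it assumes the enclosing object is named AkkaUtils, matching the file name):

object AkkaAddressExample {
  def main(args: Array[String]): Unit = {
    // Prints "akka.tcp://hamal@worker-1:2552/user/executor"; with ssl = true the scheme becomes akka.ssl.tcp.
    val path = AkkaUtils.address(AkkaUtils.protocol(ssl = false), "hamal", "worker-1", 2552, "executor")
    println(path)
  }
}
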
Example 56
Source File: ModelSerialization.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.features.FeatureEngineering
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object ModelSerialization {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)


    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    samples.printSchema()
    samples.show(5, false)


    //model training
    println("Neural Network Ctr Prediction Model:")
    val innModel = new InnerProductNNCtrModel()
    innModel.train(samples)
    val transformedData = innModel.transform(samples)

    transformedData.show(1,false)

    //model serialization by mleap
    val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer()
    mleapModelSerializer.serializeModel(innModel._pipelineModel, "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData)

    //model serialization by JPMML
    val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer()
    jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData)
  }
} 
Example 57
Source File: ModelSelection.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.evaluation.Evaluator
import com.ggstar.features.FeatureEngineering
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}

object ModelSelection {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)
    rawSamples.printSchema()
    rawSamples.show(10)

    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    //split samples into training samples and validation samples
    val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3))
    val evaluator = new Evaluator

    
  }
} 
Example 58
Source File: GenerateVerticesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD


object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the command-line arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions)
  : Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

  }
}
// scalastyle:on println 
Example 59
Source File: ReduceExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ReduceExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.reduce((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 60
Source File: StatsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object StatsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("StatsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11))
    val stats = nums.stats()

    println(s"""nums:   ${nums.collect().mkString(", ")}""")
    println(s"""count:  ${stats.count}""")
    println(s"""mean:   ${stats.mean}""")
    println(s"""stdev:  ${stats.stdev}""")
  }
}

// scalastyle:on println 
Example 61
Source File: FoldExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FoldExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.fold(0)((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 62
Source File: OrderExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object OrderExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums:          ${nums.collect().mkString(", ")}""")
    println(s"""top3:          ${nums.top(3).mkString(", ")}""")
    println(s"""takeOredered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 63
Source File: AggregateExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object AggregateExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  private[basic_action]
  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)

    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.fold(0)((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 64
Source File: CollectAsMapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CollectAsMapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CollectAsMapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(
        ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)
      ), 3
    )
    val fruitsAsMap = fruits.collectAsMap()

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitsAsMap: $fruitsAsMap""")
  }
}

// scalastyle:on println 
Example 65
Source File: PersistExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {
  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
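    // Without persist(), each of the two actions below re-reads the input file.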
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
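    // After unpersist() the cached partitions are released, so this collect() re-reads the file.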
    persistedLines.collect()
  }
} 
Example 66
Source File: CustomPartitionerExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CustomPartitionerExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CustomPartitionerExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))

    val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _)
    val customPartitioned = fruits.map((_, 1)).reduceByKey(
      new FirstLetterPartitioner(sc.defaultParallelism), _ + _)

    println(s"""fruits:\n  ${fruits.collect().mkString(", ")}""")
    println()

    println("partitioned by default partitioner")
    defaultPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    println("partitioned by first letter partitioner")
    customPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

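// Routes each key to a partition based on its first character, so keys that share an
// initial letter end up in the same partition.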
private[partition]
class FirstLetterPartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
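    // hashCode can be negative, so shift negative remainders into [0, numPartitions).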
    key.toString.charAt(0).hashCode % numPartitions match {
      case p if p < 0 => p + numPartitions
      case p => p
    }
  }

  override def equals(other: Any): Boolean = {
    other match {
      case p: FirstLetterPartitioner => p.numPartitions == numPartitions
      case _ => false
    }
  }
}

// scalastyle:on println 
Example 67
Source File: PartitionExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object PartitionExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("Partition")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1)
    println(s"""nums:\n  ${nums.collect().mkString(", ")}""")
    println()

    println("original:")
    nums.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar3 = nums.repartition(3)
    println("repartition to 3:")
    numsPar3.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar2 = numsPar3.coalesce(2)
    println("coalesce to 2:")
    numsPar2.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

// scalastyle:on println 
Example 68
Source File: WordCountExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object WordCountExample {
  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
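    // The accumulator is updated inside a transformation, so its value is only reliable
    // after an action has run, and it may over-count if the stage is recomputed.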
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts:     ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}

// scalastyle:on println 
Example 69
Source File: AggregateByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object AggregateByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))(
      seqOp = (partAcc, n) => partAcc += n,
      combOp = (acc1, acc2) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits:         ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 70
Source File: MapValuesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapValuesExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapValuesExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1)))
    val plusOnes = fruits.mapValues(v => v + 1)

    println(s"""fruits:   ${fruits.collect().mkString(", ")}""")
    println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 71
Source File: SortByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SortByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyDesc = fruits.sortByKey(ascending = false)

    println(s"""fruits:          ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyDesc: ${sortedByKeyDesc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums:              ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 72
Source File: CoGroupExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CoGroupExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CoGroupExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))
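    // Swap (person, city) to (city, person) so both RDDs are keyed by city before cogrouping.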
    val grouped = persons.map(_.swap).cogroup(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities:  ${cities.collect().mkString(", ")}""")
    println()
    println(s"""grouped:\n${grouped.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 73
Source File: JoinExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object JoinExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("JoinExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

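    // join keeps only keys present in both RDDs; the outer joins also keep unmatched keys,
    // wrapping the possibly missing side in an Option.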
    val leftJoined = persons.map(_.swap).join(cities)
    val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities)
    val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities)
    val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities:  ${cities.collect().mkString(", ")}""")
    println()
    println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""")
    println()
    println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 74
Source File: GroupByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object GroupByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("GroupByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val grouped = fruits.groupByKey()

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""grouped: ${grouped.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 75
Source File: ReduceByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ReduceByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.reduceByKey((x, y) => x + y)

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 76
Source File: CombineByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CombineByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CombineByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val fruitCountAvgs = fruits.combineByKey(
      createCombiner = (v: Int) => Acc(v.toDouble, 1),
      mergeValue = (partAcc: Acc, n: Int) => partAcc += n,
      mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits:         ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 77
Source File: FoldByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FoldByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.foldByKey(0)((x, y) => x + y)

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 78
Source File: MapPartitionsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapPartitionsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple",  "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple",  "num": 2}""",
      """{"name": "Peach",  "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
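      // Build the Jackson ObjectMapper once per partition instead of once per record,
      // since it is relatively expensive to construct.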
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 79
Source File: FlatMapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FlatMapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))

    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 80
Source File: SetOperationsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SetOperationsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 81
Source File: MapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 82
Source File: ZipExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ZipExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped:  ${zipped.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 83
Source File: DistinctExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object DistinctExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 84
Source File: SampleExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SampleExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 85
Source File: FilterExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FilterExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 86
Source File: SparkFunSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark

// scalastyle:off
import org.apache.log4j.{Level, Logger}
import org.scalatest.{FunSuite, Outcome}

import org.apache.spark.Logging

// The suite header was dropped by the example extractor; the declaration below is
// assumed from the body, which needs a FunSuite mixed with Spark's Logging trait.
abstract class SparkFunSuite extends FunSuite with Logging {
  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)

      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }

} 
Example 87
Source File: Logging.scala    From spark-distcp   with Apache License 2.0 5 votes vote down vote up
package com.coxautodata.objects

import org.apache.log4j.{Level, LogManager, Logger}

trait Logging {

  // Method to get the logger name for this object
  protected def logName: String = {
    // Ignore trailing $'s in the class names for Scala objects
    this.getClass.getName.stripSuffix("$")
  }

  private val log: Logger = LogManager.getLogger(logName)

  // Set logger level
  protected def setLogLevel(level: Level): Unit = log.setLevel(level)

  // Log methods that take only a String
  protected def logInfo(msg: => String) {
    if (log.isInfoEnabled) log.info(msg)
  }

  protected def logDebug(msg: => String) {
    if (log.isDebugEnabled) log.debug(msg)
  }

  protected def logTrace(msg: => String) {
    if (log.isTraceEnabled) log.trace(msg)
  }

  protected def logWarning(msg: => String) {
    log.warn(msg)
  }

  protected def logError(msg: => String) {
    log.error(msg)
  }

  // Log methods that take Throwables (Exceptions/Errors) too
  protected def logInfo(msg: => String, throwable: Throwable) {
    if (log.isInfoEnabled) log.info(msg, throwable)
  }

  protected def logDebug(msg: => String, throwable: Throwable) {
    if (log.isDebugEnabled) log.debug(msg, throwable)
  }

  protected def logTrace(msg: => String, throwable: Throwable) {
    if (log.isTraceEnabled) log.trace(msg, throwable)
  }

  protected def logWarning(msg: => String, throwable: Throwable) {
    log.warn(msg, throwable)
  }

  protected def logError(msg: => String, throwable: Throwable) {
    log.error(msg, throwable)
  }

  protected def isTraceEnabled: Boolean = {
    log.isTraceEnabled
  }


} 
Example 88
Source File: StressReceiver.scala    From spark-cassandra-stress   with Apache License 2.0 5 votes vote down vote up
package com.datastax.sparkstress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.log4j.Logger

class StressReceiver[T](
  index: Int,
  rowGenerator: RowGenerator[T],
  config: Config,
  blockIntervalInMs: Int,
  storageLevel: StorageLevel)
  extends Receiver[T](storageLevel) {


  class EmitterThread(receiver: StressReceiver[_]) extends Thread(s"Emitter$index") {
    override def run(): Unit = {
      val rowIterator = rowGenerator.generatePartition(config.seed, index)
      val throughPutPerBlockInterval = (blockIntervalInMs / (config.streamingBatchIntervalSeconds * 1000.0) * config.receiverThroughputPerBatch).toLong
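      // Emit at most throughPutPerBlockInterval rows per block interval, then sleep off
      // any time left in the interval to throttle the overall throughput.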
      while (rowIterator.hasNext) {
        val batchBegin = System.currentTimeMillis()
        for (x <- 1L to throughPutPerBlockInterval if rowIterator.hasNext) {
          store(rowIterator.next())
        }
        val batchEnd = System.currentTimeMillis()

        val napTime = blockIntervalInMs - (batchEnd - batchBegin)
        if (napTime > 0)
          Thread.sleep(napTime)
      }
      receiver.stop("Iterator Empty")
    }
  }


  def onStart() = {
    new EmitterThread(this).start()
  }



  def onStop() = {
  }
} 
Example 89
Source File: MCLModelSuite.scala    From MCL_spark   with MIT License 5 votes vote down vote up
package org.apache.spark.mllib.clustering

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.util.Utils


class MCLModelSuite extends MCLFunSuite{
  // Disable Spark messages when running program
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  test("model save/load", UnitTest){

    val users: RDD[(VertexId, String)] =
      sc.parallelize(Array((0L,"Node1"), (1L,"Node2"),
        (2L,"Node3"), (3L,"Node4"),(4L,"Node5"),
        (5L,"Node6"), (6L,"Node7"), (7L, "Node8"),
        (8L, "Node9"), (9L, "Node10"), (10L, "Node11")))

    val relationships: RDD[Edge[Double]] =
      sc.parallelize(
        Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0),
          Edge(0, 2, 1.0), Edge(2, 0, 1.0),
          Edge(0, 3, 1.0), Edge(3, 0, 1.0),
          Edge(1, 2, 1.0), Edge(2, 1, 1.0),
          Edge(1, 3, 1.0), Edge(3, 1, 1.0),
          Edge(2, 3, 1.0), Edge(3, 2, 1.0),
          Edge(4, 5, 1.0), Edge(5, 4, 1.0),
          Edge(4, 6, 1.0), Edge(6, 4, 1.0),
          Edge(4, 7, 1.0), Edge(7, 4, 1.0),
          Edge(5, 6, 1.0), Edge(6, 5, 1.0),
          Edge(5, 7, 1.0), Edge(7, 5, 1.0),
          Edge(6, 7, 1.0), Edge(7, 6, 1.0),
          Edge(3, 8, 1.0), Edge(8, 3, 1.0),
          Edge(9, 8, 1.0), Edge(8, 9, 1.0),
          Edge(9, 10, 1.0), Edge(10, 9, 1.0),
          Edge(4, 10, 1.0), Edge(10, 4, 1.0)
        ))

    val graph = Graph(users, relationships)

    val model: MCLModel = MCL.train(graph)

    // Check number of clusters
    model.nbClusters shouldEqual 3

    // Check save and load methods
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString

    Array(true, false).foreach { case selector =>
      // Save model, load it back, and compare.
      try {
        model.save(sc, path)
        val sameModel = MCLModel.load(sc, path)
        assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id"))
      } finally {
        Utils.deleteRecursively(tempDir)
      }
    }

  }

  test("nodes assignments", UnitTest) {
    val nodeId = 1.0.toLong
    val cluster = 2.0.toLong
    val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster))

    newAssignment.id shouldEqual nodeId
    newAssignment.cluster shouldEqual cluster
  }

} 
Example 90
Source File: PCA.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.unsupervised.pca

import scala.util.Try

import org.apache.log4j.Logger
import org.apache.commons.math3.linear._
import org.apache.commons.math3.stat.correlation.Covariance

import org.scalaml.stats.TSeries
import org.scalaml.Predef._
import org.scalaml.Predef.Context._
import org.scalaml.core.ITransform
import org.scalaml.util.LoggingUtils._
import TSeries._

case class PCAModel(covariance: DblMatrix, eigenvalues: Array[Double])


  override def |> : PartialFunction[Array[T], Try[Double]] = {
    case x: Array[T] if x.length == dimension(xt) && model.isDefined =>
      Try(margin(x, model.get.eigenvalues))
  }

  override def toString: String =
    model.map(m => {
      val covStr = m.covariance./:(new StringBuilder)((b, r) =>
        b.append(s"${r.mkString(" ")}\n")).toString()

      s"""\nEigenvalues:\n${m.eigenvalues.mkString(" ,")}\n\nCovariance matrix\n
	  | $covStr""".stripMargin
    }).getOrElse("PCA model undefined")
}

//--------------------------------  EOF ------------------------------------------------------------------------- 
Example 91
package org.scalaml.workflow.module

import org.apache.log4j.Logger
import org.scalaml.Predef.Context.ToDouble
import org.scalaml.Predef.VSeries


    override def execute(xt: Array[T]): Unit = {
      try {
        val results = em |> xt
        show(results, logger)
      } catch {
        case e: MatchError =>
          val errMsg = s"${e.getMessage()} caused by ${e.getCause.toString}"
          error(s"ClusteringModule.MultivariateEM $errMsg", logger)
        case e: Throwable => error("ClusteringModule.Kmeans", logger, e)
      }
    }
  }
}

// ---------------------------------------  EOF ------------------------------------------------------ 
Example 92
Source File: BiasVariance.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.stats

// 3rd party classes
import org.apache.log4j.Logger
// Scala for Machine Learning classes and singletons
import org.scalaml.Predef._


  def apply(emul: Double => Double, nValues: Int): BiasVariance = new BiasVariance(emul, nValues)

  private def check(nValues: Int): Unit = {
    require(
      nValues > NUMVALUES_LIMITS._1 && nValues < NUMVALUES_LIMITS._2,
      s"BiasVarianceEmulator.check Size of training sets $nValues is out of range"
    )
  }
}

// -----------------------  EOF -------------------------------------- 
Example 93
Source File: FileUtils.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.util

import org.apache.log4j.Logger

import scala.io.Source._
import scala.util.{Failure, Success, Try}


  def write(content: String, pathName: String, className: String): Boolean = {
    import java.io.PrintWriter

    import DisplayUtils._

    var printWriter: Option[PrintWriter] = None
    var status = false
    Try {
      printWriter = Some(new PrintWriter(pathName))
      printWriter.foreach(_.write(content))
      printWriter.foreach(_.flush)
      printWriter.foreach(_.close)
      status = true
    } match {
      // Catch and display exception description and return false
      case Failure(e) =>
        error(s"$className.write failed for $pathName", logger, e)
        if (printWriter.isDefined) printWriter.foreach(_.close)
        status
      case Success(s) => status
    }
  }
}

// ---------------------------------  EOF ------------------------------------- 
Example 94
Source File: KmeansTest.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.spark.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
import org.scalaml.{Logging, Resource}
import org.scalaml.Predef._
import org.scalaml.stats.TSeries._
import org.scalaml.trading.YahooFinancials
import org.scalaml.workflow.data.DataSource
import org.scalatest.FunSuite
import org.scalatest.concurrent.ScalaFutures

import scala.concurrent.Future


final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource {
  import scala.concurrent.ExecutionContext.Implicits.global

  protected[this] val name = "Spark MLlib K-Means"
  private val K = 8
  private val NRUNS = 4
  private val MAXITERS = 60
  private val PATH = "spark/CSCO.csv"
  private val CACHE = false

  test(s"$name evaluation") {
    show(s"Evaluation")

    Logger.getRootLogger.setLevel(Level.ERROR)
    // The Spark configuration has to be customized for your environment
    val sparkConf = new SparkConf().setMaster("local")
      .setAppName("Kmeans")
      .set("spark.executor.memory", "4096m")

    implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file

    val kmeanClustering: Option[Kmeans] = extract.map(input => {
      val volatilityVol = zipToSeries(input._1, input._2).take(500)

      val config = new KmeansConfig(K, MAXITERS, NRUNS)
      val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY)
      Kmeans(config, rddConfig, volatilityVol)
    })

    // Wrap the prediction in a Future to enforce a timeout in case of a straggler
    val ft = Future[Boolean] { predict(kmeanClustering) }
    whenReady(ft) { result => assert(result) }
    sc.stop
  }

   private def predict(kmeanClustering: Option[Kmeans]): Boolean = {
     kmeanClustering.map(kmeansCluster => {
       val obs = Array[Double](0.1, 0.9)
       val clusterId1 = kmeansCluster |> obs
       show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1")

       val obs2 = Array[Double](0.56, 0.11)
       val clusterId2 = kmeansCluster |> obs2
       val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2"
       show(s"$name result: $result")
     })
     true
   }

  private def extract: Option[(DblVec, DblVec)] = {
    import scala.util._
    val extractors = List[Array[String] => Double](
      YahooFinancials.volatility,
      YahooFinancials.volume
    )

    DataSource(getPath(PATH).get, true).map(_.|>) match {
      case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption
      case Failure(e) =>
        failureHandler(e)
        None
    }
  }
}


// ---------------------------------  EOF ------------------------------------------------- 
Example 95
Source File: StreamsTest.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.scalability.scala

import java.lang.ref._

import org.apache.log4j.Logger
import org.scalaml.Logging
import org.scalaml.Predef._
import org.scalatest.{FlatSpec, Matchers}

import scala.math._

case class DataPoint(x: DblVec, y: Double)


final class StreamsTest extends FlatSpec with Matchers with Logging {
  import scala.util.Random

  protected[this] val name = "Scala streams"

  it should s"$name huge list" in {
    show(s"$name huge list")

    val input = (0 until 1000000000).toStream
    input(10) should be(10)
  }

  it should s"$name recursion" in {
    show(s"$name recursion")

    def mean(strm: => Stream[Double]): Double = {
      @scala.annotation.tailrec
      def mean(z: Double, count: Int, strm: Stream[Double]): (Double, Int) =
        if (strm.isEmpty)
          (z, count)
        else
          mean((1.0 - 1.0 / count) * z + strm.head / count, count + 1, strm.tail)
      mean(0.0, 1, strm)._1
    }

    val input = List[Double](2.0, 5.0, 3.5, 2.0, 5.7, 1.0, 8.0)
    val ave: Double = mean(input.toStream)
    ave should be(3.88 +- 0.05)
  }

  it should s"$name with recycled memory blocks" in {
    show("$name with recycled memory blocks")

    type DblVec = Vector[Double]
    val DATASIZE = 20000

    val dot = (s: Double, xy: (Double, Double)) => s + xy._1 * xy._2
    val diff = (x: DblVec, y: DblVec) => x.zip(y).aggregate(0.0)(dot, _ + _)

    val weights = Vector[Double](0.5, 0.7)
    val lossFunction = new LossFunction(diff, weights, DATASIZE)

    // Create a stream of weak references to 10 stream segments of size DATESIZE/10
    val stream = () => new WeakReference(
      Stream.tabulate(DATASIZE)(n =>
        DataPoint(
          Vector[Double](n.toDouble, n * (n.toDouble)),
          n.toDouble * weights(0) + n * (n.toDouble) * weights(1) + 0.1 * Random.nextDouble
        ))
    )
    // Compute a simple distance using the dot product
    val totalLoss = sqrt(lossFunction.compute(stream))
    show(s"$name totalLoss ${totalLoss / DATASIZE}")

    val averageLoss = totalLoss / DATASIZE
    averageLoss should be(0.0 +- 0.001)
  }
}

// --------------------------  EOF -------------------------------- 
Example 96
Source File: Application.scala    From retail_analytics   with Apache License 2.0 5 votes vote down vote up
package controllers

import scalaz._
import Scalaz._
import scalaz.EitherT._
import scalaz.Validation
//import scalaz.Validation.FlatMap._
import scalaz.NonEmptyList._
import play.api.mvc._
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import models._
import models.stack._

import play.api.libs.json._

object Application extends Controller {

  def index() = Action { implicit request =>
    Ok(views.html.index("Megam Analytics."))
  }

  def upload = Action(parse.multipartFormData) { implicit request =>
    request.body.file("picture").map { picture =>
      val filename = picture.filename
      val contentType = picture.contentType
      picture.ref.moveTo(new File("/tmp/" + filename))

      models.HDFSFileService.saveFile("/tmp/" + filename) match {
        case Success(succ) => {
          val fu = List(("success" -> succ))
          Redirect("/").flashing(fu: _*)
        }
        case Failure(err) => {
          val fu = List(("error" -> "File upload to HDFS failed"))
          Redirect("/").flashing(fu: _*)
        }
      }
    }.getOrElse {
      val fu = List(("error" -> "No file was selected for upload"))
      Redirect("/").flashing(fu: _*)
    }
  }

  def analysis() = Action { implicit request =>
    val tuple_res = models.Retail.buyingbehaviour(MConfig.recommand_ID.toInt, MConfig.retailfile)

    println("Buying-behaviour analysis result:")
    println(tuple_res._1)

  //val finalJson = {
  //  for {
  //    product <- productList
  //  } yield Json.parse(product).as[JsObject]
 // }
    Ok(views.html.finalProducts(tuple_res._1, tuple_res._2))
  }

} 
Example 97
Source File: InductiveClassifier.scala    From scala-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.it.cp

import org.apache.log4j.Logger


  def predict(features: Seq[Double], significance: Double) = {
    //Validate input
    require(significance > 0 && significance < 1, s"significance $significance is not in (0,1)")
    alphas.foreach { a =>
      if (a.length < 1 / significance - 1) {
        log.warn(s"too few calibration samples (${a.length}) for significance $significance")
      }
    }
    //Compute prediction set
    mondrianPv(features).zipWithIndex.map {
      case (pVal, c) =>
        if (pVal > significance) {
          Set(c.toDouble)
        } else {
          Set[Double]()
        }
    }.reduce(_ ++ _)
  }

} 
Example 98
Source File: FileOutputIT.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta

import java.sql.Timestamp
import java.util.UUID

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest._

import scala.reflect.io.File


class FileOutputIT extends FlatSpec with ShouldMatchers with BeforeAndAfterAll {
  self: FlatSpec =>

  @transient var sc: SparkContext = _

  override def beforeAll {
    Logger.getRootLogger.setLevel(Level.ERROR)
    sc = FileOutputIT.getNewLocalSparkContext(1, "test")
  }

  override def afterAll {
    sc.stop()
    System.clearProperty("spark.driver.port")
  }

  trait CommonValues {

    val sqlContext = SQLContext.getOrCreate(sc)

    import sqlContext.implicits._

    val time = new Timestamp(DateTime.now.getMillis)

    val data =
      sc.parallelize(Seq(Person("Kevin", 18, time), Person("Kira", 21, time), Person("Ariadne", 26, time))).toDF

    val tmpPath: String = s"/tmp/sparta-test/${UUID.randomUUID().toString}"
  }

  trait WithEventData extends CommonValues {
    val properties = Map("path" -> tmpPath, "createDifferentFiles" -> "false")
    val output = new FileOutput("file-test", properties)
  }

  "FileOutputIT" should "save a dataframe" in new WithEventData {
    output.save(data, SaveModeEnum.Append, Map(Output.TimeDimensionKey -> "minute", Output.TableNameKey -> "person"))

    val source = new java.io.File(tmpPath).listFiles()
    val read = sqlContext.read.json(tmpPath).toDF
    read.count shouldBe(3)
    File("/tmp/sparta-test").deleteRecursively
  }
}

object FileOutputIT {

  def getNewLocalSparkContext(numExecutors: Int = 1, title: String): SparkContext = {
    val conf = new SparkConf().setMaster(s"local[$numExecutors]").setAppName(title)
    SparkContext.getOrCreate(conf)
  }
}

case class Person(name: String, age: Int, minute: Timestamp) extends Serializable 
Example 99
Source File: GamerSparkSQLExample.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.aggregates

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object GamerSparkSQLExample {
  def main(args:Array[String]): Unit = {
    if (args.length < 2) {
      println("{kudumaster} {runLocal}")
      return
    }

    Logger.getRootLogger.setLevel(Level.ERROR)

    val kuduMaster = args(0)
    val runLocal = args(1).equals("l")

    println("Loading Spark Context")
    var sc:SparkContext = null

    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      sc = new SparkContext(sparkConfig)
    }
    println("Loading Spark Context: Finished")

    println("Setting up Tables")
    val sqlContext = new SQLContext(sc)
    sqlContext.load("org.kududb.spark",
      Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer")

    println("Query 1: SELECT count(*) FROM gamer")
    val startTimeQ1 = System.currentTimeMillis()
    sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))

    println("Query 2: SELECT * FROM gamer limit 100")
    val startTimeQ2 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))

    println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100")
    val startTimeQ3 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))

    println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer")
    val startTimeQ4 = System.currentTimeMillis()
    sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4))

    println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" )
    val startTimeQ5 = System.currentTimeMillis()
    val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer")

    val parsedData = resultDf.map(r => {
      val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble)
      Vectors.dense(array)
    })

    val dataCount = parsedData.count()

    if (dataCount > 0) {
      val clusters = KMeans.train(parsedData, 3, 5)
      clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))

    }
    println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))

  }
} 
Example 100
Source File: BasicSparkSQLExamples.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object BasicSparkSQLExamples {
  def main(args:Array[String]): Unit = {
    if (args.length < 3) {
      println("<kuduMaster> <tablename> <runLocal>")
      return
    }

    Logger.getRootLogger.setLevel(Level.ERROR)

    val kuduMaster = args(0)
    val tableName = args(1)
    val runLocal = args(2).equals("l")

    println("starting")
    var sc:SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      sc = new SparkContext(sparkConfig)
    }

    try {
      println("Setting up Tables")
      val sqlContext = new SQLContext(sc)
      sqlContext.load("org.kududb.spark",
        Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName)

      println("Query 1: SELECT count(*) FROM " + tableName)
      val startTimeQ1 = System.currentTimeMillis()
      sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))

      println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100")
      val startTimeQ2 = System.currentTimeMillis()
      sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))

      val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)"
      println("Query 3: " + q3)
      val startTimeQ3 = System.currentTimeMillis()
      sqlContext.sql(q3).take(100).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))


      println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName )
      val startTimeQ5 = System.currentTimeMillis()
      val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000")

      val parsedData = resultDf.map(r => {
        val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble)
        Vectors.dense(array)
      })
      val clusters = KMeans.train(parsedData, 3, 4)
      clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))

      println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))

    } finally {
      sc.stop()
    }
  }
} 
Example 101
Source File: CommandLineClient.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.integrationtest.minicluster

import org.apache.log4j.Logger
import org.apache.gearpump.cluster.ApplicationStatus
import org.apache.gearpump.integrationtest.Docker


class CommandLineClient(host: String) {

  private val LOG = Logger.getLogger(getClass)

  def listApps(): Array[String] = {
    gearCommand(host, "gear info").split("\n").filter(
      _.startsWith("application: ")
    )
  }

  def listRunningApps(): Array[String] =
    listApps().filter(_.contains(s", status: ${ApplicationStatus.ACTIVE}"))

  def queryApp(appId: Int): String = try {
    listApps().filter(_.startsWith(s"application: $appId")).head
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      ""
  }

  def submitAppAndCaptureOutput(jar: String, executorNum: Int, args: String = ""): String = {
    gearCommand(host, s"gear app -verbose true -jar $jar -executors $executorNum $args")
  }

  def submitApp(jar: String, args: String = ""): Int = {
    LOG.debug(s"|=> Submit Application $jar...")
    submitAppUse("gear app", jar, args)
  }

  private def submitAppUse(launcher: String, jar: String, args: String = ""): Int = try {
    gearCommand(host, s"$launcher -jar $jar $args").split("\n")
      .filter(_.contains("The application id is ")).head.split(" ").last.toInt
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      -1
  }

  def killApp(appId: Int): Boolean = {
    tryGearCommand(host, s"gear kill -appid $appId")
  }

  private def gearCommand(container: String, command: String): String = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.execute(container, s"/opt/start $command")
  }

  private def tryGearCommand(container: String, command: String): Boolean = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.executeSilently(container, s"/opt/start $command")
  }
} 
Example 102
Source File: Util.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.integrationtest

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

import org.apache.log4j.Logger

object Util {

  private val LOG = Logger.getLogger(getClass)

  def encodeUriComponent(s: String): String = {
    try {
      java.net.URLEncoder.encode(s, "UTF-8")
        .replaceAll("\\+", "%20")
        .replaceAll("\\%21", "!")
        .replaceAll("\\%27", "'")
        .replaceAll("\\%28", "(")
        .replaceAll("\\%29", ")")
        .replaceAll("\\%7E", "~")
    } catch {
      case ex: Throwable => s
    }
  }

  def retryUntil(
      condition: () => Boolean, conditionDescription: String, maxTries: Int = 15,
      interval: Duration = 10.seconds): Unit = {
    var met = false
    var tries = 0

    while (!met && tries < maxTries) {

      met = Try(condition()) match {
        case Success(true) => true
        case Success(false) => false
        case Failure(ex) => false
      }

      tries += 1

      if (!met) {
        LOG.error(s"Failed due to (false == $conditionDescription), " +
          s"retrying for the ${tries} times...")
        Thread.sleep(interval.toMillis)
      } else {
        LOG.info(s"Success ($conditionDescription) after ${tries} retries")
      }
    }

    if (!met) {
      throw new Exception(s"Failed after ${tries} retries, ($conditionDescription) == false")
    }
  }
} 
Example 103
Source File: ShellExec.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.integrationtest

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._
import scala.sys.process._

import org.apache.log4j.Logger
import org.apache.storm.shade.org.eclipse.jetty.util.QuotedStringTokenizer

// The object header was dropped by the example extractor; the declaration, LOG and
// PROCESS_TIMEOUT below are assumed (the timeout value is a placeholder, not the original).
object ShellExec {

  private val LOG = Logger.getLogger(getClass)
  private val PROCESS_TIMEOUT: Duration = 2.minutes

  private def splitQuotedString(str: String): List[String] = {
    val splitter = new QuotedStringTokenizer(str, " \t\n\r")
    splitter.asInstanceOf[java.util.Enumeration[String]].asScala.toList
  }

  def exec(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT): Boolean = {
    LOG.debug(s"$sender => `$command`")

    val p = splitQuotedString(command).run()
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          LOG.error(s"timeout to execute command `$command`")
          p.destroy()
          p.exitValue()
      }
    }
    LOG.debug(s"$sender <= exit $retval")
    retval == 0
  }

  def execAndCaptureOutput(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT)
    : String = {
    LOG.debug(s"$sender => `$command`")

    val buf = new StringBuilder
    val processLogger = ProcessLogger((o: String) => buf.append(o).append("\n"),
      (e: String) => buf.append(e).append("\n"))
    val p = splitQuotedString(command).run(processLogger)
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          p.destroy()
          p.exitValue()
      }
    }
    val output = buf.toString().trim
    val PREVIEW_MAX_LENGTH = 200
    val preview = if (output.length > PREVIEW_MAX_LENGTH) {
      output.substring(0, PREVIEW_MAX_LENGTH) + "..."
    } else {
      output
    }

    LOG.debug(s"$sender <= `$preview` exit $retval")
    if (retval != 0) {
      throw new RuntimeException(
        s"exited ($retval) by executing `$command`")
    }
    output
  }
} 
Example 104
Source File: NumericalDataProducer.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.integrationtest.kafka

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.ByteArraySerializer
import org.apache.log4j.Logger

import org.apache.gearpump.streaming.serializer.ChillSerializer

class NumericalDataProducer(topic: String, bootstrapServers: String) {

  private val LOG = Logger.getLogger(getClass)
  private val producer = createProducer
  private val WRITE_SLEEP_NANOS = 10
  private val serializer = new ChillSerializer[Int]
  var lastWriteNum = 0

  def start(): Unit = {
    produceThread.start()
  }

  def stop(): Unit = {
    if (produceThread.isAlive) {
      produceThread.interrupt()
      produceThread.join()
    }
    producer.close()
  }

  
  // Numbers handed to the Kafka producer so far: 1 .. lastWriteNum (inclusive)
  def producedNumbers: Range = {
    Range(1, lastWriteNum + 1)
  }

  private def createProducer: KafkaProducer[Array[Byte], Array[Byte]] = {
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", bootstrapServers)
    new KafkaProducer[Array[Byte], Array[Byte]](properties,
      new ByteArraySerializer, new ByteArraySerializer)
  }

  private val produceThread = new Thread(new Runnable {
    override def run(): Unit = {
      try {
        while (!Thread.currentThread.isInterrupted) {
          lastWriteNum += 1
          val msg = serializer.serialize(lastWriteNum)
          val record = new ProducerRecord[Array[Byte], Array[Byte]](topic, msg)
          producer.send(record)
          Thread.sleep(0, WRITE_SLEEP_NANOS)
        }
      } catch {
        case ex: InterruptedException =>
          LOG.error("message producing is stopped by an interrupt")
      }
    }
  })
} 
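A usage sketch for NumericalDataProducer; the topic name and broker address are placeholders:

val producer = new NumericalDataProducer("numbers", "localhost:9092")
producer.start()           // writes 1, 2, 3, ... to the topic on a background thread
Thread.sleep(5000)         // let it produce for a while
producer.stop()            // interrupts the writer thread and closes the Kafka producer
println(s"wrote ${producer.producedNumbers.length} numbers")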
Example 105
Source File: StreamingKMeansSuite.scala    From spark-structured-streaming-ml   with Apache License 2.0 5 votes vote down vote up
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

case class TestRow(features: Vector)

class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())
    val streamingModels = batches.map { batch =>
      inputStream.addData(batch)
      query.processAllAvailable()
      skm.getModel
    }
    // TODO: use spark's testing suite
    streamingModels.last.centers.zip(trueCenters).foreach {
      case (center, trueCenter) =>
        val centers = center.toArray.mkString(",")
        val trueCenters = trueCenter.toArray.mkString(",")
        println(s"${centers} | ${trueCenters}")
        assert(center.toArray.zip(trueCenter.toArray).forall(
          x => math.abs(x._1 - x._2) < 0.1))
    }
    query.stop()
  }

  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }

}

object StreamingKMeansSuite {

  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    val rand = scala.util.Random
    rand.setSeed(seed)
    val centers = initCenters match {
      case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian())))
      case _ => initCenters
    }
    val data = (0 until numBatches).map { i =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
} 
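generateBatches draws k random Gaussian cluster centers and then, for each batch, samples points around them with spread r. A small sketch of calling it directly (the values are illustrative):

val (batches, centers) = StreamingKMeansSuite.generateBatches(
  numPoints = 50, numBatches = 10, k = 2, d = 3, r = 0.1, seed = 42)
println(s"generated ${batches.size} batches around ${centers.length} true centers")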
Example 106
Source File: SuspiciousConnects.scala    From oni-ml   with Apache License 2.0 5 votes vote down vote up
package org.opennetworkinsight

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.slf4j.LoggerFactory
import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
import org.opennetworkinsight.dns.DNSSuspiciousConnects
import org.opennetworkinsight.netflow.FlowSuspiciousConnects
import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis


// NOTE: the enclosing object declaration was stripped along with the original scaladoc;
// it is assumed to be the following so the snippet compiles.
object SuspiciousConnects {

  def main(args: Array[String]) {

    val parser = SuspiciousConnectsArgumentParser.parser

    parser.parse(args, SuspiciousConnectsConfig()) match {
      case Some(config) =>
        val logger = LoggerFactory.getLogger(this.getClass)
        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)

        val analysis = config.analysis
        val sparkConfig = new SparkConf().setAppName("ONI ML:  " + analysis + " lda")
        val sparkContext = new SparkContext(sparkConfig)
        val sqlContext = new SQLContext(sparkContext)
        implicit val outputDelimiter = OutputDelimiter

        analysis match {
          case "flow" => FlowSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "dns" => DNSSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
          case _ => println("ERROR:  unsupported (or misspelled) analysis: " + analysis)
        }

        sparkContext.stop()

      case None => println("Error parsing arguments")
    }

    System.exit(0)
  }


} 
Example 107
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
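This and the following pipeline objects all expect a pre-built VectorAssembler plus a DataFrame containing the assembler's input columns and a numeric label column. A hedged usage sketch; the SparkSession setup, column names and toy data are assumptions, not part of the original project:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("PipelineDemo").master("local[*]").getOrCreate()
import spark.implicits._

// A tiny, made-up DataFrame with three numeric feature columns and a binary label.
val df = Seq(
  (1.0, 0.5, 3.2, 0.0),
  (0.3, 1.7, 0.1, 1.0),
  (2.2, 0.9, 1.5, 1.0)
).toDF("f0", "f1", "f2", "label")

val assembler = new VectorAssembler()
  .setInputCols(Array("f0", "f1", "f2"))  // assumed feature column names
  .setOutputCol("features")

DecisionTreePipeline.decisionTreePipeline(assembler, df)

The same calling pattern applies to the NaiveBayes, LogisticRegression, RandomForest and GBT pipelines in the examples that follow.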
Example 108
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint


object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }

} 
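SVMPipeline only needs a SparkContext; note that the TSV path is hard-coded inside the method. A usage sketch, where the local master is an assumption for running outside a cluster:

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("SVMPipeline").setMaster("local[*]"))
SVMPipeline.svmPipeline(sc)  // reads the hard-coded train_noheader.tsv path inside the method
sc.stop()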
Example 109
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Example 110
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
} 
Example 111
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
} 
Example 112
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)


    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }

} 
Example 113
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }

} 
Example 114
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint


object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }

} 
Example 115
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
} 
Example 116
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
} 
Example 117
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
} 
Example 118
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Example 119
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }

} 
Example 120
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint


object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }

} 
Example 121
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Example 122
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
} 
Example 123
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{SparkSession, _}


object GeneralizedLinearRegressionPipeline {

  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = {
    val lr = new GeneralizedLinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s"  Root mean squared error (RMSE): $RMSE")
  }

  def genLinearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new GeneralizedLinearRegression()
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    // Fit the model
    val model = lr.fit(training)

    // Print the coefficients and intercept for generalized linear regression model
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")

    // Summarize the model over the training set and print out some metrics
    val summary = model.summary
    println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}")
    println(s"T Values: ${summary.tValues.mkString(",")}")
    println(s"P Values: ${summary.pValues.mkString(",")}")
    println(s"Dispersion: ${summary.dispersion}")
    println(s"Null Deviance: ${summary.nullDeviance}")
    println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}")
    println(s"Deviance: ${summary.deviance}")
    println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}")
    println(s"AIC: ${summary.aic}")
    println("Deviance Residuals: ")
    summary.residuals().show()  }

} 
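Both entry points of GeneralizedLinearRegressionPipeline are driven from a SparkSession-based program; genLinearRegressionWithSVMFormat loads the libsvm file from the path hard-coded above. A usage sketch (the app name and local master are assumptions); the LinearRegressionPipeline in the next example is invoked the same way:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("BikeSharingGLR")
  .master("local[*]")
  .getOrCreate()
GeneralizedLinearRegressionPipeline.genLinearRegressionWithSVMFormat(spark)
spark.stop()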
Example 124
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}


object LinearRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = {
    val lr = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s"  Root mean squared error (RMSE): $RMSE")
  }

  def linearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")

    println(s"r2: ${trainingSummary.r2}")
  }
} 
Example 125
Source File: SparseNaiveBayes.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
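SparseNaiveBayes is configured entirely through the scopt options defined above. A sketch of invoking it programmatically; the input path is a placeholder, and in practice the app is submitted via spark-submit, which supplies the master:

SparseNaiveBayes.main(Array(
  "--lambda", "1.0",
  "--numFeatures", "780",
  "data/sample_libsvm_data.txt"))  // placeholder path to LIBSVM-format input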
Example 126
Source File: DenseKMeans.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
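DenseKMeans takes the cluster count, an optional iteration count and init mode, and an input path of whitespace-separated dense vectors. A sketch of a programmatic invocation; the data path is a placeholder and, as above, the job is normally launched with spark-submit:

DenseKMeans.main(Array(
  "-k", "3",
  "--numIterations", "20",
  "--initMode", "Random",
  "data/kmeans_points.txt"))  // placeholder path: one space-separated vector per line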
Example 127
Source File: StreamingExamples.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


// NOTE: the enclosing object declaration was stripped along with the original scaladoc;
// in Spark's examples this helper lives in an object extending Logging (needed for logInfo).
object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
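A minimal sketch of how a streaming example would call the helper before creating its StreamingContext; the app name, master and batch interval are illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

StreamingExamples.setStreamingLogLevels()  // quiet the default INFO logging first
val conf = new SparkConf().setAppName("ExampleStream").setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(1))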
Example 128
Source File: YarnScheduler.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 129
Source File: BeforeAndAfterWithContext.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rpc.netty

import eleflow.uberdata.core.IUberdataContext
import eleflow.uberdata.core.util.ClusterSettings
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkEnv}
import org.scalatest.{BeforeAndAfterEach, Suite}

object TestSparkConf {
  def conf = {
    val sconf = new SparkConf()
    sconf.set("spark.app.name", "teste")
    sconf
  }

  val separator = ","

}


trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite =>

  val defaultFilePath = "src/test/resources/"
  import TestSparkConf._
  ClusterSettings.master = Some("local[*]")
  conf.set("spark.driver.allowMultipleContexts", "true")
  @transient val context = IUberdataContext.getUC(conf)

  override def beforeEach() = {
    setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka"))
  }

  def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = {
    loggers.map { loggerName =>
      val logger = Logger.getLogger(loggerName)
      val prevLevel = logger.getLevel
      logger.setLevel(level)
      loggerName -> prevLevel
    }.toMap
  }

  override def afterEach() = {
    val get = SparkEnv.get
    val rpcEnv =
      if (get != null) {
        Some(get.rpcEnv)
      } else None
    context.clearContext()
    //rpcEnv.foreach(
    //  _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown())


    System.clearProperty("spark.master.port")
  }
} 
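
setLogLevels returns the previous level of every logger it touches, which is what makes a later restore possible (the suite in the next example does exactly that in afterAll). A hedged sketch that packages the same save-set-restore idea as a small loan-pattern helper; LogLevelScope and withLevel are names invented for this sketch:

import org.apache.log4j.{Level, Logger}

object LogLevelScope {
  // Set `level` on the named loggers, run `body`, then restore whatever levels
  // (possibly null, i.e. "inherit from parent") were in place before.
  def withLevel[T](level: Level, loggerNames: Seq[String])(body: => T): T = {
    val saved = loggerNames.map { name =>
      val logger = Logger.getLogger(name)
      val previous = logger.getLevel
      logger.setLevel(level)
      name -> previous
    }
    try body
    finally saved.foreach { case (name, previous) => Logger.getLogger(name).setLevel(previous) }
  }
}

A call such as LogLevelScope.withLevel(Level.ERROR, Seq("org", "akka")) { /* noisy test body */ } then mirrors what the next example does by hand in beforeAll and afterAll.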
Example 130
Source File: MLLibSuite.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib

import org.scalatest.{BeforeAndAfterAll, FunSuite}

import org.apache.log4j.{Level, Logger}

import org.apache.spark.sql.{Row, SparkSession}

class MLLibSuite extends FunSuite with BeforeAndAfterAll {

  private var sparkSession: SparkSession = _
  var savedLevels: Map[String, Level] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()

    // Travis limits the size of the log file produced by a build. Because we do run a small
    // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
    // log level to ERROR, just for this suite, to avoid displeasing travis.
    savedLevels = Seq("akka", "org", "com.databricks").map { name =>
      val logger = Logger.getLogger(name)
      val curLevel = logger.getLevel
      logger.setLevel(Level.ERROR)
      name -> curLevel
    }.toMap
  }

  override def afterAll(): Unit = {
    savedLevels.foreach { case (name, level) =>
      Logger.getLogger(name).setLevel(level)
    }
    try {
      if (sparkSession != null) {
        sparkSession.stop()
      }
      // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown
      System.clearProperty("spark.driver.port")
      sparkSession = null
    } finally {
      super.afterAll()
    }
  }

  test("test MlLib benchmarks with mllib-small.yaml.") {
    val results = MLLib.run(yamlConfig = MLLib.smallConfig)
    val failures = results.na.drop(Seq("failure"))
    if (failures.count() > 0) {
      failures.select("name", "failure.*").collect().foreach {
        case Row(name: String, error: String, message: String) =>
          println(
            s"""There was a failure in the benchmark for $name:
               |  $error ${message.replace("\n", "\n  ")}
             """.stripMargin)
      }
      fail("Unable to run all benchmarks successfully, see console output for more info.")
    }
  }

  test("test before benchmark methods for pipeline benchmarks.") {
    val benchmarks = MLLib.getBenchmarks(MLLib.getConf(yamlConfig = MLLib.smallConfig))
    benchmarks.foreach { b =>
      b.beforeBenchmark()
    }
  }
} 
Example 131
Source File: DeleteFromTableEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil
import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore
import org.apache.spark.sql.secondaryindex.util.SecondaryIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{DeleteFromTablePostEvent, DeleteFromTablePreEvent, Event, OperationContext, OperationEventListener}


class DeleteFromTableEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case deleteFromTablePreEvent: DeleteFromTablePreEvent =>
        LOGGER.info("Delete from table pre event listener called")
        val carbonTable = deleteFromTablePreEvent.carbonTable
        // Should not allow delete on index table
        if (carbonTable.isIndexTable) {
          sys
            .error(s"Delete is not permitted on Index Table [${
              carbonTable
                .getDatabaseName
            }.${ carbonTable.getTableName }]")
        }
      case deleteFromTablePostEvent: DeleteFromTablePostEvent =>
        LOGGER.info("Delete from table post event listener called")
        val parentCarbonTable = deleteFromTablePostEvent.carbonTable
        val sparkSession = deleteFromTablePostEvent.sparkSession
        CarbonInternalMetastore
          .refreshIndexInfo(parentCarbonTable.getDatabaseName,
            parentCarbonTable.getTableName,
            parentCarbonTable)(
            sparkSession)
        val indexTableList = CarbonIndexUtil.getSecondaryIndexes(parentCarbonTable)
        if (!indexTableList.isEmpty) {
          val indexCarbonTableList = indexTableList.asScala.map { indexTableName =>
            CarbonEnv.getInstance(sparkSession).carbonMetaStore
              .lookupRelation(Option(parentCarbonTable.getDatabaseName), indexTableName)(
                sparkSession)
              .asInstanceOf[CarbonRelation].carbonTable
          }.toList
          SecondaryIndexUtil
            .updateTableStatusForIndexTables(parentCarbonTable, indexCarbonTableList.asJava)
        }
    }
  }
} 
Example 132
Source File: AlterTableDropColumnEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.sql.{CarbonEnv, SparkSession}
import org.apache.spark.sql.execution.command.AlterTableDropColumnModel
import org.apache.spark.sql.execution.command.index.DropIndexCommand
import org.apache.spark.sql.hive.CarbonRelation

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.events.{AlterTableDropColumnPreEvent, Event, OperationContext, OperationEventListener}

class AlterTableDropColumnEventListener extends OperationEventListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case alterTableDropColumnPreEvent: AlterTableDropColumnPreEvent =>
        LOGGER.info("alter table drop column event listener called")
        val carbonTable = alterTableDropColumnPreEvent.carbonTable
        val dbName = carbonTable.getDatabaseName
        val tableName = carbonTable.getTableName
        val tablePath = carbonTable.getTablePath
        val sparkSession = alterTableDropColumnPreEvent.sparkSession
        val alterTableDropColumnModel = alterTableDropColumnPreEvent.alterTableDropColumnModel
        dropApplicableSITables(dbName,
          tableName,
          tablePath,
          alterTableDropColumnModel)(sparkSession)
    }
  }

  private def dropApplicableSITables(dbName: String,
      tableName: String,
      tablePath: String,
      alterTableDropColumnModel: AlterTableDropColumnModel)
    (sparkSession: SparkSession) {
    var indexTableToDrop: Seq[String] = Seq.empty
    val catalog = CarbonEnv.getInstance(sparkSession).carbonMetaStore
    val parentCarbonTable = catalog.lookupRelation(Some(dbName), tableName)(sparkSession)
      .asInstanceOf[CarbonRelation].carbonTable
    val secondaryIndexMap =
      parentCarbonTable.getIndexesMap.get(IndexType.SI.getIndexProviderName)
    if (null == secondaryIndexMap) {
      // if secondary index map is empty, return
      return
    }
    secondaryIndexMap.asScala.foreach(indexTable => {
      val indexColumns = indexTable._2.asScala(CarbonCommonConstants.INDEX_COLUMNS).split(",")
      val colSize = alterTableDropColumnModel.columns.intersect(indexColumns).size
      if (colSize > 0) {
        if (colSize == indexColumns.size) {
          indexTableToDrop ++= Seq(indexTable._1)
        } else {
          sys
            .error(s"Index Table [${
              indexTable._1
            }] exists with combination of provided drop column(s) and other columns, drop " +
                   s"index table & retry")
        }
      }
    })
    indexTableToDrop.foreach { indexTable =>
      DropIndexCommand(ifExistsSet = true,
        Some(dbName),
        tableName,
        indexTable.toLowerCase,
        needLock = false).run(sparkSession)
    }
  }
} 
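
All of the CarbonData listeners in these examples obtain their org.apache.log4j.Logger through LogServiceFactory rather than by calling Logger.getLogger directly. A minimal sketch of that convention, assuming the carbondata-events classes used above are on the classpath; MyOperationListener and its onEvent body are placeholders, not CarbonData code:

import org.apache.log4j.Logger

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener}

class MyOperationListener extends OperationEventListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  override protected def onEvent(event: Event, operationContext: OperationContext): Unit = {
    LOGGER.info(s"received ${event.getClass.getSimpleName}")
  }
}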
Example 133
Source File: DeleteSegmentByDateListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil

import org.apache.carbondata.api.CarbonStore
import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{DeleteSegmentByDatePostEvent, Event, OperationContext, OperationEventListener}

class DeleteSegmentByDateListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {

    event match {
      case deleteSegmentPostEvent: DeleteSegmentByDatePostEvent =>
        LOGGER.info("Delete segment By date post event listener called")
        val carbonTable = deleteSegmentPostEvent.carbonTable
        val loadDates = deleteSegmentPostEvent.loadDates
        val sparkSession = deleteSegmentPostEvent.sparkSession
        CarbonIndexUtil.getSecondaryIndexes(carbonTable).asScala.foreach { tableName =>
          val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore
          val table = metastore
            .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession)
            .asInstanceOf[CarbonRelation].carbonTable
          CarbonStore
            .deleteLoadByDate(loadDates, carbonTable.getDatabaseName, table.getTableName, table)
        }
    }
  }
} 
Example 134
Source File: DropCacheSIEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.cache.CarbonDropCacheCommand
import org.apache.spark.sql.index.CarbonIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener}

object DropCacheSIEventListener extends OperationEventListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  override protected def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case dropCacheEvent: DropTableCacheEvent =>
        val carbonTable = dropCacheEvent.carbonTable
        val sparkSession = dropCacheEvent.sparkSession
        val internalCall = dropCacheEvent.internalCall
        if (carbonTable.isIndexTable && !internalCall) {
          throw new UnsupportedOperationException("Operation not allowed on child table.")
        }

        val allIndexTables = carbonTable.getIndexTableNames(
          IndexType.SI.getIndexProviderName)
        val dbName = carbonTable.getDatabaseName
        for (indexTableName <- allIndexTables.asScala) {
          try {
            val dropCacheCommandForChildTable =
              CarbonDropCacheCommand(
                TableIdentifier(indexTableName, Some(dbName)),
                internalCall = true)
            dropCacheCommandForChildTable.processMetadata(sparkSession)
          }
          catch {
            case e: Exception =>
              LOGGER.error(s"Clean cache for SI table $indexTableName failed. ", e)
          }
        }

    }
  }
} 
Example 135
Source File: ShowCacheSIEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.sql.CarbonEnv

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata
import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, ShowTableCacheEvent}

object ShowCacheSIEventListener extends OperationEventListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case showTableCacheEvent: ShowTableCacheEvent =>
        val carbonTable = showTableCacheEvent.carbonTable
        val sparkSession = showTableCacheEvent.sparkSession
        val internalCall = showTableCacheEvent.internalCall
        if (carbonTable.isIndexTable && !internalCall) {
          throw new UnsupportedOperationException("Operation not allowed on index table.")
        }

        val childTables = operationContext.getProperty(carbonTable.getTableUniqueName)
          .asInstanceOf[List[(String, String)]]

        val indexTables = carbonTable.getIndexTableNames(
          IndexType.SI.getIndexProviderName).asScala
        // if there are no index tables for a given fact table do not perform any action
        operationContext.setProperty(carbonTable.getTableUniqueName, indexTables.map {
          indexTable =>
            val indexCarbonTable = CarbonEnv.getCarbonTable(Some(carbonTable.getDatabaseName),
              indexTable)(sparkSession)
            (carbonTable.getDatabaseName + "-" +
             indexTable, "Secondary Index", indexCarbonTable.getTableId)
        }.toList ++ childTables)
    }
  }
} 
Example 136
Source File: UpdateTablePreEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.index.CarbonIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, UpdateTablePreEvent}

class UpdateTablePreEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case updateTablePreEvent: UpdateTablePreEvent =>
        LOGGER.info("Update table pre event listener called")
        val carbonTable = updateTablePreEvent.carbonTable
        // Should not allow update on index table
        if (carbonTable.isIndexTable) {
          sys
            .error(s"Update is not permitted on Index Table [${
              carbonTable
                .getDatabaseName
            }.${ carbonTable.getTableName }]")
        } else if (!carbonTable.getIndexesMap.isEmpty) {
          sys
            .error(s"Update is not permitted on table that contains secondary index [${
              carbonTable
                .getDatabaseName
            }.${ carbonTable.getTableName }]. Drop all indexes and retry")

        }
    }
  }
} 
Example 137
Source File: AlterTableRenameEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive._
import org.apache.spark.sql.index.CarbonIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.events.{AlterTableRenamePostEvent, Event, OperationContext, OperationEventListener}

class AlterTableRenameEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case alterTableRenamePreEvent: AlterTableRenamePostEvent =>
        LOGGER.info("alter table rename Pre event listener called")
        val alterTableRenameModel = alterTableRenamePreEvent.alterTableRenameModel
        val carbonTable = alterTableRenamePreEvent.carbonTable
        val sparkSession = alterTableRenamePreEvent.sparkSession
        val oldDatabaseName = carbonTable.getDatabaseName
        val newTableName = alterTableRenameModel.newTableIdentifier.table
        val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore
        val table: CarbonTable = metastore
          .lookupRelation(Some(oldDatabaseName), newTableName)(sparkSession)
          .asInstanceOf[CarbonRelation].carbonTable
        table.getIndexTableNames(IndexType.SI.getIndexProviderName)
          .asScala.map {
          entry =>
            CarbonSessionCatalogUtil.getClient(sparkSession).runSqlHive(
              s"ALTER TABLE $oldDatabaseName.${ entry } " +
              s"SET SERDEPROPERTIES ('parentTableName'='$newTableName')")
        }
    }
  }
} 
Example 138
Source File: CleanFilesPostEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.index.CarbonIndexUtil
import org.apache.spark.sql.optimizer.CarbonFilters

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.SegmentStatusManager
import org.apache.carbondata.events.{CleanFilesPostEvent, Event, OperationContext, OperationEventListener}

class CleanFilesPostEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case cleanFilesPostEvent: CleanFilesPostEvent =>
        LOGGER.info("Clean files post event listener called")
        val carbonTable = cleanFilesPostEvent.carbonTable
        val indexTables = CarbonIndexUtil
          .getIndexCarbonTables(carbonTable, cleanFilesPostEvent.sparkSession)
        indexTables.foreach { indexTable =>
          val partitions: Option[Seq[PartitionSpec]] = CarbonFilters.getPartitions(
            Seq.empty[Expression],
            cleanFilesPostEvent.sparkSession,
            indexTable)
          SegmentStatusManager.deleteLoadsAndUpdateMetadata(
            indexTable, true, partitions.map(_.asJava).orNull)
          CarbonUpdateUtil.cleanUpDeltaFiles(indexTable, true)
        }
    }
  }
} 
Example 139
Source File: DeleteSegmentByIdListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil

import org.apache.carbondata.api.CarbonStore
import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.events.{DeleteSegmentByIdPostEvent, Event, OperationContext, OperationEventListener}

class DeleteSegmentByIdListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case deleteSegmentPostEvent: DeleteSegmentByIdPostEvent =>
        LOGGER.info("Delete segment By id post event listener called")
        val carbonTable = deleteSegmentPostEvent.carbonTable
        val loadIds = deleteSegmentPostEvent.loadIds
        val sparkSession = deleteSegmentPostEvent.sparkSession
        val siIndexesMap = carbonTable.getIndexesMap
          .get(IndexType.SI.getIndexProviderName)
        if (null != siIndexesMap) {
          siIndexesMap.keySet().asScala.foreach { tableName =>
            val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore
            val table = metastore
              .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession)
              .asInstanceOf[CarbonRelation].carbonTable
            val tableStatusFilePath = CarbonTablePath.getTableStatusFilePath(table.getTablePath)
            // this check is added to verify if the table status file for the index table exists
            // or not. Delete on index tables is only to be called if the table status file exists.
            if (FileFactory.isFileExist(tableStatusFilePath)) {
              CarbonStore
                .deleteLoadById(loadIds, carbonTable.getDatabaseName, table.getTableName, table)
            }
          }
        }
    }
  }
} 
Example 140
Source File: CreateCarbonRelationEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{CreateCarbonRelationPostEvent, Event, OperationContext, OperationEventListener}

class CreateCarbonRelationEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case createCarbonRelationPostEvent: CreateCarbonRelationPostEvent =>
        LOGGER.debug("Create carbon relation post event listener called")
        val carbonTable = createCarbonRelationPostEvent.carbonTable
        val databaseName = createCarbonRelationPostEvent.carbonTable.getDatabaseName
        val tableName = createCarbonRelationPostEvent.carbonTable.getTableName
        val sparkSession = createCarbonRelationPostEvent.sparkSession
        CarbonInternalMetastore
          .refreshIndexInfo(databaseName,
            tableName,
            carbonTable,
            createCarbonRelationPostEvent.needLock)(sparkSession)
    }
  }
} 
Example 141
Source File: SILoadEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{CarbonEnv, SparkSession}
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil
import org.apache.spark.sql.secondaryindex.command.IndexModel

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata
import org.apache.carbondata.events._
import org.apache.carbondata.processing.loading.events.LoadEvents.LoadTablePreStatusUpdateEvent

class SILoadEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case _: LoadTablePreStatusUpdateEvent =>
        if (operationContext.getProperty("isAddLoad") != null &&
            operationContext.getProperty("isAddLoad").toString.toBoolean) {
          return
        }
        LOGGER.info("Load pre status update event-listener called")
        val loadTablePreStatusUpdateEvent = event.asInstanceOf[LoadTablePreStatusUpdateEvent]
        val carbonLoadModel = loadTablePreStatusUpdateEvent.getCarbonLoadModel
        val sparkSession = SparkSession.getActiveSession.get
        // when Si creation and load to main table are parallel, get the carbonTable from the
        // metastore which will have the latest index Info
        val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore
        val carbonTable = metaStore
          .lookupRelation(Some(carbonLoadModel.getDatabaseName),
            carbonLoadModel.getTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable
        val indexMetadata = carbonTable.getIndexMetadata
        val secondaryIndexProvider = IndexType.SI.getIndexProviderName
        if (null != indexMetadata && null != indexMetadata.getIndexesMap &&
            null != indexMetadata.getIndexesMap.get(secondaryIndexProvider)) {
          val indexTables = indexMetadata.getIndexesMap
            .get(secondaryIndexProvider).keySet().asScala
          // if there are no index tables for a given fact table do not perform any action
          if (indexTables.nonEmpty) {
            indexTables.foreach {
              indexTableName =>
                val secondaryIndex = IndexModel(Some(carbonTable.getDatabaseName),
                  indexMetadata.getParentTableName,
                  indexMetadata
                    .getIndexColumns(secondaryIndexProvider, indexTableName).split(",").toList,
                  indexTableName)

                val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore
                val indexTable = metaStore
                  .lookupRelation(Some(carbonLoadModel.getDatabaseName),
                    indexTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable

                CarbonIndexUtil
                  .LoadToSITable(sparkSession,
                    carbonLoadModel,
                    indexTableName,
                    isLoadToFailedSISegments = false,
                    secondaryIndex,
                    carbonTable, indexTable)
            }
          } else {
            logInfo(s"No index tables found for table: ${carbonTable.getTableName}")
          }
        } else {
          logInfo(s"Index information is null for table: ${carbonTable.getTableName}")
        }
    }
  }
} 
Example 142
Source File: SIRefreshEventListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.secondaryindex.events

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{Event, LookupRelationPostEvent, OperationContext, OperationEventListener}

class SIRefreshEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case lookupRelationPostEvent: LookupRelationPostEvent =>
        LOGGER.debug("SI Refresh post event listener called")
        val carbonTable = lookupRelationPostEvent.carbonTable
        val databaseName = lookupRelationPostEvent.carbonTable.getDatabaseName
        val tableName = lookupRelationPostEvent.carbonTable.getTableName
        val sparkSession = lookupRelationPostEvent.sparkSession
        CarbonInternalMetastore.refreshIndexInfo(databaseName, tableName, carbonTable)(sparkSession)
    }
  }
} 
Example 143
Source File: CarbonDropMVCommand.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command.view

import org.apache.log4j.Logger
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.AtomicRunnableCommand
import org.apache.spark.sql.execution.command.table.CarbonDropTableCommand

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
import org.apache.carbondata.view.{MVCatalogInSpark, MVManagerInSpark, UpdateMVPostExecutionEvent, UpdateMVPreExecutionEvent}


case class CarbonDropMVCommand(
    databaseNameOption: Option[String],
    name: String,
    ifExistsSet: Boolean,
    forceDrop: Boolean = false)
  extends AtomicRunnableCommand {

  private val logger = CarbonDropMVCommand.LOGGER

  private var dropTableCommand: CarbonDropTableCommand = _

  override def processMetadata(session: SparkSession): Seq[Row] = {
    setAuditInfo(Map("mvName" -> name))
    val viewManager = MVManagerInSpark.get(session)
    try {
      logger.info("Trying to drop materialized view schema")
      val databaseName =
        databaseNameOption.getOrElse(session.sessionState.catalog.getCurrentDatabase)
      val schema = viewManager.getSchema(databaseName, name)
      if (schema != null) {
        // Drop mv status.
        val databaseLocation = viewManager.getDatabaseLocation(databaseName)
        val systemDirectoryPath = CarbonProperties.getInstance()
          .getSystemFolderLocationPerDatabase(FileFactory
            .getCarbonFile(databaseLocation)
            .getCanonicalPath)
        val identifier = TableIdentifier(name, Option(databaseName))
        val operationContext = new OperationContext()
        OperationListenerBus.getInstance().fireEvent(
          UpdateMVPreExecutionEvent(session, systemDirectoryPath, identifier),
          operationContext)
        viewManager.onDrop(databaseName, name)
        OperationListenerBus.getInstance().fireEvent(
          UpdateMVPostExecutionEvent(session, systemDirectoryPath, identifier),
          operationContext)

        // Drop mv table.
        val dropTableCommand = CarbonDropTableCommand(
          ifExistsSet = true,
          Option(databaseName),
          name,
          dropChildTable = true,
          isInternalCall = true)
        dropTableCommand.processMetadata(session)

        // Drop mv schema.
        try {
          viewManager.deleteSchema(databaseName, name)
        } finally {
          val viewCatalog = viewManager.getCatalog()
            .asInstanceOf[MVCatalogInSpark]
          if (viewCatalog != null) {
            viewCatalog.deregisterSchema(schema.getIdentifier)
          }
        }

        this.dropTableCommand = dropTableCommand
      }
    } catch {
      case exception: Exception =>
        if (!ifExistsSet) {
          throw exception
        }
    }
    Seq.empty
  }

  override def processData(sparkSession: SparkSession): Seq[Row] = {
    // delete the table folder
    if (this.dropTableCommand != null) {
      this.dropTableCommand.processData(sparkSession)
    }
    Seq.empty
  }

  override protected def opName: String = "DROP MATERIALIZED VIEW"
}

object CarbonDropMVCommand {

  private val LOGGER: Logger = LogServiceFactory.getLogService(
    classOf[CarbonDropMVCommand].getCanonicalName)

} 
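
Unlike the listeners above, CarbonDropMVCommand keeps its Logger in the companion object and reads it through a private field, so every command instance shares a single logger. A sketch of that arrangement with an invented DropWorker class (not part of CarbonData):

import org.apache.log4j.Logger

import org.apache.carbondata.common.logging.LogServiceFactory

class DropWorker {
  // Each instance reuses the one Logger held by the companion object.
  private val logger = DropWorker.LOGGER

  def run(): Unit = logger.info("dropping ...")
}

object DropWorker {
  private val LOGGER: Logger = LogServiceFactory.getLogService(classOf[DropWorker].getCanonicalName)
}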
Example 144
Source File: CacheUtil.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command.cache

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.SparkSQLUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.index.Segment
import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, IndexSchema}
import org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope
import org.apache.carbondata.core.statusmanager.SegmentStatusManager
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.index.bloom.{BloomCacheKeyValue, BloomCoarseGrainIndexFactory}
import org.apache.carbondata.indexserver.IndexServer
import org.apache.carbondata.processing.merger.CarbonDataMergerUtil


object CacheUtil {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  def getAllIndexFiles(carbonTable: CarbonTable)(sparkSession: SparkSession): List[String] = {
    if (carbonTable.isTransactionalTable) {
      val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
      val validAndInvalidSegmentsInfo = new SegmentStatusManager(absoluteTableIdentifier)
        .getValidAndInvalidSegments(carbonTable.isMV)
      // Fire a job to clear the invalid segments cached in the executors.
      if (CarbonProperties.getInstance().isDistributedPruningEnabled(carbonTable.getDatabaseName,
        carbonTable.getTableName)) {
        val invalidSegmentIds = validAndInvalidSegmentsInfo.getInvalidSegments.asScala
          .map(_.getSegmentNo).toArray
        try {
          IndexServer.getClient
            .invalidateSegmentCache(carbonTable,
              invalidSegmentIds,
              SparkSQLUtil.getTaskGroupId(sparkSession))
        } catch {
          case e: Exception =>
            LOGGER.warn("Failed to clear cache from executors. ", e)
        }
      }
      validAndInvalidSegmentsInfo.getValidSegments.asScala.flatMap {
        segment =>
          segment.getCommittedIndexFile.keySet().asScala
      }.map { indexFile =>
        indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR,
          CarbonCommonConstants.FILE_SEPARATOR)
      }.toList
    } else {
      val tablePath = carbonTable.getTablePath
      val readCommittedScope = new LatestFilesReadCommittedScope(tablePath,
        FileFactory.getConfiguration)
      readCommittedScope.getSegmentList.flatMap {
        load =>
          val seg = new Segment(load.getLoadName, null, readCommittedScope)
          seg.getCommittedIndexFile.keySet().asScala
      }.map { indexFile =>
        indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR,
          CarbonCommonConstants.FILE_SEPARATOR)
      }.toList
    }
  }

  def getBloomCacheKeys(carbonTable: CarbonTable, indexSchema: IndexSchema): List[String] = {
    val segments = CarbonDataMergerUtil.getValidSegmentList(carbonTable).asScala

    // Generate shard Path for the indexSchema
    val shardPaths = segments.flatMap {
      segment =>
        BloomCoarseGrainIndexFactory.getAllShardPaths(carbonTable.getTablePath,
          segment.getSegmentNo, indexSchema.getIndexName).asScala
    }

    // get index columns
    val indexColumns = carbonTable.getIndexedColumns(indexSchema.getIndexColumns).asScala.map {
      entry =>
        entry.getColName
    }

    // generate cache key using shard path and index columns on which bloom was created.
    val indexKeys = shardPaths.flatMap {
      shardPath =>
        indexColumns.map {
          indexCol =>
            new BloomCacheKeyValue.CacheKey(shardPath, indexCol).toString
      }
    }
    indexKeys.toList
  }

} 
Example 145
Source File: DropCacheEventListeners.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.listeners

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.log4j.Logger
import org.apache.spark.sql.{CarbonEnv, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.cache.{CacheUtil, CarbonDropCacheCommand}

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.metadata.index.IndexType
import org.apache.carbondata.core.metadata.schema.table.IndexSchema
import org.apache.carbondata.core.view.MVSchema
import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener}
import org.apache.carbondata.view.MVManagerInSpark

object DropCacheMVEventListener extends OperationEventListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  
  override protected def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case dropCacheEvent: DropTableCacheEvent =>
        val carbonTable = dropCacheEvent.carbonTable
        val cache = CacheProvider.getInstance().getCarbonCache
        val indexProviderMap = carbonTable.getIndexesMap
        val bloomIndexProvider = IndexType.BLOOMFILTER.getIndexProviderName
        if (!indexProviderMap.isEmpty && null != indexProviderMap.get(bloomIndexProvider)) {
          val bloomIndexes = indexProviderMap.get(bloomIndexProvider)
          val bloomIndexIterator = bloomIndexes.entrySet().iterator()
          while (bloomIndexIterator.hasNext) {
            val bloomIndexEntry = bloomIndexIterator.next()
            val index = new IndexSchema(bloomIndexEntry.getKey, bloomIndexProvider)
            index.setProperties(bloomIndexEntry.getValue)
            try {
              // Get index keys
              val indexKeys = CacheUtil.getBloomCacheKeys(carbonTable, index)

              // remove index keys from cache
              cache.removeAll(indexKeys.asJava)
            } catch {
              case e: Exception =>
                LOGGER.warn(
                  s"Clean cache for Bloom index ${ index.getIndexName } failed.", e)
            }
          }
        }
    }
  }
} 
Example 146
Source File: CompactionTaskCompletionListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.rdd

import java.util

import org.apache.log4j.Logger
import org.apache.spark.TaskContext
import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonCompactionTaskCompletionListener
import org.apache.spark.sql.execution.command.management.CommonLoadUtils
import org.apache.spark.util.CollectionAccumulator

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.scan.result.iterator.RawResultIterator
import org.apache.carbondata.core.segmentmeta.SegmentMetaDataInfo
import org.apache.carbondata.processing.loading.TableProcessingOperations
import org.apache.carbondata.processing.loading.model.CarbonLoadModel
import org.apache.carbondata.processing.merger.{AbstractResultProcessor, CarbonCompactionExecutor, CarbonCompactionUtil}

class CompactionTaskCompletionListener(
    carbonLoadModel: CarbonLoadModel,
    exec: CarbonCompactionExecutor,
    processor: AbstractResultProcessor,
    rawResultIteratorMap: util.Map[String, util.List[RawResultIterator]],
    segmentMetaDataAccumulator: CollectionAccumulator[Map[String, SegmentMetaDataInfo]],
    queryStartTime: Long)
  extends CarbonCompactionTaskCompletionListener {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getName)

  override def onTaskCompletion(context: TaskContext): Unit = {
    deleteLocalDataFolders()
    // close all the query executor service and clean up memory acquired during query processing
    if (null != exec) {
      LOGGER.info("Cleaning up query resources acquired during compaction")
      exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.UNSORTED_IDX), queryStartTime)
      exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.SORTED_IDX), queryStartTime)
    }
    // clean up the resources for processor
    if (null != processor) {
      LOGGER.info("Closing compaction processor instance to clean up loading resources")
      processor.close()
    }
    // fill segment metadata to accumulator
    CommonLoadUtils.fillSegmentMetaDataInfoToAccumulator(carbonLoadModel.getTableName,
      carbonLoadModel.getSegmentId,
      segmentMetaDataAccumulator)
  }

  private def deleteLocalDataFolders(): Unit = {
    try {
      LOGGER.info("Deleting local folder store location")
      val isCompactionFlow = true
      TableProcessingOperations
        .deleteLocalDataLoadFolderLocation(carbonLoadModel, isCompactionFlow, false)
    } catch {
      case e: Exception =>
        LOGGER.error(e)
    }
  }

} 
Example 147
Source File: QueryExecutorWithLogging.scala    From variantsdwh   with Apache License 2.0 5 votes vote down vote up
package pl.edu.pw.ii.zsibio.dwh.benchmark.utils

import java.io.{File, FileOutputStream, PrintWriter}
import java.util.Calendar

import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.{ConnectDriver, EngineConnection, QueryResult}
import net.jcazevedo.moultingyaml._
import net.jcazevedo.moultingyaml.DefaultYamlProtocol
import net.jcazevedo.moultingyaml.DefaultYamlProtocol._
import org.apache.log4j.Logger
import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.ConnectDriver.Value
import pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryType.QueryType


case class Query(queryId:String, queryType:String, queryEngine:String, storageFormat:String,queryDesc:String,
                 statement:String)

object QueryType extends Enumeration {

  type QueryType = Value
  val SELECT, CREATE, UPDATE  = Value


}
object QueryExecutorWithLogging {
  val log = Logger.getLogger("pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryExecutorWithLogging")
  object QueryYamlProtocol extends DefaultYamlProtocol {
    implicit val queryFormat = yamlFormat6(Query)
  }


  def runStatement(query: Query, conn:EngineConnection, logFile:String, dryRun: Boolean) = {
    log.info(s"Running ${query.queryId} ... using ${query.queryEngine} engine")
    log.debug(s"Executing query: ${query.statement}")
    query.queryType.toLowerCase() match {
      case "select" => logQuery(conn, query, logFile, dryRun)
      case _ => conn.executeUpdate(query.statement.toLowerCase)
    }


  }

  def parseQueryYAML(file:String,storageType:String,connString:String, kuduMaster:String, dbName:String, ifExplain:Boolean = false)  : Query ={
    log.info(s"Parsing ${file}")
    val lines = scala.io.Source.fromFile(file).mkString
    val yml = lines.stripMargin.parseYaml
    import QueryYamlProtocol._
    queryPreprocess(yml.convertTo[Query], storageType, connString, kuduMaster, dbName, ifExplain)

  }

  private def logQuery(conn:EngineConnection, query: Query, logFile:String, dryRun:Boolean) ={
    val rs = conn.executeQuery(query.statement.toLowerCase,true)
    //rs.rs.next()
    val result = s"${Calendar.getInstance().getTime().toString},${query.queryId}," +
      s"${query.queryEngine},${query.storageFormat},${rs.timing.get.getTiming()},${dryRun.toString}\n"
    log.info(s"Result: ${result}")
    val writer = new PrintWriter(new FileOutputStream(new File(logFile),true))
    writer.write(result)
    writer.flush()
    writer.close()
  }

  private def queryPreprocess(query: Query, storageType: String, connString: String, kuduMaster: String, dbName: String, ifExplain: Boolean) = {
    def replaceVars(property:String) ={
      property
        .replaceAll("\\{\\{DATA_FORMAT\\}\\}",storageType.toLowerCase)
        .replaceAll("\\{\\{DB_NAME\\}\\}",dbName.toLowerCase)
        .replaceAll("\\{\\{KUDU_MASTER\\}\\}",kuduMaster )
        .replaceAll("\\{\\{IF_EXPLAIN\\}\\}", if(ifExplain) "EXPLAIN " else "")
        .replaceAll("\\{\\{PERCENTILE_APPROX\\}\\}", if(query.queryEngine.toLowerCase=="presto") "approx_percentile" else "percentile_approx")

    }
    query.copy(
      queryId = replaceVars(query.queryId),
      queryDesc = replaceVars(query.queryDesc),
      storageFormat = replaceVars(query.storageFormat),
      statement = replaceVars(query.statement.replaceAll(",",",\n").replaceAll("\\(","\\(  "))
    )
  }

} 
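
Besides its log4j output, logQuery above appends one comma-separated timing line per query to a separate results file. A stripped-down sketch of just that append step; the field values and the benchmark.log file name are made up for illustration:

import java.io.{File, FileOutputStream, PrintWriter}
import java.util.Calendar

object ResultLogDemo {
  def main(args: Array[String]): Unit = {
    // One CSV-style line per run: timestamp, query id, engine, format, timing, dry-run flag.
    val line = s"${Calendar.getInstance().getTime.toString},q01,presto,orc,1234,false\n"
    // FileOutputStream(file, true) opens the file in append mode, exactly as logQuery does.
    val writer = new PrintWriter(new FileOutputStream(new File("benchmark.log"), true))
    try writer.write(line) finally writer.close()
  }
}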
Example 148
Source File: SomeSQLOnTitanic.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.machinelearning.titanic

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SomeSQLOnTitanic {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

    def main (args:Array[String]): Unit = {
      val testFile = args(0)
      val trainFile = args(1)

      val isLocal = true

      val sparkSession = if (isLocal) {
        SparkSession.builder
          .master("local")
          .appName("my-spark-app")
          .config("spark.some.config.option", "config-value")
          .config("spark.driver.host","127.0.0.1")
          .config("spark.sql.parquet.compression.codec", "gzip")
          .enableHiveSupport()
          .getOrCreate()
      } else {
        SparkSession.builder
          .appName("my-spark-app")
          .config("spark.some.config.option", "config-value")
          .enableHiveSupport()
          .getOrCreate()
      }
      import sparkSession.implicits._

      //Load Data
      val trainDs = sparkSession.read.option("header", "true")
        .option("charset", "UTF8")
        .option("delimiter",",")
        .csv(trainFile)

      trainDs.createOrReplaceTempView("train")

      println("Sex -> Survived")
      sparkSession.sql("select Sex, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Sex").collect().foreach(println)

      println("Cabin -> Survived")
      sparkSession.sql("select substring(Cabin,1,1), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("Age -> Survived")
      sparkSession.sql("select round(cast(Age as Int) / 10) as age_block, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("PClass -> Survived")
      sparkSession.sql("select pclass, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by pclass order by 1").collect().foreach(println)

      println("Embarked -> Survived")
      sparkSession.sql("select Embarked, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Embarked order by 1").collect().foreach(println)

      println("Fare -> Survived")
      sparkSession.sql("select round((Fare / 10)), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("Survived -> Survived")
      sparkSession.sql("select sum(Survived), count(*) from train order by 1").collect().foreach(println)

      sparkSession.stop()
  }
} 
Example 149
Source File: ManyToManyNormalJoin.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.manytomany

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable

object ManyToManyNormalJoin {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val nGramWordCount = jsonDf.rdd.flatMap(r => {
      val actions = r.getAs[mutable.WrappedArray[Row]]("actions")

      val resultList = new mutable.MutableList[((Long, Long), Int)]

      actions.foreach(a => {
        val aValue = a.getAs[Long]("action")
        actions.foreach(b => {
          val bValue = b.getAs[Long]("action")
          if (aValue < bValue) {
            resultList.+=(((aValue, bValue), 1))
          }
        })
      })
      resultList.toSeq
    }).reduceByKey(_ + _)

    nGramWordCount.collect().foreach(r => {
      println(" - " + r)
    })
  }
} 
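
The flatMap above emits one ((a, b), 1) record for every ordered pair of actions with a < b and then sums them with reduceByKey. The same counting rule applied to a plain local collection, as a quick sketch without Spark (the action values are made up):

object PairCountsDemo {
  def main(args: Array[String]): Unit = {
    val actions = Seq(1L, 2L, 3L, 2L)
    val pairCounts = (for {
      a <- actions
      b <- actions
      if a < b               // keep each unordered pair once, as in the RDD version
    } yield (a, b)).groupBy(identity).map { case (pair, occurrences) => pair -> occurrences.size }
    pairCounts.foreach(println)   // ((1,2),2), ((1,3),1), ((2,3),2) in some order
  }
}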
Example 150
Source File: ManyToManyNestedJoin.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.manytomany

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable

object ManyToManyNestedJoin {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val nGramWordCount = jsonDf.rdd.flatMap(r => {
      val actions = r.getAs[mutable.WrappedArray[Row]]("actions")

      val resultList = new mutable.MutableList[(Long, NestedCount)]

      actions.foreach(a => {
        val aValue = a.getAs[Long]("action")
        val aNestedCount = new NestedCount
        actions.foreach(b => {
          val bValue = b.getAs[Long]("action")
          if (aValue < bValue) {
            aNestedCount.+=(bValue, 1)
          }
        })
        resultList.+=((aValue, aNestedCount))
      })
      resultList.toSeq
    }).reduceByKey((a, b) => a + b)

      //.reduceByKey(_ + _)

    nGramWordCount.collect().foreach(r => {
      println(" - " + r)
    })
  }
}


//1,2
//1,3
//1,4

//1 (2, 3, 4)

class NestedCount() extends Serializable{

  val map = new mutable.HashMap[Long, Long]()

  def += (key:Long, count:Long): Unit = {
    val currentValue = map.getOrElse(key, 0l)
    map.put(key, currentValue + count)
  }

  def + (other:NestedCount): NestedCount = {
    val result = new NestedCount

    other.map.foreach(r => {
      result.+=(r._1, r._2)
    })
    this.map.foreach(r => {
      result.+=(r._1, r._2)
    })
    result
  }

  override def toString(): String = {
    val stringBuilder = new StringBuilder
    map.foreach(r => {
      stringBuilder.append("(" + r._1 + "," + r._2 + ")")
    })
    stringBuilder.toString()
  }
} 
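
NestedCount is simply a mergeable map of counts, which is what lets it serve as the value type in the reduceByKey above. A quick standalone usage sketch, assuming the NestedCount class from this example is on the classpath:

object NestedCountDemo {
  def main(args: Array[String]): Unit = {
    val a = new NestedCount
    a.+=(2L, 1L)
    a.+=(3L, 1L)

    val b = new NestedCount
    b.+=(3L, 4L)

    // Merging sums the per-key counts; prints (2,1) and (3,5) in some order.
    println(a + b)
  }
}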
Example 151
Source File: SaltedExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.salted

import java.util.Random

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SaltedExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .getOrCreate()

    val jsonDfLeft = sparkSession.read.json(jsonPath)

    val saltedLeft = jsonDfLeft.rdd.flatMap(r => {
      val group = r.getAs[String]("group")
      val value = r.getAs[Long]("value")

      Seq((group + "_" + 0, value),(group + "_" + 1, value))
    })

    val jsonDfRight = sparkSession.read.json(jsonPath)

    val saltedRight = jsonDfRight.rdd.mapPartitions(it => {

      val random = new Random()

      it.map(r => {
        val group = r.getAs[String]("group")
        val value = r.getAs[Long]("value")

        (group + "_" + random.nextInt(2), value)
      })
    })

    jsonDfLeft.join(jsonDfRight).collect().foreach(r => {
      println("Normal.result:" + r)
    })
    println("----")
    saltedLeft.join(saltedRight).collect().foreach(r => {
      println("Salted.result:" + r)
    })
  }
} 
Example 152
Source File: SmallWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.windowing.small

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession


object SmallWindowing {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val timeDifRdd = jsonDf.rdd.map(row => {
      val group = row.getAs[String]("group")
      val time = row.getAs[Long]("time")
      val value = row.getAs[Long]("value")
      //(key  , value)
      (group, (time, value))
    }).groupByKey().flatMap{case (group, records) =>

      var lastValue = 0l

      val localList = records.toSeq
      println("localList.size:" + localList.size)
      localList.sortBy(_._1).map{case (time, value) =>
        val dif = value - lastValue
        lastValue = value
        (group, time, value, dif)
      }
    }

    timeDifRdd.take(10).foreach(r => {
      println(r)
    })

    sparkSession.stop()
  }
} 
Example 153
Source File: SuperBigWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.windowing.superbig

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.sql.SparkSession

object SuperBigWindowing {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val jsonPath = args(0)
    val pageSize = args(1).toInt

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = spark.read.json(jsonPath)

    import spark.implicits._

    val diffDs = jsonDf.flatMap(row => {
      val group = row.getAs[String]("group")
      val time = row.getAs[Long]("time")
      val value = row.getAs[Long]("value")

      val timePage = time / pageSize

      if (time %  pageSize == 0) { //Am I on the edge of the page
        Seq((timePage, (time, value)), (timePage + 1, (time, value)))
      } else {
        Seq((timePage, (time, value)))
      }
    }).groupByKey(r => r._1).flatMapGroups((k, it) => {
      var lastValue = 0l

      it.toSeq.
        sortBy{case (page, (time, value)) => time}.
        map{case (page, (time, value)) =>
        val dif = value - lastValue
        lastValue = value
        (time, value, dif)
      }
    })

    diffDs.collect().foreach(r => println(" - " + r))

    spark.stop()

  }
} 
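
The pageSize logic above splits each group's timeline into pages and emits any record that falls exactly on a page boundary to both its own page and the following one. A tiny sketch of just that page-assignment rule (the timestamps and page size are made up):

object PageAssignmentDemo {
  def main(args: Array[String]): Unit = {
    val pageSize = 10L
    // Mirror of the flatMap above: boundary records are emitted to two pages.
    def pagesFor(time: Long): Seq[Long] = {
      val page = time / pageSize
      if (time % pageSize == 0) Seq(page, page + 1) else Seq(page)
    }
    Seq(5L, 10L, 11L, 20L).foreach(t => println(s"$t -> ${pagesFor(t)}"))
  }
}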
Example 154
Source File: SessionWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable

object SessionWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val sessionJson = args(0)
    val timeGap = args(1).toInt

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val sessionDs = sparkSession.read.json(sessionJson).as[JsonLeadLag]

    sessionDs.createOrReplaceTempView("session_table")

    sparkSession.sql("select * from session_table").collect().foreach(println)

    val sessionDefinitionDf = sessionDs.rdd.map(r => {
      (r.group, r)
    }).groupByKey().flatMap{ case (group, jsonObjIt) =>

      var lastStart:Long = -1
      var lastEnd:Long = -1
      var sessionCount = 1
      var eventsInASession = 0

      val sessionList = new mutable.MutableList[SessionDefinition]

      jsonObjIt.toSeq.sortBy(r => r.ts).foreach(record => {
        val ts = record.ts

        if (lastStart == -1) {
          lastStart = ts
        } else if (ts > lastEnd + timeGap) {
          // the gap to the previous event is too large: close the current session
          sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession)
          lastStart = ts
          eventsInASession = 0
        }
        eventsInASession += 1
        lastEnd = ts
      })
      // emit the last open session of this group
      if (lastStart != -1) {
        sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession)
      }
      sessionList
    }

    sessionDefinitionDf.collect().foreach(println)

  }
}

case class SessionDefinition(group:String, sessionStart:Long, sessionEnd:Long, sessionLength:Long, sessionEvents:Int) 
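The sessionization rule used above (start a new session whenever the gap between consecutive timestamps exceeds timeGap) can be exercised on a plain Scala collection, without Spark. A minimal sketch that reuses the SessionDefinition case class from this example; the sample timestamps are made up:

def sessionize(group: String, timestamps: Seq[Long], timeGap: Long): Seq[SessionDefinition] = {
  val sorted = timestamps.sorted
  if (sorted.isEmpty) Seq.empty
  else {
    // split the sorted timestamps wherever the gap to the previous event exceeds timeGap
    val sessions = sorted.tail.foldLeft(List(List(sorted.head))) { (acc, ts) =>
      if (ts > acc.head.head + timeGap) List(ts) :: acc
      else (ts :: acc.head) :: acc.tail
    }
    sessions.reverse.map { s =>
      val events = s.reverse
      SessionDefinition(group, events.head, events.last, events.last - events.head, events.size)
    }
  }
}

// sessionize("g1", Seq(0L, 5L, 10L, 100L, 103L), timeGap = 20)
// -> SessionDefinition(g1,0,10,10,3), SessionDefinition(g1,100,103,3,2)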
Example 155
Source File: LeadLagExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object LeadLagExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "LEAD(value) OVER (PARTITION BY group ORDER BY ts) as v_after, " +
      "LAG(value)  OVER (PARTITION BY group ORDER BY ts) as v_before " +
      "FROM leadlag")

    leadLagDf.collect().foreach(println)

    leadLagDf.createOrReplaceTempView("leadlag_stage2")

    leadLagDf.printSchema()

    sparkSession.sql("select " +
      "group, ts, v_now, v_after, v_before, " +
      "case " +
      " when v_now < v_after and v_now < v_before then 'valley'" +
      " when v_now > v_after and v_now > v_before then 'peak'" +
      " else 'n/a' " +
      "end " +
      "from leadlag_stage2").collect().foreach(println)
  }
}

case class JsonLeadLag(group:String, ts:Long, value:Long) 
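The same lead/lag query can also be expressed with the DataFrame API instead of SQL. A minimal sketch, assuming the leadLag Dataset and sparkSession defined in this example:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, lag, lead}

val w = Window.partitionBy("group").orderBy("ts")

val leadLagApiDf = leadLag.toDF()
  .withColumnRenamed("value", "v_now")
  .withColumn("v_after", lead(col("v_now"), 1).over(w))
  .withColumn("v_before", lag(col("v_now"), 1).over(w))

leadLagApiDf.show(false)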
Example 156
Source File: TumblingWindows.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object TumblingWindows {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, " +
      "round(ts / 3), " +
      "avg(value), " +
      "max(value), " +
      "min(value) " +
      "FROM leadlag " +
      "group by 1,2")

    leadLagDf.collect().foreach(println)

  }
} 
Example 157
Source File: InfectionPointWindow.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object InfectionPointWindow {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val inflectionPointJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val inflectionPointDs = sparkSession.read.json(inflectionPointJson).as[JsonInfectionPoint]

    inflectionPointDs.createOrReplaceTempView("inflection_point")

    sparkSession.sql("select * from inflection_point").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " +
      "FROM inflection_point " +
      "where event_type = 'inflection'")

    leadLagDf.collect().foreach(println)

  }
}

case class JsonInfectionPoint(group:String, ts:Long, value:Long, event_type:String) 
Example 158
Source File: SplidingWindows.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SplidingWindows {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " +
      "FROM leadlag")

    leadLagDf.collect().foreach(println)

  }
} 
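The sliding window aggregation above has a DataFrame API equivalent built on rowsBetween. A minimal sketch, assuming the leadLag Dataset loaded in this example:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{avg, col, max, min}

val movingW = Window.orderBy("ts").rowsBetween(-3, Window.currentRow)

val slidingDf = leadLag.toDF()
  .withColumn("v_moving_avg", avg(col("value")).over(movingW))
  .withColumn("v_moving_min", min(col("value")).over(movingW))
  .withColumn("v_moving_max", max(col("value")).over(movingW))

slidingDf.show(false)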
Example 159
Source File: JsonNestedExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType}

import scala.collection.mutable

object JsonNestedExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val isLocal = args(0).equalsIgnoreCase("l")
    val jsonPath = args(1)
    val outputTableName = args(2)

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    val jsonDf = sparkSession.read.json(jsonPath)

    val localJsonDf = jsonDf.collect()

    println("--Df")
    jsonDf.foreach(row => {
      println("row:" + row)
    })
    println("--local")
    localJsonDf.foreach(row => {
      println("row:" + row)
    })

    jsonDf.createOrReplaceTempView("json_table")

    println("--Tree Schema")
    jsonDf.schema.printTreeString()
    println("--")
    jsonDf.write.saveAsTable(outputTableName)

    sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println)

    println("--")
    
    sparkSession.stop()
  }

  def populatedFlattedHashMap(row:Row,
                              schema:StructType,
                              fields:Array[StructField],
                              flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]],
                              parentFieldName:String): Unit = {
    fields.foreach(field => {

      println("field:" + field.dataType)
      if (field.dataType.isInstanceOf[ArrayType]) {
        val elementType = field.dataType.asInstanceOf[ArrayType].elementType
        if (elementType.isInstanceOf[StructType]) {
          val childSchema = elementType.asInstanceOf[StructType]

          val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq)

          populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".")
        }
      } else {
        val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any])
        fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name)))
      }

    })
  }
} 
Example 160
Source File: NestedTableExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object NestedTableExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)
    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
} 
Example 161
Source File: PopulateHiveTable.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}


object PopulateHiveTable {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .config("spark.sql.parquet.compression.codec", "gzip")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    populated1Df.repartition(2).write.saveAsTable("nested_populated")

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
} 
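Once nested_populated has been written, the array-of-struct column can be flattened with explode for row-per-element queries. A minimal sketch, assuming the spark session from this example:

import org.apache.spark.sql.functions.{col, explode}

val flattened = spark.table("nested_populated")
  .select(col("A"), col("B"), explode(col("nested")).as("n"))
  .select(col("A"), col("B"), col("n.nested_C"), col("n.nested_D"))

flattened.show(false)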
Example 162
Source File: CountingInAStreamExpUpdateStateByKey.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.dstream

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CountingInAStreamExpUpdateStateByKey {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1))
    ssc.checkpoint(checkpointFolder)

    val lines = ssc.socketTextStream(host, port.toInt)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(word => (word, 1))
      .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => {
        var value = state.getOrElse(0)
        values.foreach(i => {
          value += i
        })
        Some(value)
    })

    wordCounts.foreachRDD(rdd => {
      println("{")
      val localCollection = rdd.collect()
      println("  size:" + localCollection.length)
      localCollection.foreach(r => println("  " + r))
      println("}")
    })
    ssc.start()


    ssc.awaitTermination()


  }
} 
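This and the following socket-based streaming examples expect some process writing text lines to the configured host and port. A minimal feeder sketch using only the JDK; the port number and sample lines are assumptions:

import java.io.PrintWriter
import java.net.ServerSocket

object WordSocketServer {
  def main(args: Array[String]): Unit = {
    val port = 9999 // assumed port; pass the same host/port to the streaming job
    val server = new ServerSocket(port)
    println(s"Waiting for the streaming job to connect on port $port ...")
    val socket = server.accept()
    val out = new PrintWriter(socket.getOutputStream, true)
    val lines = Seq("spark streams count words", "words words words", "count the words in a stream")
    while (true) {
      lines.foreach { l => out.println(l); Thread.sleep(1000) }
    }
  }
}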
Example 163
Source File: CountingInAStreamExpBatchCounting.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.dstream

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CountingInAStreamExpBatchCounting {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2))
    ssc.checkpoint(checkpointFolder)

    val lines = ssc.socketTextStream(host, port.toInt)
    val words = lines.flatMap(line => line.toLowerCase.split(" "))
    val wordCounts = words.map(word => (word, 1))
      .reduceByKey((a,b) => a + b)

    wordCounts.foreachRDD(rdd => {
      println("{")
      val localCollection = rdd.collect()
      println("  size:" + localCollection.length)
      localCollection.foreach(r => println("  " + r))
      println("}")
    })

    ssc.start()

    ssc.awaitTermination()


  }
} 
Example 164
Source File: CountingInAStreamMapWithState.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

object CountingInAStreamMapWithState {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].
      flatMap(line => line.toLowerCase().split(" ")).
      map(word => WordCountEvent(word, 1))

    // Generate running word count
    val wordCounts = messageDs.groupByKey(tuple => tuple.word).
      mapGroupsWithState[WordCountInMemory, WordCountReturn](GroupStateTimeout.ProcessingTimeTimeout) {

      case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) =>
        var newCount = if (state.exists) state.get.countOfWord else 0

        events.foreach(tuple => {
          newCount += tuple.countOfWord
        })

        state.update(WordCountInMemory(newCount))

        WordCountReturn(word, newCount)
    }

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("update")
      .format("console")
      .start()

    query.awaitTermination()
  }
}

case class WordCountEvent(word:String, countOfWord:Int) extends Serializable {

}

case class WordCountInMemory(countOfWord: Int) extends Serializable {
}

case class WordCountReturn(word:String, countOfWord:Int) extends Serializable {

} 
Example 165
Source File: CountingInAStreamExpGroupBy.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.functions._

object CountingInAStreamExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].
      flatMap(line => line.toLowerCase().split(" "))

    // Generate running word count
    val wordCounts = messageDs.groupBy("value").count()

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
Example 166
Source File: CountingInAStreamDatasetExpGroupBy.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

object CountingInAStreamDatasetExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].map(line => {
      MessageBuilder.build(line)
    }).as[Message]

    val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price"))

    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", checkpointFolder)
      .outputMode("complete")
      .format("console")
      .start()

    ticketOutput.awaitTermination()
  }
} 
Example 167
Source File: CountingInAStreamExpWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}


object CountingInAStreamExpWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[5]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[5]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => {
      MessageBuilder.build(line._1, line._2)
    }).filter(r => r != null).as[Message]


    val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp"))
      .withWatermark("eventTime", "30 seconds")
      .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker")
      .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window")


    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .option("checkpointLocation", checkpointFolder)
      .outputMode("update")
      //.outputMode("complete")
      .format("console")
      .option("truncate", false)
      .option("numRows", 40)
      .start()

    ticketOutput.awaitTermination()
  }

} 
Example 168
Source File: ZombieExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object ZombieExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)(
      (vertexId, zombieState, message) => {
        if (message > 0 && !zombieState.isZombie) {
          new ZombieStats(true, message)
        } else {
          zombieState
        }
      }, triplet => {
        if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) {
          Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1L))
        } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) {
          Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1L))
        } else {
          Iterator.empty
        }
      }, (a, b) => Math.min(a, b))

    println("ZombieBite")
    zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",ZobmieStat:" + r._2)
    })

    sparkSession.stop()
  }
}

case class ZombieStats (isZombie:Boolean, lengthOfLife:Long) 
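The vertex and edge inputs are line-delimited JSON whose field names can be read off the accessors used above (vertex_id, is_zombie, time_alive and src, dst, edge_type). A minimal data-generation sketch with hypothetical case classes, values and output paths:

import org.apache.spark.sql.SparkSession

// hypothetical mirrors of the JsonVertex / JsonEdge shapes implied by the accessors above
case class VertexRecord(vertex_id: String, is_zombie: String, time_alive: Long)
case class EdgeRecord(src: Long, dst: Long, edge_type: String)

object ZombieGraphDataGen {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local").appName("zombie-data-gen").getOrCreate()
    import spark.implicits._

    val vertices = Seq(VertexRecord("1", "yes", 5L), VertexRecord("2", "no", 0L), VertexRecord("3", "no", 0L))
    val edges = Seq(EdgeRecord(1L, 2L, "bite"), EdgeRecord(2L, 3L, "bite"))

    vertices.toDS().write.mode("overwrite").json("/tmp/zombie_vertices") // hypothetical paths
    edges.toDS().write.mode("overwrite").json("/tmp/zombie_edges")
    spark.stop()
  }
}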
Example 169
Source File: TrianglesExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object TrianglesExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    println("TriangleCount")
    graph.triangleCount().vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",triangleCount:" + r._2)
    })

    graph.pageRank(1.1, 1.1)

    sparkSession.stop()
  }
} 
Example 170
Source File: RegressionMetricsSpark.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.evaluation

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.graphics.charts.Highcharts._
import org.apache.log4j.{Priority, Logger}
import org.apache.spark.Accumulator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart}


    histogram(residuals, numBins = 20)
    title("Histogram of Regression Residuals")
  }

}

object RegressionMetricsSpark {

  def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long)
  : (Double, Double, Double, Double) = {
    val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean")

    val err:DenseVector[Double] = scoresAndLabels.map((sc) => {
      val diff = sc._1 - sc._2
      mean += sc._2
      val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1),
        2)
      DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog)
    }).reduce((a,b) => a+b)

    val SS_res = err(1)

    val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value/size.toDouble)

    val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum()

    val rmse = math.sqrt(SS_res/size.toDouble)
    val mae = err(0)/size.toDouble
    val rsq = if (SS_tot != 0.0) 1 - (SS_res/SS_tot) else 0.0
    val rmsle = err(2)/size.toDouble
    (mae, rmse, rsq, rmsle)
  } 
  
} 
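A minimal usage sketch for computeKPIs, assuming an existing SparkContext named sc and a tiny made-up RDD of (prediction, label) pairs; it returns the mean absolute error, root mean squared error, coefficient of determination and root mean squared log error:

val scoresAndLabels = sc.parallelize(Seq((1.1, 1.0), (1.9, 2.0), (3.2, 3.0)))
val (mae, rmse, rsq, rmsle) = RegressionMetricsSpark.computeKPIs(scoresAndLabels, scoresAndLabels.count())
println(s"MAE = $mae, RMSE = $rmse, R^2 = $rsq, RMSLE = $rmsle")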
Example 171
Source File: RejectionSamplingScheme.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.probability

import breeze.stats.distributions.{Density, Rand}
import io.github.mandar2812.dynaml.pipes._
import io.github.mandar2812.dynaml.probability.distributions.GenericDistribution

import scala.util.Random
import org.apache.log4j.Logger




abstract class RejectionSamplingScheme[
ConditioningSet, Domain,
Dist <: Density[ConditioningSet] with Rand[ConditioningSet],
DistL <: Density[Domain] with Rand[Domain],
JointDist <: Density[(ConditioningSet, Domain)] with Rand[(ConditioningSet, Domain)],
Likelihood <: RandomVarWithDistr[Domain, DistL]](
  p: RandomVarWithDistr[ConditioningSet, Dist],
  c: DataPipe[ConditioningSet, Likelihood])
  extends RandomVarWithDistr[(ConditioningSet, Domain), JointDist]
    with BayesJointProbabilityScheme[
    ConditioningSet, Domain,
    RandomVarWithDistr[ConditioningSet, Dist],
    Likelihood] { self =>

  override val prior: RandomVarWithDistr[ConditioningSet, Dist] = p

  override val likelihood: DataPipe[ConditioningSet, Likelihood] = c

  var Max_Candidates: Int = 1000

  var Max_Estimations: Int = 10000

  override val sample = prior.sample >
  BifurcationPipe[ConditioningSet, ConditioningSet, Domain](
      (c: ConditioningSet) => (c, likelihood(c).draw)
    )

  override val posterior: DataPipe[Domain, RandomVariable[ConditioningSet]] =
    DataPipe((data: Domain) => {

    val sampl = this.prior.sample
    val q = this.prior.underlyingDist

    val M = (1 to Max_Estimations).map(i => {
      likelihood(sampl()).underlyingDist(data)
    }).sum/Max_Estimations.toDouble

    new RandomVarWithDistr[ConditioningSet, GenericDistribution[ConditioningSet]] { innerself =>

      val logger = Logger.getLogger(this.getClass)

      override val sample: DataPipe[Unit, ConditioningSet] = DataPipe(() => {
        var iterations = 0
        var accepted = false
        var accepted_sample: ConditioningSet = sampl()

        while(!accepted && iterations < Max_Candidates) {
          // generate a candidate
          val candidate = sampl()
          val a = underlyingDist(candidate)/(M*q(candidate))
          if(Random.nextDouble() <= a) {
            logger.info("... Sample Accepted ...")
            accepted = true
            accepted_sample = candidate
          }
          iterations += 1
        }

        accepted_sample
      })

      override val underlyingDist: GenericDistribution[ConditioningSet] = new GenericDistribution[ConditioningSet] {
        override def apply(x: ConditioningSet): Double =
          prior.underlyingDist(x)*likelihood(x).underlyingDist(data)

        override def draw() = innerself.sample()
      }

    }
  })

} 
Example 172
Source File: MOGPRegressionModel.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2}
import io.github.mandar2812.dynaml.probability.distributions.MatrixNormal
import org.apache.log4j.Logger


  override def dataAsSeq(data: Stream[(I, DenseVector[Double])]): Seq[((I, Int), Double)] =
    data.map((patternAndLabel) =>
      patternAndLabel._2.mapPairs((i, label) =>
        ((patternAndLabel._1, i), label)
      ).toArray.toSeq).reduceLeft((s1, s2) => s1 ++ s2)
}


class KroneckerMOGPModel[I](
  covFunc: LocalScalarKernel[I], noiseCovFunc: LocalScalarKernel[I], coRegCov: LocalScalarKernel[Int],
  data: Stream[(I, DenseVector[Double])], num: Int, numOutputs: Int,
  meanFunc: DataPipe[(I, Int), Double] = DataPipe((_: (I, Int)) => 0.0)) extends
  MOGPRegressionModel[I](covFunc:*coRegCov, noiseCovFunc:* coRegCov, data, num, numOutputs, meanFunc) {

  val (covFPipe, noiseCovPipe, coRegCovPipe) = (covFunc.asPipe, noiseCovFunc.asPipe, coRegCov.asPipe)

  override def energy(h: Map[String, Double], options: Map[String, String]): Double = {
    setState(h)

    val (features, targets) = data.unzip

    val covMatrix: DenseMatrix[Double] =
      covFunc
        .buildKernelMatrix(features, features.length)
        .getKernelMatrix()

    val noiseMatrix: DenseMatrix[Double] =
      noiseCovFunc
        .buildKernelMatrix(features, features.length)
        .getKernelMatrix()

    val colCovMatrix = coRegCov
      .buildKernelMatrix(0 until noutputs, noutputs)
      .getKernelMatrix()

    val meanMat: DenseMatrix[Double] = DenseMatrix.vertcat(
      features.map(instance =>
        DenseVector.tabulate[Double](noutputs)(o => mean((instance, o))).asDenseMatrix):_*
    )

    val mvn = MatrixNormal(meanMat, covMatrix+noiseMatrix, colCovMatrix)

    -mvn.logPdf(DenseMatrix.vertcat(targets.map(_.asDenseMatrix):_*))
  }
} 
Example 173
Source File: GPBasisFuncRegressionModel.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.{DenseMatrix, DenseVector, cholesky, trace, inv}
import breeze.numerics.{log, sqrt}
import io.github.mandar2812.dynaml.algebra._
import io.github.mandar2812.dynaml.analysis._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixSolvers._
import io.github.mandar2812.dynaml.kernels._
import io.github.mandar2812.dynaml.models.{ContinuousProcessModel, SecondOrderProcessModel}
import io.github.mandar2812.dynaml.optimization.GloballyOptWithGrad
import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2}
import io.github.mandar2812.dynaml.probability.{MultGaussianPRV, MultGaussianRV}
import org.apache.log4j.Logger

import scala.reflect.ClassTag


abstract class GPBasisFuncRegressionModel[T, I: ClassTag](
  cov: LocalScalarKernel[I], n: LocalScalarKernel[I],
  data: T, num: Int, basisFunc: DataPipe[I, DenseVector[Double]],
  basis_param_prior: MultGaussianRV) extends AbstractGPRegressionModel[T, I](
  cov, n, data, num) {

  val MultGaussianRV(b, covB) = basis_param_prior

  implicit val vf = VectorField(b.length)

  private lazy val lowB = cholesky(covB)

  private lazy val covBsolveb = lowB.t \ (lowB \ b)

  private lazy val h: PartitionedMatrix = PartitionedMatrix.horzcat(_blockSize)(trainingData.map(basisFunc(_)):_*)

  override val mean: DataPipe[I, Double] = basisFunc > DataPipe((h: DenseVector[Double]) => h.t * b)

  private val basisFeatureMap: DataPipe[I, DenseVector[Double]] = basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x)

  val feature_map_cov = CovarianceFunction(basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x))

  override protected def getTrainKernelMatrix[U <: Seq[I]] = {
    SVMKernel.buildPartitionedKernelMatrix(trainingData,
      trainingData.length, _blockSize, _blockSize,
      (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y) + noiseModel.evaluate(x, y)}
    )
  }

  override protected def getCrossKernelMatrix[U <: Seq[I]](test: U) =
    SVMKernel.crossPartitonedKernelMatrix(
      trainingData, test, _blockSize, _blockSize,
      (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)}
    )

  override protected def getTestKernelMatrix[U <: Seq[I]](test: U) =
    SVMKernel.buildPartitionedKernelMatrix(
      test, test.length.toLong,
      _blockSize, _blockSize,
      (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)}
    )


} 
Example 174
Source File: QuasiNewtonOptimizer.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseMatrix, DenseVector, inv}
import io.github.mandar2812.dynaml.pipes.DataPipe
import org.apache.log4j.Logger
import spire.implicits._


  override def optimize(
    nPoints: Long,
    ParamOutEdges: Stream[(DenseVector[Double], Double)],
    initialP: DenseVector[Double]): DenseVector[Double] =
    QuasiNewtonOptimizer.run(
      nPoints, this.regParam, this.numIterations,
      updater, gradient, this.stepSize, initialP,
      ParamOutEdges, DataPipe(identity[Stream[(DenseVector[Double], Double)]] _)
    )
}

object QuasiNewtonOptimizer {

  private val logger = Logger.getLogger(this.getClass)

  def run[T](
    nPoints: Long, regParam: Double, numIterations: Int,
    updater: HessianUpdater, gradient: Gradient, stepSize: Double,
    initial: DenseVector[Double], POutEdges: T,
    transform: DataPipe[T, Stream[(DenseVector[Double], Double)]],
    logging: Boolean = true, logging_rate: Int = 100): DenseVector[Double] = {

    var oldW: DenseVector[Double] = initial

    var newW = oldW
    val hessian = transform(POutEdges)
      .map(_._1)
      .map(x => DenseVector(x.toArray ++ Array(1.0)))
      .map(x => x*x.t)
      .reduce((x: DenseMatrix[Double],
               y: DenseMatrix[Double]) =>
        x + y)

    var regInvHessian = inv(hessian + DenseMatrix.eye[Double](initial.length)*regParam)
    var oldCumGradient = DenseVector.zeros[Double](initial.length)

    println("Performing Quasi-Newton Optimization")
    cfor(1)(iter => iter < numIterations, iter => iter + 1)( iter => {
      val cumGradient: DenseVector[Double] = DenseVector.zeros(initial.length)
      var cumLoss: Double = 0.0
      transform(POutEdges).foreach(ed => {
        val x = DenseVector(ed._1.toArray ++ Array(1.0))
        val y = ed._2
        cumLoss += gradient.compute(x, y, oldW, cumGradient)
      })

      if(logging && iter % logging_rate == 0) RegularizedOptimizer.prettyPrint(iter, cumLoss/nPoints.toDouble)

      //Find the search direction p = inv(H)*grad(J)
      //perform update x_new = x + step*p
      val searchDirection = regInvHessian*cumGradient*(-1.0)
      newW = updater.compute(oldW, searchDirection, stepSize, iter, regParam)._1

      regInvHessian = updater.hessianUpdate(regInvHessian, newW-oldW, cumGradient-oldCumGradient)
      oldW = newW
      oldCumGradient = cumGradient
    })
    newW
  }
} 
Example 175
Source File: GradientDescentSpark.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.DenseVector
import org.apache.log4j.{Logger, Priority}
import org.apache.spark.AccumulatorParam
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


  override def optimize(nPoints: Long, ParamOutEdges: RDD[LabeledPoint], initialP: DenseVector[Double])
  : DenseVector[Double] =
    GradientDescentSpark.runBatchSGD(
    nPoints,
    this.regParam,
    this.numIterations,
    this.updater,
    this.gradient,
    this.stepSize,
    initialP,
    ParamOutEdges,
    this.miniBatchFraction
  )
}

object GradientDescentSpark {

  private val logger = Logger.getLogger(this.getClass)

  def runBatchSGD(
                   nPoints: Long,
                   regParam: Double,
                   numIterations: Int,
                   updater: Updater,
                   gradient: Gradient,
                   stepSize: Double,
                   initial: DenseVector[Double],
                   POutEdges: RDD[LabeledPoint],
                   miniBatchFraction: Double): DenseVector[Double] = {
    var count = 1
    var oldW: DenseVector[Double] = initial
    var newW = oldW
    val sc = POutEdges.context
    val gradb = sc.broadcast(gradient)

    logger.log(Priority.INFO, "Training model using SGD")
    while(count <= numIterations) {
      val cumGradient =
        sc.accumulator(DenseVector.zeros[Double](initial.length))(new VectorAccumulator())
      val wb = sc.broadcast(oldW)
      POutEdges sample(withReplacement = false, fraction = miniBatchFraction) foreach
        ((ed) => {
          val features = DenseVector(ed.features.toArray)
          val label = ed.label
          val (g, _) = gradb.value.compute(features, label, wb.value)
          cumGradient += g
        })
      newW = updater.compute(oldW, cumGradient.value / nPoints.toDouble,
        stepSize, count, regParam)._1
      oldW = newW
      count += 1
    }
    newW
  }
}

class VectorAccumulator extends AccumulatorParam[DenseVector[Double]] {
  override def addInPlace(r1: DenseVector[Double],
                          r2: DenseVector[Double]): DenseVector[Double] = r1 + r2

  override def zero(initialValue: DenseVector[Double]): DenseVector[Double] =
    DenseVector.zeros(initialValue.length)
} 
Example 176
Source File: GradBasedGlobalOptimizer.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseVector, norm}
import org.apache.log4j.Logger


class GradBasedGlobalOptimizer[M <: GloballyOptWithGrad](model: M)
  extends GlobalOptimizer[M] {
  override val system: M = model

  override protected val logger: Logger = Logger.getLogger(this.getClass)

  override def optimize(initialConfig: Map[String, Double],
                        options: Map[String, String] = Map("tolerance" -> "0.0001",
                          "step" -> "0.005",
                          "maxIterations" -> "50"))
  : (M, Map[String, Double]) = {

    logger.info("Starting Maximum Likelihood based optimization: ML-II")
    println(
      "-----------------------------------------------------"+
      "-----------------------------------------------------")
    //Carry out gradient descent with step size alpha and
    //for a specified number of maximum iterations

    val tolerance = options("tolerance").toDouble
    val alpha = options("step").toDouble
    val maxit = options("maxIterations").toInt

    var count = 1
    var gradNorm = 1.0
    var working_solution = initialConfig

    logger.info("Starting state: \n"+GlobalOptimizer.prettyPrint(working_solution))

    do {
      val gradient = system.gradEnergy(working_solution)
      println(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+
          "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
      print("\n")
      logger.info("Gradient at "+count+" iterations is: \n"+GlobalOptimizer.prettyPrint(gradient))
      print("\n")
      gradNorm = norm(DenseVector(gradient.values.toArray), 2)

      working_solution = working_solution.zip(gradient).map((confAndGrad) => {
        val hyp = confAndGrad._1._1

        val gr:Double = if(confAndGrad._2._2 == Double.PositiveInfinity){
          1.0
        } else if(confAndGrad._2._2 == Double.NegativeInfinity){
          -1.0
        } else if(confAndGrad._2._2.isNaN){
          1.0
        } else {
          confAndGrad._2._2
        }

        val newValue = math.abs(confAndGrad._1._2 - alpha*gr)
        (hyp, newValue)
      })

      logger.info("Updated state : \n"+GlobalOptimizer.prettyPrint(working_solution))
      print("\n")
      println(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+
          "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

      count += 1
    } while (count < maxit && gradNorm >= tolerance)
    logger.info("Stopped ML-II at "+count+" iterations")
    logger.info("Final state : \n"+GlobalOptimizer.prettyPrint(working_solution))

    //Persist the current configuration to the model memory
    if(options.contains("persist") && (options("persist") == "true" || options("persist") == "1"))
      system.persist(working_solution)

    (system, working_solution)
  }
} 
Example 177
Source File: SVMKernelMatrix.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.kernels

import breeze.linalg.{DenseMatrix, DenseVector, eig, max, min}
import org.apache.log4j.{Logger, Priority}


  override def eigenDecomposition(dimensions: Int = this.dimension.toInt):
  (DenseVector[Double], DenseMatrix[Double]) = {
    logger.log(Priority.INFO, "Eigenvalue decomposition of the kernel matrix using JBlas.")
    val decomp = eig(this.kernel)
    logger.log(Priority.INFO, "Eigenvalue stats: "
      +min(decomp.eigenvalues)
      +" =< lambda =< "
      +max(decomp.eigenvalues)
    )
    (decomp.eigenvalues, decomp.eigenvectors)

  }

} 
Example 178
Source File: Test_example_CNN.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd,
  max => Bmax,
  min => Bmin,
  sum => Bsum
}
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    // 1. Create the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2. Load the training data
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))
    
    // 3. Set the training parameters and train the CNN model
    // opts: training options passed to CNNtrain
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4. Evaluate the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"CNNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }

  }
} 
Example 179
Source File: LRAccuracyTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package MLlib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}


object LRAccuracyTest {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
      l => LabeledPoint(l.label, l.features.toSparse))

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new SparseLogisticRegressionWithLBFGS()
      .setNumClasses(5)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)

    val precision = metrics.precision
    println("Precision = " + precision)


  }

} 
Example 180
Source File: MnistExample.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object MnistExample {


  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate()

    val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8)
      .map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr =>  Vectors.dense(arr.slice(1, 785)))

    val model = new KMeans()
      .setK(10)
      .setInitializationMode("random")
      .setMaxIterations(10)
      .run(trainRDD)

    println("final clusters:")
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))
  }

} 
Example 181
Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random


//spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

//guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

object ScalableKMeanTest {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
      vec
    }).cache()
    println(args.mkString(", "))
    println(data.count() + " records generated")

    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    } else {
      println("running mllib kmeans")
      val model = new KMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    }

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))

    sc.stop()
  }

} 
Example 182
Source File: MannWhitneyUTestSuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package test

import org.apache.commons.math3.stat.inference.MannWhitneyUTest
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}


object MannWhitneyUTestSuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    testMannWhitneyU
    testMannWhitneyUTest
  }

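  // Each test compares commons-math3 results on local arrays against the RDD-based Mann-Whitney implementation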
  private def testMannWhitneyU(): Unit ={
    val sample1 = Array(1d, 3d, 5, 7)
    val sample2 = Array(2, 4, 6, 8d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    val result = new MannWhitneyUTest()
      .mannWhitneyU(sample1, sample2)
    val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyU(rdd1, rdd2)
    assert(result == result2)
  }

  private def testMannWhitneyUTest(): Unit ={
    val sample1 = Array(1d, 3d, 5, 7)
    val sample2 = Array(2, 4, 6, 8d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    val result = new MannWhitneyUTest()
      .mannWhitneyUTest(sample1, sample2)
    val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyUTest(rdd1, rdd2)
    println(result)
    println(result2)
    assert(result == result2)
  }



} 
Example 183
Source File: TTestSuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.stat.inference.TestUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}



object TTestSuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    OneSampleTTest
    twoIndependentSampleTTest
    pairedTwoSampleTTest
  }

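  // Each check compares commons-math3 TestUtils results on local arrays against the RDD-based t-test implementations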
  def OneSampleTTest(): Unit ={
    val observed = Array(100d, 200d, 300d, 400d)
    val mu = 2.5d

    assert(TestUtils.tTest(mu, observed, 0.05) == new OneSampleTTest().tTest(mu, sc.parallelize(observed), 0.05))
    assert(TestUtils.tTest(mu, observed) == new OneSampleTTest().tTest(mu, sc.parallelize(observed)))
  }

  def twoIndependentSampleTTest(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 205d, 300d, 400d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    assert(TestUtils.tTest(sample1, sample2, 0.05) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2, 0.05))
    assert(TestUtils.tTest(sample1, sample2) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2))
  }

  def pairedTwoSampleTTest(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 202d, 300d, 400d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    assert(TestUtils.pairedTTest(sample1, sample2, 0.05) == new PairTwoSampleTTest().tTest(rdd1, rdd2, 0.05))
    assert(TestUtils.pairedTTest(sample1, sample2) == new PairTwoSampleTTest().tTest(rdd1, rdd2))
  }

} 
Example 184
Source File: ANOVASuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package test

import java.util

import main.ANOVA.OneWayANOVA
import org.apache.commons.math3.stat.inference.TestUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.stat.OneSampleTTest
import org.apache.spark.{SparkContext, SparkConf}


object ANOVASuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    OneWayANOVA
  }

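  // Compare commons-math3 one-way ANOVA F and p values against the RDD-based OneWayANOVA implementation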
  def OneWayANOVA(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 200d, 300d, 400d)
    val sample3 = Array(102d, 200d, 300d, 400d)
    val data = new util.ArrayList[Array[Double]]()
    data.add(sample1)
    data.add(sample2)
    data.add(sample3)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)
    val rdd3 = sc.parallelize(sample3)
    val rddData = Seq(rdd1, rdd2, rdd3)

    assert(TestUtils.oneWayAnovaFValue(data) == new OneWayANOVA().anovaFValue(rddData))
    assert(TestUtils.oneWayAnovaPValue(data) == new OneWayANOVA().anovaPValue(rddData))
  }

} 
Example 185
Source File: ProxyPlugin.scala    From AppCrawler   with Apache License 2.0 5 votes vote down vote up
package com.testerhome.appcrawler.plugin

import java.io.File

import com.brsanthu.googleanalytics.GoogleAnalytics
import com.testerhome.appcrawler.URIElement
import com.testerhome.appcrawler.Plugin
import net.lightbody.bmp.BrowserMobProxyServer
import net.lightbody.bmp.proxy.CaptureType
import org.apache.log4j.{BasicConfigurator, Level, Logger}

import scala.util.Try


class ProxyPlugin extends Plugin {
  private var proxy: BrowserMobProxyServer = _
  val port = 7777

  //todo: proxy support
  override def start(): Unit = {
    BasicConfigurator.configure()
    Logger.getRootLogger.setLevel(Level.INFO)
    Logger.getLogger("ProxyServer").setLevel(Level.WARN)

    proxy = new BrowserMobProxyServer()
    proxy.setHarCaptureTypes(CaptureType.getNonBinaryContentCaptureTypes)
    proxy.setTrustAllServers(true)
    proxy.start(port)

    //proxy.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes)
    //proxy.setHarCaptureTypes(CaptureType.getHeaderCaptureTypes)
    log.info(s"proxy server listen on ${port}")
    proxy.newHar("start")
  }

  override def beforeElementAction(element: URIElement): Unit = {
    log.info("clear har")
    proxy.endHar()
    // create a new har
    val harFileName = getCrawler().getBasePathName() + ".har"
    proxy.newHar(harFileName)
  }

  override def afterElementAction(element: URIElement): Unit = {
    log.info("save har")
    val harFileName = getCrawler().getBasePathName() + ".har"
    val file = new File(harFileName)
    try {
      log.info(proxy.getHar)
      log.info(proxy.getHar.getLog)
      log.info(proxy.getHar.getLog.getEntries.size())
      log.info(s"har entry size = ${proxy.getHar.getLog.getEntries.size()}")
      if (proxy.getHar.getLog.getEntries.size() > 0) {
        proxy.getHar.writeTo(file)
      }
    } catch {
      case e: Exception =>{
        log.error("read har error")
        log.error(e.getCause)
        log.error(e.getMessage)
        e.getStackTrace.foreach(log.error)
      }
    }

  }

  override def stop(): Unit ={
    log.info("prpxy stop")
    proxy.stop()
  }
} 
Example 186
Source File: TestGA.scala    From AppCrawler   with Apache License 2.0 5 votes vote down vote up
package com.testerhome.appcrawler.ut

import com.brsanthu.googleanalytics.{GoogleAnalytics, PageViewHit}
import org.apache.log4j.{BasicConfigurator, Level, Logger}
import org.scalatest.FunSuite


class TestGA extends FunSuite{
  test("google analyse"){
    println("ga start")

    BasicConfigurator.configure()
    Logger.getRootLogger().setLevel(Level.WARN)
    val ga = new GoogleAnalytics("UA-74406102-1")
    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/demo${x}", "test"))
    })
    Thread.sleep(10000)

    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem1${x}", "test"))
    })

    Thread.sleep(10000)
    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem2${x}", "test"))
    })
    //ga.post(new PageViewHit("http://appcrawler.io/test2", "test"))
    println("ga end")

  }

} 
Example 187
Source File: TestMacaca.scala    From AppCrawler   with Apache License 2.0 5 votes vote down vote up
package com.testerhome.appcrawler.it

import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.apache.log4j.Logger
import com.alibaba.fastjson.JSONObject
import macaca.client.MacacaClient



class TestMacaca extends FunSuite with BeforeAndAfterAll{

  val driver=new MacacaClient()
  override def beforeAll(): Unit = {

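    // Desired capabilities for the Android session: launch com.gotokeep.keep at its SplashActivity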
    val props = new JSONObject()
    props.put("autoAcceptAlerts", true)
    props.put("browserName", "")
    props.put("platformName", "android")
    props.put("package", "com.gotokeep.keep")
    props.put("activity", ".activity.SplashActivity")
    props.put("reuse", 3)

    val desiredCapabilities = new JSONObject()
    desiredCapabilities.put("desiredCapabilities", props)
    driver.initDriver(desiredCapabilities)

  }
  test("macaca android"){
    println(driver.source())
  }
  test("macaca chrome"){
    val props = new JSONObject()
    props.put("autoAcceptAlerts", true)
    props.put("browserName", "Chrome")
    props.put("platformName", "desktop") // android or ios

    props.put("javascriptEnabled", true)
    props.put("platform", "ANY")

    val desiredCapabilities = new JSONObject()
    desiredCapabilities.put("desiredCapabilities", props)
    driver.initDriver(desiredCapabilities)
    driver.get("http://www.baidu.com/")
  }

} 
Example 188
Source File: StreamHQL.scala    From spark-cep   with Apache License 2.0 5 votes vote down vote up
import java.util.Properties

import kafka.consumer.ConsumerConfig
import org.I0Itec.zkclient.ZkClient
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.streaming.StreamSQLContext
import org.apache.spark.sql.streaming.sources.MessageDelimiter
import org.apache.spark.streaming.dstream.ConstantInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.RedisManager

import scala.util.parsing.json.JSON

class TabDelimiter extends MessageDelimiter {
  override val delimiter = "\t"
}

object StreamDDL {
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val query = args(0)
    val sc = new SparkContext(new SparkConf())
    val ssc = new StreamingContext(sc, Seconds(1))
    val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc))
    streamSqlContext.command(query)
    new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print
    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop()
  }
}

object StreamHQL {

  object Redis {
    var initialized = false
    var manager: RedisManager = _
    def init(confMap: Map[String, String]) {
      if (initialized == false) {
        manager = new RedisManager(
          confMap("redis.shards"),
          confMap("redis.sentinels"),
          confMap("redis.database").toInt)
        manager.init
        initialized = true
      }
    }
  }

  def removeConsumerGroup(zkQuorum: String, groupId: String) {
    val properties = new Properties()
    properties.put("zookeeper.connect", zkQuorum)
    properties.put("group.id", groupId)
    val conf = new ConsumerConfig(properties)
    val zkClient = new ZkClient(conf.zkConnect)
    zkClient.deleteRecursive(s"/consumers/${conf.groupId}")
    zkClient.close()
  }

  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]]
    val qid = args(1)
    val query = args(2)
    val sc = new SparkContext(new SparkConf())
    val ssc = new StreamingContext(sc, Seconds(1))
    val hc = new HiveContext(sc)
    val streamSqlContext = new StreamSQLContext(ssc, hc)
    val redisExpireSec = confMap("redis.expire.sec").toInt
    ssc.checkpoint(s"checkpoint/$qid")
    hc.setConf("spark.streaming.query.id", qid)
    hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions"))

    removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid)
    val result = streamSqlContext.sql(query)
    val schema = result.schema

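    // For each batch, write one Redis hash per query/timestamp: field = all but the last column joined by ".", value = the last column, expiring after redisExpireSec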
    result.foreachRDD((rdd, time) => {
      rdd.foreachPartition(partition => {
        Redis.init(confMap)
        val jedis = Redis.manager.getResource
        val pipe = jedis.pipelined
        partition.foreach(record => {
          val seq = record.toSeq(schema)
          val ts = time.milliseconds / 1000
          val hkey = seq.take(seq.size - 1).mkString(".")
          pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString)
          pipe.expire(qid + "." + ts, redisExpireSec)
        })
        pipe.sync
        Redis.manager.returnResource(jedis)
      })
    })

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
} 
Example 189
Source File: ZeroMQWordCount.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming.zeromq

import scala.language.implicitConversions
import scala.util.Random

import org.apache.log4j.{Level, Logger}
import org.zeromq.ZContext
import org.zeromq.ZMQ
import org.zeromq.ZMQException
import org.zeromq.ZMsg

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.zeromq.ZeroMQUtils


object ZeroMQWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>")
      // scalastyle:on println
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath).
    Logger.getRootLogger.setLevel(Level.WARN)

    val Seq(url, topic) = args.toSeq
    val sparkConf = new SparkConf().setAppName("ZeroMQWordCount")

    // Check Spark configuration for master URL, set it to local if not present.
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    // Create the context and set the batch size.
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val lines = ZeroMQUtils.createTextStream(
      ssc, url, true, Seq(topic.getBytes)
    )
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 190
Source File: TwitterLocations.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import org.apache.log4j.{Level, Logger}
import twitter4j.FilterQuery

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._


object TwitterLocations {
  def main(args: Array[String]) {
    if (args.length < 4 || args.length % 4 != 0) {
      System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " +
        "<access token> <access token secret> " +
        "[<latitude-south-west> <longitude-south-west>" +
        " <latitude-north-east> <longitude-north-east> ...]")
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) {
      Logger.getRootLogger.setLevel(Level.WARN)
    }

    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

    // Get bounding boxes of locations for which to retrieve Tweets from command line
    val locationArgs = args.takeRight(args.length - 4)
    val boundingBoxes = if (locationArgs.length == 0) {
      System.out.println("No location bounding boxes specified, using defaults for New York City")
      val nycSouthWest = Array(-74.0, 40.0)
      val nycNorthEast = Array(-73.0, 41.0)
      Array(nycSouthWest, nycNorthEast)
    } else {
      locationArgs.map(_.toDouble).sliding(2, 2).toArray
    }

    val sparkConf = new SparkConf().setAppName("TwitterLocations")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val locationsQuery = new FilterQuery().locations(boundingBoxes : _*)

    // Print Tweets from the specified coordinates
    // This includes Tweets geo-tagged in the bounding box defined by the coordinates
    // As well as Tweets tagged in places inside of the bounding box
    TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery))
      .map(tweet => {
        val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}")
        val place = Option(tweet.getPlace).map(_.getName)
        val location = latitude.getOrElse(place.getOrElse("(no location)"))
        val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        s"$location\t$text"
      })
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 191
Source File: TwitterAlgebirdHLL.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._

object TwitterAlgebirdHLL {
  def main(args: Array[String]) {
    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

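    // Approximate the distinct user count with a HyperLogLog monoid (BIT_SIZE bits) and compare it against an exact Set of user ids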
    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll.create(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1
          ) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 192
Source File: TwitterPopularTags.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import org.apache.log4j.{Level, Logger}

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf


object TwitterPopularTags {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " +
        "<access token> <access token secret> [<filters>]")
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) {
      Logger.getRootLogger.setLevel(Level.WARN)
    }

    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
    val filters = args.takeRight(args.length - 4)

    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

    val sparkConf = new SparkConf().setAppName("TwitterPopularTags")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val stream = TwitterUtils.createStream(ssc, None, filters)

    val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
                     .map{case (topic, count) => (count, topic)}
                     .transform(_.sortByKey(false))

    val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
                     .map{case (topic, count) => (count, topic)}
                     .transform(_.sortByKey(false))


    // Print popular hashtags
    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 193
Source File: TrainNewsClassWithDTDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession


object TrainNewsClassWithDTDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with DT Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
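    // Each input line holds at least four "\u00ef"-separated fields: label, title, time, content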
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the decision tree model
    val params = new ClassParams
    val dtClassifier = new DecisionTreeClassifier()
      .setMinInfoGain(params.minInfoGain)
      .setMaxDepth(params.maxDepth) // Spark currently supports a maximum tree depth of 30
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Convert indexed predictions back to the original string labels
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(dtClassifier.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.DTModelPath)

    data.unpersist()
    spark.stop()
  }
} 
Example 194
Source File: PredictNewsClassDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}


object PredictNewsClassDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")
    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams

    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("模型类型错误!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // Load the trained model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    //=== model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= 评估结果 ==========")
    println(s"\n加权准确率:$precision")
    println(s"加权召回率:$recall")
    println(s"F1值:$f1")

    //    predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()

    spark.stop()
  }
} 
Example 195
Source File: TrainNewsClassWithLRDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession


object TrainNewsClassWithLRDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with LR Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the logistic regression model
    val params = new ClassParams
    val logisticRegression = new LogisticRegression()
      .setTol(params.converTol)
      .setMaxIter(params.maxIteration)
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Convert indexed predictions back to the original string labels
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(logisticRegression.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.LRModelPath)

    data.unpersist()
    spark.stop()
  }
} 
Example 196
Source File: StarsAnalysisDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}


object StarsAnalysisDemo {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"


    // Load the data, keep only the year and content fields, and clean up the content text
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }

        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop single-character terms, and keep the part-of-speech tag on each term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map { case Row(year: String, terms: Seq[String]) =>
      (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Find the celebrities (person-tagged terms) mentioned most often in 2016 news
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))

        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()

    stars2016.take(100).foreach(println)

    spark.stop()
  }
} 
Example 197
Source File: NLPPreprocessTest.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package nlp

import com.hankcs.hanlp.utility.Predefine
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.junit.Test

import scala.reflect.io.File


class NLPPreprocessTest {

  @Test
  def testSegmenter(): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Segment Demo")
      .getOrCreate()

    val text = Seq(
      (0, "这段文本是用来做分词测试的!This text is for test!"),
      (1, "江州市长江大桥参加长江大桥通车仪式"),
      (2, "他邀请了不少于10个明星,有:范冰冰、赵薇、周杰伦等,还有20几位商业大佬")
    )
    val sentenceData = spark.createDataFrame(text).toDF("id", "sentence")

    // Set the HanLP configuration file path; by default it is resolved from the classpath
    val path = this.getClass.getClassLoader.getResource("").getPath
    Predefine.HANLP_PROPERTIES_PATH = path + File.separator + "hanlp.properties"

    val segmenter = new Segmenter()
      .isDelEn(true)
      .isDelNum(true)
      .isAddNature(true)
      .setSegType("StandardSegment")
      .setMinTermLen(2)
      .setMinTermNum(3)
      .setInputCol("sentence")
      .setOutputCol("segmented")

    segmenter.transform(sentenceData).show(false)

    spark.stop()
  }
} 
Example 198
Source File: Schema.scala    From osmesa   with Apache License 2.0 5 votes vote down vote up
package osmesa.analytics.updater

import java.sql.Timestamp
import java.time.Instant

import geotrellis.vectortile.Layer
import org.apache.log4j.Logger
import osmesa.analytics.updater.Implicits._

trait Schema {
  val layer: Layer
  val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]

  val newFeatures: Seq[VTFeature]
  lazy val replacementFeatures: Seq[VTFeature] = Seq.empty[VTFeature]
  lazy val retainedFeatures: Seq[VTFeature] = Seq.empty[VTFeature]

  protected lazy val logger: Logger = Logger.getLogger(getClass)

  protected lazy val touchedFeatures: Map[String, Seq[VTFeature]] =
    Map.empty[String, Seq[VTFeature]]

  protected lazy val versionInfo: Map[String, (Int, Int, Timestamp)] =
    touchedFeatures
      .mapValues(_.last)
      .mapValues(
        f =>
          (
            f.data("__version").toInt,
            f.data("__minorVersion").toInt,
            Timestamp.from(Instant.ofEpochMilli(f.data("__updated")))
        ))

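  // Minor version resets to 0 when the upstream feature version increases; otherwise the previous minor version is incremented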
  protected lazy val minorVersions: Map[String, Int] =
    features
      .mapValues {
        case (_, curr) => curr.data
      }
      .map {
        case (id, f) =>
          versionInfo.get(id) match {
            case Some((prevVersion, _, _)) if prevVersion < f.version => (id, 0)
            case Some((prevVersion, prevMinorVersion, _)) if prevVersion == f.version =>
              (id, prevMinorVersion + 1)
            case _ => (id, 0)
          }
      }
}

trait SchemaBuilder {
  val layerName: String

  def apply(layer: Layer,
            features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]): Schema
} 
Example 199
Source File: printMatrix.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}
import breeze.linalg.{DenseMatrix => BDM, kron}


object printMatrix {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))(0)))

    val lines2 = sc.textFile("dataset/train.format", 8)
    val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0)))

    data2.take(10).foreach(record =>{
      println("label: " + record._1)
      val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt))
      val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*')
      println(str)
    })

  }
} 
Example 200
Source File: Example.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.{SparkConf, SparkContext}
import breeze.linalg.{DenseMatrix => BDM, _}

object Example {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/train.format", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784)))))

    val topology = new CNNTopology
    topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4)))
    val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16)
    val start = System.nanoTime()
    cnn.trainOneByOne(data)
    println("Training time: " + (System.nanoTime() - start) / 1e9)

    val right = data.map(record =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    }).sum()
    println(s"Predicting precision: $right " + right.toDouble/(data.count()))

//    val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8)
//      .map(line => line.split(",")).map(arr => arr.map(_.toDouble))
//      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))))

    val rightM = data.map(record =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    }).sum()
    println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count()))
  }

  
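  // Reshape a flat 784-element vector (a 28x28 image) into a single-channel tensor of one Breeze dense matrix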
  def Vector2Tensor(record: Vector): Array[BDM[Double]] = {
    val mapSize = new Scale(28, 28)
    val m = new BDM[Double](mapSize.x, mapSize.y)
    var i: Int = 0
    while (i < mapSize.x) {
      var j: Int = 0
      while (j < mapSize.y) {
        m(i, j) = record(mapSize.x * i + j)
        j += 1
      }
      i += 1
    }
    Array(m)
  }


}