org.apache.log4j.Logger Scala Examples
The following examples show how to use org.apache.log4j.Logger in Scala. Each example is taken from an open-source project; the source file, project, license, and vote count are noted above the code.
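Before the project examples, here is a minimal, self-contained sketch of the pattern that recurs throughout this page: obtaining a log4j Logger for the current class, adjusting log levels, and emitting messages. The object name and the log messages are illustrative only.

import org.apache.log4j.{Level, Logger}

object LoggerUsageSketch {
  // Obtain a logger named after the current class, as most examples below do.
  val logger: Logger = Logger.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // Silence noisy third-party packages by raising their log level...
    Logger.getLogger("org").setLevel(Level.ERROR)
    // ...or raise the root logger level for everything at once.
    Logger.getRootLogger.setLevel(Level.WARN)

    logger.info("application started")
    logger.debug("only printed when DEBUG is enabled")
  }
}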
Example 1
Source File: WortschatzParser.scala From dbpedia-spotlight-model with Apache License 2.0 | 7 votes |
package org.dbpedia.spotlight.io

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger

import scala.io.Source

object WortschatzParser {

  val LOG = Logger.getLogger(this.getClass)

  def parse(filename: String): CompactHashSet[String] = {
    parse(filename, count => true)
  }

  def parse(filename: String, minimumCount: Int): CompactHashSet[String] = {
    parse(filename, count => count > minimumCount)
  }

  def parse(filename: String, minimumCount: Int, maximumCount: Int): CompactHashSet[String] = {
    parse(filename, count => (count > minimumCount) && (count < maximumCount))
  }

  def parse(filename: String, condition: Int => Boolean): CompactHashSet[String] = {
    LOG.info(" parsing common words file ")
    // get lines, split in three fields, get the middle one (word)
    val commonWords = new CompactHashSet[String]()
    Source.fromFile(filename, "iso-8859-1").getLines.foreach(line => {
      if (line.trim() != "") {
        val fields = line.split("\\s")
        if (condition(fields(2).toInt)) commonWords.add(fields(1))
      }
    })
    commonWords
  }
}
Example 2
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 3
Source File: HttpUtil.scala From sparta with Apache License 2.0 | 6 votes |
package com.stratio.benchmark.generator.utils import org.apache.http.HttpStatus import org.apache.http.client.methods.{HttpDelete, HttpGet, HttpPost, HttpPut} import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.util.EntityUtils import org.apache.log4j.Logger import org.json4s.DefaultFormats import org.json4s.native.JsonMethods._ import scala.io.Source trait HttpUtil { private val logger = Logger.getLogger(this.getClass) def createPolicy(policyContent: String, endpoint: String)(implicit defaultFormats: DefaultFormats): String = { val policyName = (parse(policyContent) \ "name").extract[String] // If the policy exists when it launches the benchmark, it should stop and delete it. getPolicyId(policyName, endpoint) match { case Some(id) => stopPolicy(id, endpoint) deletePolicy(id, endpoint) case None => logger.debug(s"No policy with name $policyName exists in Sparta yet.") } val client = HttpClientBuilder.create().build() val post = new HttpPost(s"$endpoint/policyContext") post.setHeader("Content-type", "application/json") post.setEntity(new StringEntity(policyContent)) val response = client.execute(post) if(response.getStatusLine.getStatusCode != HttpStatus.SC_OK) throw new IllegalStateException(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") else { val entity = response.getEntity val policyId = (parse(EntityUtils.toString(entity)) \ "policyId").extract[String] policyId } } def getPolicyId(name: String, endpoint: String)(implicit defaultFormats: DefaultFormats): Option[String] = { val client = HttpClientBuilder.create().build() val get = new HttpGet(s"$endpoint/policy/findByName/$name") val response = client.execute(get) response.getStatusLine.getStatusCode match { case HttpStatus.SC_OK => Option((parse(EntityUtils.toString(response.getEntity)) \ "id").extract[String]) case _ => None } } def stopPolicy(id: String, endpoint: String): Unit = { val client = HttpClientBuilder.create().build() val put = new HttpPut(s"$endpoint/policyContext") put.setHeader("Content-Type", "application/json") val entity = new StringEntity(s"""{"id":"$id", "status":"Stopping"}""") put.setEntity(entity) val response = client.execute(put) if(response.getStatusLine.getStatusCode != HttpStatus.SC_CREATED) { logger.info(Source.fromInputStream(response.getEntity.getContent).mkString("")) logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") } } def deletePolicy(id: String, endpoint: String): Unit = { val client = HttpClientBuilder.create().build() val delete = new HttpDelete(s"$endpoint/policy/$id") val response = client.execute(delete) if(response.getStatusLine.getStatusCode != HttpStatus.SC_OK) logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") } }
Example 4
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 5
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 6
Source File: JDBCSink.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
 * Writes data produced by Structured Streaming into MySQL.
 */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"

    println("process step two")
    try {
      // check whether the record already exists
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      case ex: SQLException => println("SQLException")
      case ex: Exception => println("Exception")
      case ex: RuntimeException => println("RuntimeException")
      case ex: Throwable => println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    // release JDBC resources only if they were actually created
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
Example 7
Source File: MySqlPool.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, DriverManager}
import java.util

import org.apache.log4j.{LogManager, Logger}

/**
 * Obtains JDBC connections from a simple MySQL connection pool.
 */
class MySqlPool(url: String, user: String, pwd: String) extends Serializable {
  // maximum number of connections the pool may create
  private val max = 3
  // number of connections created per request
  private val connectionNum = 1
  // number of connections created so far
  private var conNum = 0
  // the pool itself
  private val pool = new util.LinkedList[Connection]()

  val LOGGER: Logger = LogManager.getLogger("vita")

  // get a connection
  def getJdbcConn(): Connection = {
    LOGGER.info("getJdbcConn")
    // synchronized block; AnyRef is the base class of all reference types, AnyVal of all value types
    AnyRef.synchronized({
      if (pool.isEmpty) {
        // load the driver
        preGetConn()
        for (i <- 1 to connectionNum) {
          val conn = DriverManager.getConnection(url, user, pwd)
          pool.push(conn)
          conNum += 1
        }
      }
      pool.poll()
    })
  }

  // release a connection back to the pool
  def releaseConn(conn: Connection): Unit = {
    pool.push(conn)
  }

  // load the driver
  private def preGetConn(): Unit = {
    // throttle connection creation
    if (conNum < max && !pool.isEmpty) {
      LOGGER.info("Jdbc Pool has no connection now, please wait a moments!")
      Thread.sleep(2000)
      preGetConn()
    } else {
      Class.forName("com.mysql.jdbc.Driver")
    }
  }
}
Example 8
Source File: StructuredStreamingOffset.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // Kafka topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init Redis
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)

    // read the last committed offset from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]
    val content = lines.map(x => readLogs(x._1, x._2.toString))
    val count = content.toDF("context", "offset")

    // foreach sink that records the Kafka offset
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 9
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils

          spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --algorithm LR --regType L2 --regParam 1.0 \ | data/mllib/sample_binary_classification_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"BinaryClassification with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val model = params.algorithm match { case LR => val algorithm = new LogisticRegressionWithLBFGS() algorithm.optimizer .setNumIterations(params.numIterations) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() case SVM => val algorithm = new SVMWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() } val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val metrics = new BinaryClassificationMetrics(predictionAndLabel) println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") sc.stop() } } // scalastyle:on println
Example 11
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 12
Source File: StreamingExamples.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 13
Source File: YarnScheduler.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 14
Source File: DLClassifierLeNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _} import com.intel.analytics.bigdl.dlframes.DLClassifier import com.intel.analytics.bigdl.models.lenet.LeNet5 import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.ClassNLLCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext object DLClassifierLeNet { LoggerFilter.redirectSparkInfoLogs() def main(args: Array[String]): Unit = { val inputs = Array[String]("Feature data", "Label data") trainParser.parse(args, new TrainParams()).foreach(param => { val conf = Engine.createSparkConf() .setAppName("MLPipeline Example") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) val sqLContext = SQLContext.getOrCreate(sc) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1) val trainingRDD : RDD[Data[Float]] = trainSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => { val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) }) val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*) val model = LeNet5(classNum = 10) val criterion = ClassNLLCriterion[Float]() val featureSize = Array(28, 28) val estimator = new DLClassifier[Float](model, criterion, featureSize) .setFeaturesCol(inputs(0)) .setLabelCol(inputs(1)) .setBatchSize(param.batchSize) .setMaxEpoch(param.maxEpoch) val transformer = estimator.fit(trainingDF) val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1) val validationRDD: RDD[Data[Float]] = validationSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch => val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) } val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*) val transformed = transformer.transform(validationDF) transformed.show() sc.stop() }) } } private case class Data[T](featureData : Array[T], labelData : Array[T])
Example 15
Source File: ImagePredictor.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification import java.nio.file.Paths import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dlframes.DLClassifierModel import com.intel.analytics.bigdl.example.imageclassification.MlUtils._ import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object ImagePredictor { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO) def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).map(param => { val conf = Engine.createSparkConf() conf.setAppName("Predict with trained model") val sc = new SparkContext(conf) Engine.init val sqlContext = new SQLContext(sc) val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val model = loadModel(param) val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize)) .setBatchSize(param.batchSize) .setFeaturesCol("features") .setPredictionCol("predict") val valRDD = if (param.isHdfs) { // load image set from hdfs imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true) } else { // load image set from local val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false) sc.parallelize(imagesLoad(paths, 256), partitionNum) } val transf = RowToByteRecords() -> BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize) -> BGRImgNormalizer(testMean, testStd) -> BGRImgToImageVector() val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf) valTrans.transform(valDF) .select("imageName", "predict") .collect() .take(param.showNum) .foreach(println) sc.stop() }) } }
Example 16
Source File: RowToByteRecords.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification

import com.intel.analytics.bigdl.dataset.{ByteRecord, Transformer}
import org.apache.log4j.Logger
import org.apache.spark.sql.Row

import scala.collection.Iterator

object RowToByteRecords {
  val logger = Logger.getLogger(getClass)

  def apply(colName: String = "data"): RowToByteRecords = {
    new RowToByteRecords(colName)
  }
}

class RowToByteRecords(colName: String) extends Transformer[Row, ByteRecord] {

  override def apply(prev: Iterator[Row]): Iterator[ByteRecord] = {
    prev.map(
      img => {
        ByteRecord(img.getAs[Array[Byte]](colName), -1.0f)
      }
    )
  }
}
Example 17
Source File: ImageNetInference.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object ImageNetInference {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger: Logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Test model on ImageNet2012 with Int8")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize)
        .toDistributed().data(train = false)

      val model = Module.loadModule[Float](param.model).quantize()
      model.evaluate()

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
}
Example 18
Source File: GenerateInt8Scales.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8 import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet import com.intel.analytics.bigdl.nn.{Graph, Module} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object GenerateInt8Scales { val logger: Logger = Logger.getLogger(getClass) Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def genereateInt8Scales(model: Graph[Float], modelName: String, evaluationSet: RDD[MiniBatch[Float]]): Unit = { model.evaluate() model.setInputDimMask(0, true) model.setOutputDimMask(0, true) model.setWeightDimMask(1, true) logger.info(s"Generate the scales for $modelName ...") val samples = evaluationSet .repartition(1) // repartition (shuffle) will have better accuracy .take(1) // only split one batch to sample .map(_.getInput().toTensor[Float]) samples.foreach { sample => model.forward(sample) model.calcScales(sample) } // we should clean the state, such as output model.clearState() logger.info(s"Generate the scales for $modelName done.") } def saveQuantizedModel(model: Graph[Float], modelName: String): Unit = { val suffix = ".bigdl" val prefix = modelName.stripSuffix(suffix) val name = prefix.concat(".quantized").concat(suffix) logger.info(s"Save the quantized model $name ...") // it will force overWrite the existed model file model.saveModule(name, overWrite = true) logger.info(s"Save the quantized model $name done.") } def main(args: Array[String]): Unit = { genInt8ScalesParser.parse(args, GenInt8ScalesParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Quantize the model") .set("spark.akka.frameSize", 64.toString) .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() val imageFrame = DataSet.SeqFileFolder.filesToImageFrame(param.folder, sc, 1000, partitionNum = Option(partitionNum)) // the transformer is the same as as that in validation during training val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize).toDistributed().data(train = false) // Currently, we only support the graph model, so we add a `toGraph` // if the model is already graph, you can need not to it. val model = Module.loadModule[Float](param.model).toGraph() genereateInt8Scales(model, param.model, evaluationSet) saveQuantizedModel(model, param.model) } } }
Example 19
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.{DataSet, SampleToBatch}
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val evaluationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) ->
        GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToSample() ->
        SampleToBatch(batchSize = param.batchSize, None, None, None, partitionNum = Some(1))

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet.toLocal(),
        Array(new Top1Accuracy[Float].asInstanceOf[ValidationMethod[Float]]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
    }
  }
}
Example 20
Source File: Predict.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.dataset.Sample
import com.intel.analytics.bigdl.optim.LocalPredictor
import org.apache.log4j.{Level, Logger}

import scala.collection.mutable.ArrayBuffer

object Predict {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val rawData = load(validationData, validationLabel)
      val iter = rawData.iterator
      val sampleIter = GreyImgToSample()(
        GreyImgNormalizer(trainMean, trainStd)(
          BytesToGreyImg(28, 28)(iter)))
      var samplesBuffer = ArrayBuffer[Sample[Float]]()
      while (sampleIter.hasNext) {
        val elem = sampleIter.next().clone()
        samplesBuffer += elem
      }
      val samples = samplesBuffer.toArray

      val model = Module.load[Float](param.model)
      val localPredictor = LocalPredictor(model)
      val result = localPredictor.predict(samples)
      val result_class = localPredictor.predictClass(samples)
      result_class.foreach(r => println(s"${r}"))
    }
  }
}
Example 21
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import com.intel.analytics.bigdl.models.lenet.LeNet5 import org.apache.log4j.{Level, Logger} object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", param.coreNumber.toString) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { LeNet5(classNum = 10) } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = ClassNLLCriterion[Float]()) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() }) } }
Example 22
Source File: Utils.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset

import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.Logger

object Utils {
  private val logger = Logger.getLogger(getClass)

  def getBatchSize(batchSize: Int, totalPartition: Option[Int] = None): Int = {
    val nodeNumber = Engine.nodeNumber()
    val partitionNum = totalPartition.getOrElse(nodeNumber)
    logger.debug(s"partition number: $partitionNum, node number: $nodeNumber")
    require(partitionNum > 0,
      s"Utils.getBatchSize: partitionNum should be larger than 0, but get $partitionNum")
    require(batchSize % partitionNum == 0,
      s"Utils.getBatchSize: total batch size $batchSize " +
        s"should be divided by partitionNum ${partitionNum}")
    val batchPerUnit = batchSize / partitionNum
    logger.debug(s"Batch per unit: $batchPerUnit")
    batchPerUnit
  }
}
Example 23
Source File: BGRImgToImageVector.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.image

import com.intel.analytics.bigdl.dataset.Transformer
import org.apache.log4j.Logger
import org.apache.spark.mllib.linalg.DenseVector

import scala.collection.Iterator

object BGRImgToImageVector {
  val logger = Logger.getLogger(getClass)

  def apply(): BGRImgToImageVector = {
    new BGRImgToImageVector()
  }
}

class BGRImgToImageVector() extends Transformer[LabeledBGRImage, DenseVector] {

  private var featureData: Array[Float] = null

  override def apply(prev: Iterator[LabeledBGRImage]): Iterator[DenseVector] = {
    prev.map(
      img => {
        if (null == featureData) {
          featureData = new Array[Float](3 * img.height() * img.width())
        }
        img.copyTo(featureData, 0, true)
        new DenseVector(featureData.map(_.toDouble))
      }
    )
  }
}
Example 24
Source File: LocalImageFiles.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.image

import java.awt.color.ColorSpace
import java.nio.file.{Files, Path}

import org.apache.log4j.Logger

object LocalImageFiles {
  Class.forName("javax.imageio.ImageIO")
  Class.forName("java.awt.color.ICC_ColorSpace")
  // Class.forName("sun.java2d.cmm.lcms.LCMS")
  ColorSpace.getInstance(ColorSpace.CS_sRGB).toRGB(Array[Float](0, 0, 0))

  val logger = Logger.getLogger(getClass)

  private[bigdl] def readPaths(path: Path, hasLabel: Boolean = true)
  : Array[LocalLabeledImagePath] = {
    if (hasLabel) readPathsWithLabel(path) else readPathsNoLabel(path)
  }
}
Example 25
Source File: FeatureTransformer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image

import com.intel.analytics.bigdl.dataset.{ChainedTransformer, Transformer}
import com.intel.analytics.bigdl.opencv.OpenCV
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger

class ChainedFeatureTransformer(first: FeatureTransformer, last: FeatureTransformer)
  extends FeatureTransformer {

  override def transform(prev: ImageFeature): ImageFeature = {
    last.transform(first.transform(prev))
  }

  override def enableIgnoreException(): this.type = {
    first.enableIgnoreException()
    last.enableIgnoreException()
    this
  }
}
Example 26
Source File: ChannelScaledNormalizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image.augmentation

import com.intel.analytics.bigdl.dataset.image.LabeledBGRImage
import com.intel.analytics.bigdl.dataset.{LocalDataSet, Transformer}
import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature}
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger

import scala.collection.Iterator

object ChannelScaledNormalizer {
  def apply(meanR: Int, meanG: Int, meanB: Int, scale: Double): ChannelScaledNormalizer = {
    new ChannelScaledNormalizer(meanR, meanG, meanB, scale)
  }
}

class ChannelScaledNormalizer(meanR: Int, meanG: Int, meanB: Int, scale: Double)
  extends FeatureTransformer {

  override protected def transformMat(feature: ImageFeature): Unit = {
    val mat = feature.opencvMat()
    val toFloats = OpenCVMat.toFloatPixels(mat)
    val content = toFloats._1
    require(content.length % 3 == 0, "Content should be multiple of 3 channels")
    var i = 0
    val frameLength = content.length / 3
    val height = toFloats._2
    val width = toFloats._3
    val bufferContent = new Array[Float](width * height * 3)

    val channels = 3
    val mean = Array(meanR, meanG, meanB)
    var c = 0
    while (c < channels) {
      i = 0
      while (i < frameLength) {
        val data_index = c * frameLength + i
        bufferContent(data_index) = ((content(data_index) - mean(c)) * scale).toFloat
        i += 1
      }
      c += 1
    }
    if (mat != null) {
      mat.release()
    }
    val newMat = OpenCVMat.fromFloats(bufferContent, height, width)
    feature(ImageFeature.mat) = newMat
  }
}
Example 27
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.Module
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.models.resnet.Utils._
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod, ValidationResult}
import com.intel.analytics.bigdl.dataset.image.{BGRImgNormalizer, BGRImgToSample, BytesToBGRImg}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test ResNet on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() ->
        BGRImgNormalizer(Cifar10DataSet.trainMean, Cifar10DataSet.trainStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      println(model)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
}
Example 28
Source File: TestImageNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.CropCenter
import com.intel.analytics.bigdl.models.resnet.ResNet.DatasetType
import com.intel.analytics.bigdl.nn.{Module, StaticGraph}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.transform.vision.image.{ImageFeature, MTImageFeatureToBatch, MatToTensor, PixelBytesToMat}
import com.intel.analytics.bigdl.transform.vision.image.augmentation.{ChannelScaledNormalizer, RandomCropper, RandomResize}
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object TestImageNet {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Test model on ImageNet2012")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val model = Module.loadModule[Float](param.model)
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize)
        .toDistributed().data(train = false)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
}
Example 29
Source File: TrainCIFAR10.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Module} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.models.resnet.ResNet.{DatasetType, ShortcutType} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ object TrainCIFAR10 { LoggerFilter.redirectSparkInfoLogs() import Utils._ def cifar10Decay(epoch: Int): Double = if (epoch >= 122) 2.0 else if (epoch >= 81) 1.0 else 0.0 def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train ResNet on Cifar10") // Will throw exception without this config when has only one executor .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val batchSize = param.batchSize val (imageSize, lrSchedule, maxEpoch, dataSet) = (32, DatasetType.CIFAR10, param.nepochs, Cifar10DataSet) val trainDataSet = dataSet.trainDataSet(param.folder, sc, imageSize, batchSize) val validateSet = dataSet.valDataSet(param.folder, sc, imageSize, batchSize) val shortcut: ShortcutType = param.shortcutType match { case "A" => ShortcutType.A case "B" => ShortcutType.B case _ => ShortcutType.C } val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { val curModel = if (param.graphModel) { ResNet.graph(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } else { ResNet(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } if (param.optnet) { ResNet.shareGradInput(curModel) } ResNet.modelInit(curModel) curModel } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0, weightDecay = param.weightDecay, momentum = param.momentum, dampening = param.dampening, nesterov = param.nesterov, learningRateSchedule = SGD.EpochDecay(cifar10Decay)) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new CrossEntropyCriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float])) .setEndWhen(Trigger.maxEpoch(maxEpoch)) .optimize() sc.stop() }) } }
Example 30
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.inception

import com.intel.analytics.bigdl.dataset.{ByteRecord, DataSet}
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Top5Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.hadoop.io.Text
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Options._

  val imageSize = 224

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val batchSize = param.batchSize.getOrElse(128)
      val conf = Engine.createSparkConf().setAppName("Test Inception on ImageNet")
      val sc = new SparkContext(conf)
      Engine.init

      // We set partition number to be node*core, actually you can also assign other partitionNum
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rawData = sc.sequenceFile(param.folder, classOf[Text], classOf[Text], partitionNum)
        .map(image => {
          ByteRecord(image._2.copyBytes(), DataSet.SeqFileFolder.readLabel(image._1).toFloat)
        }).coalesce(partitionNum, true)

      val rddData = DataSet.SeqFileFolder.filesToRdd(param.folder, sc, 1000)
      val transformer = BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize, CropCenter) ->
        HFlip(0.5) -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]), param.batchSize)
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
}
Example 31
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.Top1Accuracy
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Lenet on MNIST")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(load(validationData, validationLabel), partitionNum)
      val transformer = BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
}
Example 32
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, CrossEntropyCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils._ import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf() .setAppName("Train Lenet on MNIST") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) { LeNet5.graph(classNum = 10) } else { Engine.getEngineType() match { case MklBlas => LeNet5(10) case MklDnn => LeNet5.dnnGraph(param.batchSize / Engine.nodeNumber(), 10) } } } val criterion = Engine.getEngineType() match { case MklBlas => ClassNLLCriterion() case MklDnn => CrossEntropyCriterion() } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = criterion) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 33
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.autoencoder import java.nio.file.Paths import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch, Transformer} import com.intel.analytics.bigdl.nn.{MSECriterion, Module} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.utils.{Engine, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import scala.reflect.ClassTag object toAutoencoderBatch { def apply(): toAutoencoderBatch[Float] = new toAutoencoderBatch[Float]() } class toAutoencoderBatch[T: ClassTag](implicit ev: TensorNumeric[T] )extends Transformer[MiniBatch[T], MiniBatch[T]] { override def apply(prev: Iterator[MiniBatch[T]]): Iterator[MiniBatch[T]] = { prev.map(batch => { MiniBatch(batch.getInput().toTensor[T], batch.getInput().toTensor[T]) }) } } object Train { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train Autoencoder on MNIST") val sc = new SparkContext(conf) Engine.init val trainData = Paths.get(param.folder, "/train-images-idx3-ubyte") val trainLabel = Paths.get(param.folder, "/train-labels-idx1-ubyte") val trainDataSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(param.batchSize) -> toAutoencoderBatch() val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) Autoencoder.graph(classNum = 32) else Autoencoder(classNum = 32) } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new Adagrad[Float](learningRate = 0.01, learningRateDecay = 0.0, weightDecay = 0.0005) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new MSECriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 34
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.{Top1Accuracy, Validator} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]) { testParser.parse(args, new TestParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Test Vgg on Cifar10") .set("spark.akka.frameSize", 64.toString) val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rddData = sc.parallelize(Utils.loadTest(param.folder), partitionNum) val transformer = BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(param.batchSize)) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 35
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg

import java.text.SimpleDateFormat
import java.util.Date

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()

  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Vgg on Cifar10")
        // Will throw an exception without this config when there is only one executor
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val trainDataSet = DataSet.array(Utils.loadTrain(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(trainMean, trainStd) ->
        BGRImgToBatch(param.batchSize)

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) VggForCifar10.graph(classNum = 10)
        else VggForCifar10(classNum = 10)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = 0.9, dampening = 0.0, nesterov = false,
          learningRateSchedule = SGD.EpochStep(25, 0.5))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new ClassNLLCriterion[Float]()
      )

      val validateSet = DataSet.array(Utils.loadTest(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) ->
        BGRImgToBatch(param.batchSize)

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      if (param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      if (param.summaryPath.isDefined) {
        val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val timeStamp = sdf.format(new Date())
        val trainSummary = new TrainSummary(param.summaryPath.get,
          s"vgg-on-cifar10-train-$timeStamp")
        optimizer.setTrainSummary(trainSummary)
        val validationSummary = new ValidationSummary(param.summaryPath.get,
          s"vgg-on-cifar10-val-$timeStamp")
        optimizer.setValidationSummary(validationSummary)
      }

      optimizer
        .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
}
Example 36
Source File: Validator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.utils.Engine import com.intel.analytics.bigdl.dataset.{DistributedDataSet, LocalDataSet, MiniBatch} import org.apache.log4j.Logger abstract class Validator[T, D]( model: Module[T], dataSet: DataSet[D] ) { def test(vMethods: Array[ValidationMethod[T]]): Array[(ValidationResult, ValidationMethod[T])] } @deprecated( "Validator(model, dataset) is deprecated. Please use model.evaluate instead", "0.2.0") object Validator { private val logger = Logger.getLogger(getClass) def apply[T, D](model: Module[T], dataset: DataSet[D]): Validator[T, D] = { logger.warn("Validator(model, dataset) is deprecated. Please use model.evaluate instead") dataset match { case d: DistributedDataSet[_] => new DistriValidator[T]( model = model, dataSet = d.asInstanceOf[DistributedDataSet[MiniBatch[T]]] ).asInstanceOf[Validator[T, D]] case d: LocalDataSet[_] => new LocalValidator[T]( model = model, dataSet = d.asInstanceOf[LocalDataSet[MiniBatch[T]]] ).asInstanceOf[Validator[T, D]] case _ => throw new UnsupportedOperationException } } }
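Validator obtains its logger with Logger.getLogger(getClass) and pairs the @deprecated annotation with a runtime logger.warn, so code that was compiled against the old API still surfaces the migration hint in its logs. A minimal sketch of that pairing, with hypothetical names:

import org.apache.log4j.Logger

object LegacyApi {
  private val logger = Logger.getLogger(getClass)

  @deprecated("Use NewApi.run instead", "0.2.0")
  def run(): Unit = {
    // Compile-time warning for new callers, log-time warning for code already deployed.
    logger.warn("LegacyApi.run is deprecated. Please use NewApi.run instead")
  }
}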
Example 37
Source File: DistriValidator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch}
import com.intel.analytics.bigdl.optim.DistriValidator._
import com.intel.analytics.bigdl.utils.{Engine, MklBlas}
import org.apache.log4j.Logger

object DistriValidator {
  val logger = Logger.getLogger(this.getClass)
}

// Class header restored from the Validator factory above, mirroring LocalValidator.
class DistriValidator[T] private[optim](
  model: Module[T],
  dataSet: DistributedDataSet[MiniBatch[T]])
  extends Validator[T, MiniBatch[T]](model, dataSet) {

  override def test(vMethods: Array[ValidationMethod[T]])
  : Array[(ValidationResult, ValidationMethod[T])] = {
    val rdd = dataSet.data(train = false)
    val broadcastModel = rdd.sparkContext.broadcast((model.evaluate(), vMethods))
    val _subModelNumber = Engine.getEngineType match {
      case MklBlas => Engine.coreNumber()
      case _ => throw new IllegalArgumentException
    }
    val nExecutor = Engine.nodeNumber()
    val executorCores = Engine.coreNumber()
    rdd.mapPartitions(dataIter => {
      Engine.setNodeAndCore(nExecutor, executorCores)
      val localModel = broadcastModel.value._1
      val localMethod = broadcastModel.value._2
      logger.info("model thread pool size is " + Engine.model.getPoolSize)
      val workingModels = (1 to _subModelNumber)
        .map(_ => localModel.cloneModule().evaluate()).toArray
      val vMethodsArr = (1 to _subModelNumber).map(i => localMethod.map(_.clone())).toArray
      dataIter.map(batch => {
        val stackSize = batch.size() / _subModelNumber
        val extraSize = batch.size() % _subModelNumber
        val parallelism = if (stackSize == 0) extraSize else _subModelNumber
        Engine.default.invokeAndWait(
          (0 until parallelism).map(b =>
            () => {
              val offset = b * stackSize + math.min(b, extraSize) + 1
              val length = stackSize + (if (b < extraSize) 1 else 0)
              val currentMiniBatch = batch.slice(offset, length)
              val input = currentMiniBatch.getInput()
              val target = currentMiniBatch.getTarget()
              val output = workingModels(b).forward(input)
              val validatMethods = vMethodsArr(b)
              validatMethods.map(validation => {
                validation(output, target)
              })
            }
          )
        ).reduce((left, right) => {
          left.zip(right).map { case (l, r) => l + r }
        })
      })
    }).reduce((left, right) => {
      left.zip(right).map { case (l, r) => l + r }
    }).zip(vMethods)
  }
}
Example 38
Source File: LocalValidator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{LocalDataSet, MiniBatch} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, MklBlas} import org.apache.log4j.Logger object LocalValidator { val logger = Logger.getLogger(getClass) } class LocalValidator[T] private[optim](model: Module[T], dataSet: LocalDataSet[MiniBatch[T]]) extends Validator[T, MiniBatch[T]](model, dataSet) { val logger = LocalValidator.logger private val coreNumber = Engine.coreNumber() private val subModelNumber = Engine.getEngineType match { case MklBlas => coreNumber case _ => throw new IllegalArgumentException } private val workingModels = (1 to subModelNumber).map(_ => model.cloneModule().evaluate()).toArray override def test(vMethods: Array[ValidationMethod[T]]) : Array[(ValidationResult, ValidationMethod[T])] = { val dataIter = dataSet.data (train = false) var count = 0 val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray logger.info("model thread pool size is " + Engine.model.getPoolSize) dataIter.map(batch => { val stackSize = batch.size() / subModelNumber val extraSize = batch.size() % subModelNumber val parallelism = if (stackSize == 0) extraSize else subModelNumber val start = System.nanoTime() val result = Engine.default.invokeAndWait( (0 until parallelism).map(b => () => { val offset = b * stackSize + math.min(b, extraSize) + 1 val length = stackSize + (if (b < extraSize) 1 else 0) val currentMiniBatch = batch.slice(offset, length) val input = currentMiniBatch.getInput() val target = currentMiniBatch.getTarget() val output = workingModels(b).forward(input) val validatMethods = vMethodsArr(b) validatMethods.map(validation => { validation(output.asInstanceOf[Tensor[T]], target) }) } ) ).reduce((left, right) => { left.zip(right).map { case (l, r) => l + r } }) count += batch.size() logger.info(s"[Validation] $count/${dataSet.size()} Throughput is ${ batch.size() / ((System.nanoTime() - start) / 1e9) } record / sec") result }).reduce((left, right) => { left.zip(right).map { case (l, r) => l + r } }).zip(vMethods) } }
Example 39
Source File: BigDLSpecHelper.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils import java.io.{File => JFile} import org.apache.log4j.Logger import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer abstract class BigDLSpecHelper extends FlatSpec with Matchers with BeforeAndAfter { protected val logger = Logger.getLogger(getClass) private val tmpFiles : ArrayBuffer[JFile] = new ArrayBuffer[JFile]() protected def createTmpFile(): JFile = { val file = java.io.File.createTempFile("UnitTest", "BigDLSpecBase") logger.info(s"created file $file") tmpFiles.append(file) file } protected def getFileFolder(path: String): String = { path.substring(0, path.lastIndexOf(JFile.separator)) } protected def getFileName(path: String): String = { path.substring(path.lastIndexOf(JFile.separator) + 1) } def doAfter(): Unit = {} def doBefore(): Unit = {} before { doBefore() } after { doAfter() tmpFiles.foreach(f => { if (f.exists()) { require(f.isFile, "cannot clean folder") f.delete() logger.info(s"deleted file $f") } }) } }
Example 40
Source File: RefLocalOptimizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.DataSet import com.intel.analytics.bigdl.dataset.MiniBatch import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.optim.DistriOptimizer.getClass import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import org.apache.log4j.Logger import scala.reflect.ClassTag class RefLocalOptimizer[T: ClassTag]( model: Module[T], dataset: DataSet[MiniBatch[T]], criterion: Criterion[T] )(implicit ev: TensorNumeric[T]) extends Optimizer[T, MiniBatch[T]](model, dataset, criterion) { val logger: Logger = Logger.getLogger(getClass) val (w, g) = model.getParameters() override def optimize(): Module[T] = { val data = dataset.toLocal().data(train = true) var count = 0 state("epoch") = state.get[Int]("epoch").getOrElse(1) state("neval") = state.get[Int]("neval").getOrElse(1) while (!endWhen(state)) { val batch = data.next() val input = batch.getInput val target = batch.getTarget model.training() model.zeroGradParameters() val output = model.forward(input).asInstanceOf[Tensor[T]] val loss = criterion.forward(output, target) model.backward(input, criterion.backward(output, target)) optimMethods.head._2.optimize(_ => (loss, g), w, state) count += batch.size() state("neval") = state[Int]("neval") + 1 logger.info(s"loss is $loss") if (count >= dataset.size()) { state("epoch") = state[Int]("epoch") + 1 count = 0 } } model } }
Example 41
Source File: ParallelOptimizerSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, MSECriterion} import com.intel.analytics.bigdl.optim.DistriOptimizerSpecModel.mse import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, T} import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} @com.intel.analytics.bigdl.tags.Serial class ParallelOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) private var sc: SparkContext = _ before { val conf = Engine.createSparkConf() .setMaster("local[1]").setAppName("ParallelOptimizerSpec") sc = new SparkContext(conf) Engine.init Engine.setCoreNumber(1) } after { if (sc != null) { sc.stop() } } "Train with parallel" should "work properly" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model = Linear[Float](10, 2) model.getParameters()._1.fill(1.0f) val optimMethod = new SGD[Float]() val dataSet = DataSet.array(Array(miniBatch), sc) val optimizer = new DistriOptimizer[Float](model, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) optimizer.optimize() } "Train with parallel" should "have same results as DistriOptimizer" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model1 = Linear[Float](10, 2) model1.getParameters()._1.fill(1.0f) val model2 = Linear[Float](10, 2) model2.getParameters()._1.fill(1.0f) val dataSet = DataSet.array(Array(miniBatch), sc) val parallelOptimizer = new DistriOptimizer[Float](model1, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) parallelOptimizer.optimize val distriOptimizer = new DistriOptimizer[Float](model2, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) distriOptimizer.optimize model1.getParameters()._1 should be (model2.getParameters()._1) } }
Example 42
Source File: SFObjectWriter.scala From spark-salesforce with Apache License 2.0 | 5 votes |
package com.springml.spark.salesforce import org.apache.log4j.Logger import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SaveMode} import com.springml.salesforce.wave.api.APIFactory import com.springml.salesforce.wave.api.BulkAPI import com.springml.salesforce.wave.util.WaveAPIConstants import com.springml.salesforce.wave.model.JobInfo class SFObjectWriter ( val username: String, val password: String, val login: String, val version: String, val sfObject: String, val mode: SaveMode, val upsert: Boolean, val externalIdFieldName: String, val csvHeader: String ) extends Serializable { @transient val logger = Logger.getLogger(classOf[SFObjectWriter]) def writeData(rdd: RDD[Row]): Boolean = { val csvRDD = rdd.map(row => row.toSeq.map(value => Utils.rowValue(value)).mkString(",")) val jobInfo = new JobInfo(WaveAPIConstants.STR_CSV, sfObject, operation(mode, upsert)) jobInfo.setExternalIdFieldName(externalIdFieldName) val jobId = bulkAPI.createJob(jobInfo).getId csvRDD.mapPartitionsWithIndex { case (index, iterator) => { val records = iterator.toArray.mkString("\n") var batchInfoId : String = null if (records != null && !records.isEmpty()) { val data = csvHeader + "\n" + records val batchInfo = bulkAPI.addBatch(jobId, data) batchInfoId = batchInfo.getId } val success = (batchInfoId != null) // Job status will be checked after completing all batches List(success).iterator } }.reduce((a, b) => a & b) bulkAPI.closeJob(jobId) var i = 1 while (i < 999999) { if (bulkAPI.isCompleted(jobId)) { logger.info("Job completed") return true } logger.info("Job not completed, waiting...") Thread.sleep(200) i = i + 1 } print("Returning false...") logger.info("Job not completed. Timeout..." ) false } // Create new instance of BulkAPI every time because Spark workers cannot serialize the object private def bulkAPI(): BulkAPI = { APIFactory.getInstance().bulkAPI(username, password, login, version) } private def operation(mode: SaveMode, upsert: Boolean): String = { if (upsert) { "upsert" } else if (mode != null && SaveMode.Overwrite.name().equalsIgnoreCase(mode.name())) { WaveAPIConstants.STR_UPDATE } else if (mode != null && SaveMode.Append.name().equalsIgnoreCase(mode.name())) { WaveAPIConstants.STR_INSERT } else { logger.warn("SaveMode " + mode + " Not supported. Using 'insert' operation") WaveAPIConstants.STR_INSERT } } }
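SFObjectWriter marks its logger @transient because log4j Loggers are not serializable and the writer is shipped to Spark executors. A plain @transient val comes back null after deserialization, so a common variant, shown below as an assumption rather than what spark-salesforce necessarily does, is a @transient lazy val that rebuilds the logger on first use on each executor.

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD

// Sketch: a serializable helper whose logger is recreated lazily on each executor.
class PartitionLogger extends Serializable {
  @transient lazy val logger: Logger = Logger.getLogger(classOf[PartitionLogger])

  def logPartitionSizes(rdd: RDD[String]): Unit =
    rdd.foreachPartition { it =>
      // Accessing the lazy val here re-initialises the logger instead of deserializing it.
      logger.info(s"partition holds ${it.size} records")
    }
}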
Example 43
Source File: TopWORDSApp.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession object TopWORDSApp extends Serializable { @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString) def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().getOrCreate() try { TopWORDSParser.parse(args).foreach { args => // remove output location files if exist val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true) // read input corpus val corpus = if (args.numPartitions > 0) spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions) else spark.sparkContext.textFile(args.inputLoc) LOGGER.info("Number of lines of input corpus: " + corpus.count()) // run TopWORDS with the parsed arguments new TopWORDS( tauL = args.tauL, tauF = args.tauF, textLenThld = args.textLenThld, useProbThld = args.useProbThld, numIterations = args.numIterations, convergeTol = args.convergeTol, wordBoundaryThld = args.wordBoundaryThld) .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts") } //exit normally LOGGER.info("Running TopWORDS successfully!") if (spark.sparkContext.master.contains("local")) sys.exit(0) } catch { case ex: Throwable => LOGGER.error("Running TopWORDS fail!", ex) //signal to external process if (spark.sparkContext.master.contains("local")) sys.exit(1) } finally spark.stop() } }
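TopWORDSApp passes the caught Throwable as the second argument to LOGGER.error, which makes log4j append the full stack trace to the log entry rather than just the message. A minimal stand-alone illustration with hypothetical names:

import org.apache.log4j.Logger

object ErrorLoggingSketch {
  private val logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    try {
      throw new IllegalStateException("boom")
    } catch {
      // The (message, throwable) overload also logs the exception's stack trace.
      case ex: Throwable => logger.error("Job failed!", ex)
    }
  }
}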
Example 44
Source File: SparkFunSuite.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package org.apache.spark

// scalastyle:off
import java.io.File

import scala.annotation.tailrec

import org.apache.log4j.{Appender, Level, Logger}
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.Tests.IS_TESTING
import org.apache.spark.util.{AccumulatorContext, Utils}

// The class header below is a minimal reconstruction (the extract omits it); the original
// base suite mixes in additional traits beyond the ones shown here.
abstract class SparkFunSuite extends FunSuite with BeforeAndAfterAll with Logging {

  protected def withLogAppender(
      appender: Appender,
      loggerName: Option[String] = None,
      level: Option[Level] = None)(
      f: => Unit): Unit = {
    val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger)
    val restoreLevel = logger.getLevel
    logger.addAppender(appender)
    if (level.isDefined) {
      logger.setLevel(level.get)
    }
    try f finally {
      logger.removeAppender(appender)
      if (level.isDefined) {
        logger.setLevel(restoreLevel)
      }
    }
  }
}
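withLogAppender temporarily attaches an appender (and optionally a level) to a logger, runs the block, and then restores the previous state, which lets a test assert on what was logged. A usage sketch of the same idea with a WriterAppender capturing output in memory; the logger name and messages are arbitrary:

import java.io.StringWriter

import org.apache.log4j.{Level, Logger, SimpleLayout, WriterAppender}

object LogCaptureSketch extends App {
  val buffer = new StringWriter()
  val appender = new WriterAppender(new SimpleLayout(), buffer)
  val logger = Logger.getLogger("my.suite.logger")
  val restoreLevel = logger.getLevel

  logger.addAppender(appender)
  logger.setLevel(Level.WARN)
  try {
    logger.warn("something worth asserting on")
    assert(buffer.toString.contains("something worth asserting on"))
  } finally {
    logger.removeAppender(appender)
    logger.setLevel(restoreLevel)
  }
}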
Example 45
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // The K-means cost: the sum of squared distances of points to their nearest center
    val cost = model.computeCost(examples)

    // Get the cluster centers (k of them)
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")
    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 46
Source File: NonCommonWordSelector.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.spot

import java.io._

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger
import org.dbpedia.spotlight.io.WortschatzParser
import org.dbpedia.spotlight.model.SurfaceFormOccurrence

import scala.collection.JavaConversions._

// The class header and the isCommonWord implementation are omitted from this extract;
// the reconstruction below is a minimal assumption built on the imported WortschatzParser.
class NonCommonWordSelector(val filename: String) {

  private val LOG = Logger.getLogger(this.getClass)

  // Assumption: the common-word set is loaded from a Wortschatz word-count file.
  private val commonWords: CompactHashSet[String] = WortschatzParser.parse(filename)

  def isCommonWord(occ: SurfaceFormOccurrence): Boolean =
    commonWords.contains(occ.surfaceForm.name)

  def select(occs: java.util.List[SurfaceFormOccurrence]): java.util.List[SurfaceFormOccurrence] = {
    occs.filter(o => !isCommonWord(o))
  }
}

object NonCommonWordSelector {

  def main(args: Array[String]) {
    def usage = println(" Usage: scala -cp $CP NonCommonWordSelector words.txt ")

    args(0) match {
      case file: String => {
        new NonCommonWordSelector(file)
      }
      case _ => usage
    }
  }
}
Example 47
Source File: SparkSqlUtils.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.utils

import java.io.{BufferedInputStream, BufferedReader, FileInputStream, InputStreamReader}
import java.nio.file.Path

import com.google.common.io.Resources
import org.apache.log4j.{Level, Logger}
import org.apache.mesos.Protos.Resource
import org.apache.spark.sql.SparkSession

import scala.io.Source

// The enclosing object and the getPathByName helper are omitted from this extract;
// both are reconstructed minimally here. Resolving the path from the classpath via
// Guava's Resources is an assumption.
object SparkSqlUtils {

  def getPathByName(name: String): String = {
    Resources.getResource(name).getPath
  }

  def readSqlByPath(sqlPath: String): String = {
    val buf = new StringBuilder
    val path = this.getPathByName(sqlPath)
    val file = Source.fromFile(path)
    for (line <- file.getLines) {
      buf ++= line + "\n"
    }
    file.close
    buf.toString()
  }
}
Example 48
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter import com.amazonaws.services.kinesis.model.Record import com.google.gson.Gson import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} object Trials extends App { import org.apache.log4j.{Level, Logger} Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) //session setup System.setProperty("hadoop.home.dir", "C:\\winutils") val sparkSession = SparkSession.builder() .master("local[*]") .appName("test") .getOrCreate() val sc = sparkSession.sparkContext val ssc = new StreamingContext(sc, Seconds(10)) val sqlContext = sparkSession.sqlContext //creates an array of strings from raw byte array def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",") //converts records to map of key value pair and then json def recordHandler = (record: Record) => { val gson = new Gson val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage) gson.toJson(map) } case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String) val stream_cab = KinesisInputDStream.builder .streamingContext(ssc) .streamName("cab_rides") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) val stream_weather = KinesisInputDStream.builder .streamingContext(ssc) .streamName("weather") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) //creating dataframe, can be stored as temp view val cabSchema = Encoders.product[CabPrice].schema stream_cab.foreachRDD(rdd => { import sqlContext.implicits._ //val xx: Dataset[String] = rdd.toDS() val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS()) df.show() }) ssc.start() ssc.awaitTermination() }
Example 49
Source File: MetricsTest.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.metrics import org.apache.log4j.Logger import org.scalatest.FunSuite class MetricsTest extends FunSuite { implicit val logger = Logger.getLogger(this.getClass) test("Write a String metric") { Metrics.gauge("foo", "bar") } test("Write a Long metric") { Metrics.gauge("long_measure_i", 10) } test("Time a function") { val result = Metrics.time("timed_function") { Thread.sleep(10) "ok" } assertResult("ok")(result) } test("Write an Int gauge") { Metrics.gauge("int_measure_l", 10L) } test("Write an Double gauge") { Metrics.gauge("double_measure_l", 10.0F) } test("Write an Float gauge") { Metrics.gauge("float_measure_l", 10.0D) } }
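MetricsTest declares its logger as an implicit val, which suggests the Metrics helpers take the Logger as an implicit parameter so call sites stay terse. The SimpleMetrics object below sketches that shape; it is an assumption, not pulse's actual Metrics implementation.

import org.apache.log4j.Logger

// Hypothetical metrics facade that picks up the caller's logger implicitly.
object SimpleMetrics {
  def gauge(name: String, value: Any)(implicit logger: Logger): Unit =
    logger.info(s"metric $name=$value")

  def time[T](name: String)(block: => T)(implicit logger: Logger): T = {
    val start = System.nanoTime()
    try block
    finally logger.info(s"metric $name took ${(System.nanoTime() - start) / 1e6} ms")
  }
}

// Usage, mirroring the test above:
//   implicit val logger = Logger.getLogger(this.getClass)
//   SimpleMetrics.gauge("foo", "bar")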
Example 50
Source File: StandaloneAppExample.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.example import javax.naming.NamingException import org.apache.log4j.{ Logger, MDC, NDC } object StandaloneAppExample { private val log = Logger.getLogger(this.getClass) def main(args: Array[String]): Unit = { MDC.put("hostname", java.net.InetAddress.getLocalHost().getHostName()) NDC.push("ndc message") val numEvents = args(0).toInt val sleepMillis = args(1).toInt 0 to numEvents map { num => val uuid = java.util.UUID.randomUUID.toString() log.info(s"info message $uuid") Thread.sleep(sleepMillis) if (num % 300 == 0) { log.error(s"error happened $uuid", new Exception()) } } // Throw an exception so when we turn on `log4j.debug=true` we can see where events stop posting to the log collector // it's a <code>NamingException</code> so we can catch it in the test for this class throw new NamingException("exiting") } }
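The MDC.put and NDC.push calls above only show up in the output if the active layout references them: with log4j 1.x PatternLayout, %X{key} prints an MDC entry and %x prints the NDC stack. The appender and conversion pattern below are illustrative assumptions, not taken from this project's log4j configuration.

import org.apache.log4j.{ConsoleAppender, Logger, MDC, NDC, PatternLayout}

object MdcLayoutSketch extends App {
  MDC.put("hostname", java.net.InetAddress.getLocalHost.getHostName)
  NDC.push("ndc message")

  // %X{hostname} pulls the MDC value, %x prints the NDC stack.
  val layout = new PatternLayout("%d{ISO8601} [%X{hostname}] %x %-5p %c - %m%n")
  Logger.getRootLogger.addAppender(new ConsoleAppender(layout))

  Logger.getLogger(getClass).info("message with diagnostic context")
}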
Example 51
Source File: SocialGraphJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.social import com.github.graphx.pregel.social.SocialGraph import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object SocialGraphJob { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "GraphX") val graph = new SocialGraph(sc) println("Top 10 most-connected users:") graph.getMostConnectedUsers(10) foreach println println("Computing degrees of separation for user Arch") graph.degreeOfSeparationSingleUser(5306) foreach println println("Computing degrees of separation for user Arch and Fred") graph.degreeOfSeparationTwoUser(5306, 14) foreach println println("Connected component") graph.connectedComponentGroupedByUsers .sortBy ( {case (_, lowestVertexId) => lowestVertexId}, ascending = false).take(10) foreach println sc.stop() } }
Example 52
Source File: ShortestPathProblemJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.ssp import com.github.graphx.pregel.ssp.ShortestPathProblem import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.graphx.VertexId object ShortestPathProblemJob extends App { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "ShortestPathProblemDemo") val ssp = new ShortestPathProblem(sc) val sourceIdForTest: VertexId = 3 val sourceIdForRandom: VertexId = 75 val testGraph = ssp.testGraph val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest) println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" + s"Distances on the test graph $resultOnTestGraph\n") val randomGraph = ssp.randomGraph val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom) println(s"Distances on the random graph $resultOnRandomGraph\n") }
Example 53
Source File: SocialPageRankJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pagerank

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexRDD

object SocialPageRankJob {

  // The ranks helper was omitted from this extract; dynamic PageRank, run until the
  // per-vertex change falls below the given tolerance, is the assumed implementation.
  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tolerance).vertices

  def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.staticPageRank(numIter = 20).vertices

  def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = {
    socialGraph.verts.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy({ case (_, rank) => rank }, ascending = false).take(10)
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "PageRank")
    val socialGraph: SocialGraph = new SocialGraph(sc)

    val TOLERANCE: Double = 0.0001

    import scala.compat.Platform.{EOL => D}
    val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D)
    val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D)

    println(s"Top 10 users in the network, ranked dynamically until convergence $TOLERANCE - $D $topUsersDynamically")
    println(s"Top 10 users in the network, ranked iteratively - $D $topUsersIterative")

    sc.stop()
  }
}
Example 54
Source File: CheckDirEndPointImpl.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import com.google.protobuf.{RpcCallback, RpcController, Service}
import org.apache.hadoop.hbase.coprocessor._
import org.apache.hadoop.hbase.{Coprocessor, CoprocessorEnvironment}
import org.apache.log4j.Logger

class CheckDirEndPointImpl
  extends CheckDirProtos.CheckDirService with Coprocessor with CoprocessorService {

  private lazy val logger = Logger.getLogger(getClass.getName)
  private var env: RegionCoprocessorEnvironment = null

  override def start(env: CoprocessorEnvironment) = {
    env match {
      case e: RegionCoprocessorEnvironment => this.env = e
      case _ => throw new CoprocessorException("Must be loaded on a table region!")
    }
  }

  override def stop(env: CoprocessorEnvironment) = {}

  override def getService: Service = this

  override def getCheckResult(controller: RpcController,
                              request: CheckDirProtos.CheckRequest,
                              done: RpcCallback[CheckDirProtos.CheckResponse]) = {
    val isDir = new java.io.File(".").isDirectory
    if (!isDir) {
      logger.warn(
        """Current directory is not accessible,
          |please add 'cd ~' before starting the regionserver in your regionserver start script.
        """.stripMargin)
    }
    val response: CheckDirProtos.CheckResponse = {
      CheckDirProtos.CheckResponse.newBuilder().setAccessible(isDir).build()
    }
    done.run(response)
  }
}
Example 55
Source File: AkkaUtils.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn.util

import akka.actor.{ActorSystem, ExtendedActorSystem}
import com.typesafe.config.ConfigFactory
import org.apache.log4j.{Level, Logger}
import org.tianlangstudio.data.hamal.core.{Constants, HamalConf}

// The enclosing object is omitted from this extract and is reconstructed here; the
// maximum frame size constant is assumed to follow Akka's Int.MaxValue byte limit.
object AkkaUtils {

  private val AKKA_MAX_FRAME_SIZE_IN_MB = Int.MaxValue / 1024 / 1024

  def maxFrameSizeBytes(conf: HamalConf): Int = {
    val frameSizeInMB = conf.getInt("datax.akka.frameSize", 128)
    if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) {
      throw new IllegalArgumentException(
        s"datax.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB")
    }
    frameSizeInMB * 1024 * 1024
  }

  def protocol(actorSystem: ActorSystem): String = {
    val akkaConf = actorSystem.settings.config
    val sslProp = "akka.remote.netty.tcp.enable-ssl"
    protocol(akkaConf.hasPath(sslProp) && akkaConf.getBoolean(sslProp))
  }

  def protocol(ssl: Boolean = false): String = {
    if (ssl) {
      "akka.ssl.tcp"
    } else {
      "akka.tcp"
    }
  }

  def address(
      protocol: String,
      systemName: String,
      host: String,
      port: Int,
      actorName: String): String = {
    address(protocol, systemName, s"$host:$port", actorName)
  }

  def address(
      protocol: String,
      systemName: String,
      hostPort: String,
      actorName: String): String = {
    s"$protocol://$systemName@$hostPort/user/$actorName"
  }
}
Example 56
Source File: ModelSerialization.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.features.FeatureEngineering import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} object ModelSerialization { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) samples.printSchema() samples.show(5, false) //model training println("Neural Network Ctr Prediction Model:") val innModel = new InnerProductNNCtrModel() innModel.train(samples) val transformedData = innModel.transform(samples) transformedData.show(1,false) //model serialization by mleap val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer() mleapModelSerializer.serializeModel(innModel._pipelineModel, "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData) //model serialization by JPMML val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer() jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData) } }
Example 57
Source File: ModelSelection.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.evaluation.Evaluator import com.ggstar.features.FeatureEngineering import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.apache.log4j.{Level, Logger} object ModelSelection { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) rawSamples.printSchema() rawSamples.show(10) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) //split samples into training samples and validation samples val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3)) val evaluator = new Evaluator } }
Example 58
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
      (implicit recOpts: RecommendLogOptions): Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))
  }
}
// scalastyle:on println
Example 59
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.reduce((x, y) => x + y)}""") } } // scalastyle:on println
Example 60
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object StatsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("StatsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array.range(1, 11)) val stats = nums.stats() println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""count: ${stats.count}""") println(s"""mean: ${stats.mean}""") println(s"""stdev: ${stats.stdev}""") } } // scalastyle:on println
Example 61
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""") } } // scalastyle:on println
Example 62
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object OrderExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""top3: ${nums.top(3).mkString(", ")}""")
    println(s"""takeOrdered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}
// scalastyle:on println
Example 63
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  private[basic_action] def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""avg: $avg""")
  }
}
// scalastyle:on println
Example 64
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CollectAsMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CollectAsMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1) ), 3 ) val fruitsAsMap = fruits.collectAsMap() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitsAsMap: $fruitsAsMap""") } } // scalastyle:on println
Example 65
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)
    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
    persistedLines.collect()
  }
}
Example 66
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
Example 67
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object PartitionExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("Partition") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1) println(s"""nums:\n ${nums.collect().mkString(", ")}""") println() println("original:") nums.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar3 = nums.repartition(3) println("repartition to 3:") numsPar3.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar2 = numsPar3.coalesce(2) println("coalesce to 2:") numsPar2.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } // scalastyle:on println
Example 68
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object WordCountExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)
    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}
// scalastyle:on println
Example 69
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object AggregateByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("AggregateByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))( seqOp = (partAcc, n) => partAcc += n, combOp = (acc1, acc2) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 70
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapValuesExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapValuesExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1))) val plusOnes = fruits.mapValues(v => v + 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""") } } // scalastyle:on println
Example 71
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SortByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyDesc = fruits.sortByKey(ascending = false)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyDesc: ${sortedByKeyDesc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 72
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CoGroupExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CoGroupExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val grouped = persons.map(_.swap).cogroup(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""grouped:\n${grouped.collect().mkString("\n")}""") } } // scalastyle:on println
Example 73
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object JoinExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("JoinExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val leftJoined = persons.map(_.swap).join(cities) val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities) val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities) val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""") println() println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""") println() println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""") println() println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""") } } // scalastyle:on println
Example 74
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object GroupByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("GroupByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val grouped = fruits.groupByKey() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""grouped: ${grouped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 75
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.reduceByKey((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 76
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CombineByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CombineByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.combineByKey( createCombiner = (v: Int) => Acc(v.toDouble, 1), mergeValue = (partAcc: Acc, n: Int) => partAcc += n, mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 77
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.foldByKey(0)((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 78
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapPartitionsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple", "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple", "num": 2}""",
      """{"name": "Peach", "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
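The point of mapPartitions here is that the Jackson ObjectMapper is constructed once per partition rather than once per record. A sketch of the plain map alternative (assuming the same jsonLines RDD and imports as above) shows the per-record overhead that mapPartitions avoids:

// With map, a new ObjectMapper would be built for every single record.
val parsedPerRecord = jsonLines.map { line =>
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)
  val f = mapper.readValue(line, classOf[Map[String, String]])
  (f("name"), f("num"))
}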
Example 79
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FlatMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))
    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 80
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SetOperationsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 81
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 82
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ZipExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)
    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped: ${zipped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 83
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object DistinctExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 84
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SampleExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 85
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FilterExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 86
Source File: SparkFunSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark

// scalastyle:off
import org.apache.log4j.{Level, Logger}
import org.scalatest.{FunSuite, Outcome}

import org.apache.spark.Logging

// Note: the class declaration is elided in this listing; it is presumably along
// the lines of `abstract class SparkFunSuite extends FunSuite with Logging {`.

  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)
      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }
}
Example 87
Source File: Logging.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata.objects

import org.apache.log4j.{Level, LogManager, Logger}

trait Logging {

  // Method to get the logger name for this object
  protected def logName: String = {
    // Ignore trailing $'s in the class names for Scala objects
    this.getClass.getName.stripSuffix("$")
  }

  private val log: Logger = LogManager.getLogger(logName)

  // Set logger level
  protected def setLogLevel(level: Level): Unit = log.setLevel(level)

  // Log methods that take only a String
  protected def logInfo(msg: => String) {
    if (log.isInfoEnabled) log.info(msg)
  }

  protected def logDebug(msg: => String) {
    if (log.isDebugEnabled) log.debug(msg)
  }

  protected def logTrace(msg: => String) {
    if (log.isTraceEnabled) log.trace(msg)
  }

  protected def logWarning(msg: => String) {
    log.warn(msg)
  }

  protected def logError(msg: => String) {
    log.error(msg)
  }

  // Log methods that take Throwables (Exceptions/Errors) too
  protected def logInfo(msg: => String, throwable: Throwable) {
    if (log.isInfoEnabled) log.info(msg, throwable)
  }

  protected def logDebug(msg: => String, throwable: Throwable) {
    if (log.isDebugEnabled) log.debug(msg, throwable)
  }

  protected def logTrace(msg: => String, throwable: Throwable) {
    if (log.isTraceEnabled) log.trace(msg, throwable)
  }

  protected def logWarning(msg: => String, throwable: Throwable) {
    log.warn(msg, throwable)
  }

  protected def logError(msg: => String, throwable: Throwable) {
    log.error(msg, throwable)
  }

  protected def isTraceEnabled: Boolean = {
    log.isTraceEnabled
  }
}
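A minimal usage sketch (the CopyJob class is hypothetical, not part of spark-distcp): mix the trait into any class and call the level-guarded helpers. Because msg is a by-name parameter, the message string is only built when the corresponding log level is enabled.

import org.apache.log4j.Level
import com.coxautodata.objects.Logging

class CopyJob extends Logging {
  def run(): Unit = {
    setLogLevel(Level.INFO)
    logInfo(s"starting copy at ${System.currentTimeMillis()}")
    logDebug("not even constructed unless DEBUG is enabled") // guarded by isDebugEnabled
  }
}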
Example 88
Source File: StressReceiver.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.log4j.Logger

class StressReceiver[T](
    index: Int,
    rowGenerator: RowGenerator[T],
    config: Config,
    blockIntervalInMs: Int,
    storageLevel: StorageLevel)
  extends Receiver[T](storageLevel) {

  class EmitterThread(receiver: StressReceiver[_]) extends Thread(s"Emitter$index") {
    override def run(): Unit = {
      val rowIterator = rowGenerator.generatePartition(config.seed, index)
      val throughPutPerBlockInterval =
        (blockIntervalInMs / (config.streamingBatchIntervalSeconds * 1000.0) *
          config.receiverThroughputPerBatch).toLong
      while (rowIterator.hasNext) {
        val batchBegin = System.currentTimeMillis()
        for (x <- 1L to throughPutPerBlockInterval if rowIterator.hasNext) {
          store(rowIterator.next())
        }
        val batchEnd = System.currentTimeMillis()
        val napTime = blockIntervalInMs - (batchEnd - batchBegin)
        if (napTime > 0) Thread.sleep(napTime)
      }
      receiver.stop("Iterator Empty")
    }
  }

  def onStart() = {
    new EmitterThread(this).start()
  }

  def onStop() = {
  }
}
Example 89
Source File: MCLModelSuite.scala From MCL_spark with MIT License | 5 votes |
package org.apache.spark.mllib.clustering import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.util.Utils class MCLModelSuite extends MCLFunSuite{ // Disable Spark messages when running program Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) test("model save/load", UnitTest){ val users: RDD[(VertexId, String)] = sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) val relationships: RDD[Edge[Double]] = sc.parallelize( Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), Edge(0, 2, 1.0), Edge(2, 0, 1.0), Edge(0, 3, 1.0), Edge(3, 0, 1.0), Edge(1, 2, 1.0), Edge(2, 1, 1.0), Edge(1, 3, 1.0), Edge(3, 1, 1.0), Edge(2, 3, 1.0), Edge(3, 2, 1.0), Edge(4, 5, 1.0), Edge(5, 4, 1.0), Edge(4, 6, 1.0), Edge(6, 4, 1.0), Edge(4, 7, 1.0), Edge(7, 4, 1.0), Edge(5, 6, 1.0), Edge(6, 5, 1.0), Edge(5, 7, 1.0), Edge(7, 5, 1.0), Edge(6, 7, 1.0), Edge(7, 6, 1.0), Edge(3, 8, 1.0), Edge(8, 3, 1.0), Edge(9, 8, 1.0), Edge(8, 9, 1.0), Edge(9, 10, 1.0), Edge(10, 9, 1.0), Edge(4, 10, 1.0), Edge(10, 4, 1.0) )) val graph = Graph(users, relationships) val model: MCLModel = MCL.train(graph) // Check number of clusters model.nbClusters shouldEqual 3 // Check save and load methods val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString Array(true, false).foreach { case selector => // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = MCLModel.load(sc, path) assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id")) } finally { Utils.deleteRecursively(tempDir) } } } test("nodes assignments", UnitTest) { val nodeId = 1.0.toLong val cluster = 2.0.toLong val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster)) newAssignment.id shouldEqual nodeId newAssignment.cluster shouldEqual cluster } }
Example 90
Source File: PCA.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.unsupervised.pca

import scala.util.Try

import org.apache.log4j.Logger
import org.apache.commons.math3.linear._
import org.apache.commons.math3.stat.correlation.Covariance

import org.scalaml.stats.TSeries
import org.scalaml.Predef._
import org.scalaml.Predef.Context._
import org.scalaml.core.ITransform
import org.scalaml.util.LoggingUtils._
import TSeries._

case class PCAModel(covariance: DblMatrix, eigenvalues: Array[Double])

// Note: the enclosing PCA class declaration (and the definitions of xt, model,
// dimension and margin used below) is elided in this listing.

  override def |> : PartialFunction[Array[T], Try[Double]] = {
    case x: Array[T] if x.length == dimension(xt) && model.isDefined =>
      Try(margin(x, model.get.eigenvalues))
  }

  override def toString: String = model.map(m => {
    val covStr = m.covariance./:(new StringBuilder)((b, r) => b.append(s"${r.mkString(" ")}\n")).toString()
    s"""\nEigenvalues:\n${m.eigenvalues.mkString(" ,")}\n\nCovariance matrix\n | $covStr""".stripMargin
  }).getOrElse("PCA model undefined")
}
//-------------------------------- EOF -------------------------------------------------------------------------
Example 91
Source File: ClusteringModule.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.workflow.module

import org.apache.log4j.Logger

import org.scalaml.Predef.Context.ToDouble
import org.scalaml.Predef.VSeries

// Note: the enclosing module trait and clustering class declarations (including
// the em, show, error and logger members used below) are elided in this listing.

    override def execute(xt: Array[T]): Unit = {
      try {
        val results = em |> xt
        show(results, logger)
      } catch {
        case e: MatchError =>
          val errMsg = s"${e.getMessage()} caused by ${e.getCause.toString}"
          error(s"ClusteringModule.MultivariateEM $errMsg", logger)
        case e: Throwable => error("ClusteringModule.Kmeans", logger, e)
      }
    }
  }
}
// --------------------------------------- EOF ------------------------------------------------------
Example 92
Source File: BiasVariance.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.stats

// 3rd party classes
import org.apache.log4j.Logger
// Scala for Machine Learning classes and singletons
import org.scalaml.Predef._

// Note: the enclosing companion object declaration (and the NUMVALUES_LIMITS
// constant it defines) is elided in this listing.

  def apply(emul: Double => Double, nValues: Int): BiasVariance = new BiasVariance(emul, nValues)

  private def check(nValues: Int): Unit = {
    require(
      nValues > NUMVALUES_LIMITS._1 && nValues < NUMVALUES_LIMITS._2,
      s"BiasVarianceEmulator.check Size of training sets $nValues is out of range"
    )
  }
}
// ----------------------- EOF --------------------------------------
Example 93
Source File: FileUtils.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.util

import org.apache.log4j.Logger

import scala.io.Source._
import scala.util.{Failure, Success, Try}

// Note: the enclosing object declaration (and the logger it defines) is elided
// in this listing.

  def write(content: String, pathName: String, className: String): Boolean = {
    import java.io.PrintWriter
    import DisplayUtils._

    var printWriter: Option[PrintWriter] = None
    var status = false
    Try {
      printWriter = Some(new PrintWriter(pathName))
      printWriter.foreach(_.write(content))
      printWriter.foreach(_.flush)
      printWriter.foreach(_.close)
      status = true
    } match {
      // Catch and display exception description and return false
      case Failure(e) =>
        error(s"$className.write failed for $pathName", logger, e)
        if (printWriter.isDefined) printWriter.foreach(_.close)
        status
      case Success(s) => status
    }
  }
}
// --------------------------------- EOF -------------------------------------
Example 94
Source File: KmeansTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mllib import org.apache.log4j.{Level, Logger} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import org.scalaml.{Logging, Resource} import org.scalaml.Predef._ import org.scalaml.stats.TSeries._ import org.scalaml.trading.YahooFinancials import org.scalaml.workflow.data.DataSource import org.scalatest.FunSuite import org.scalatest.concurrent.ScalaFutures import scala.concurrent.Future final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource { import scala.concurrent.ExecutionContext.Implicits.global protected[this] val name = "Spark MLlib K-Means" private val K = 8 private val NRUNS = 4 private val MAXITERS = 60 private val PATH = "spark/CSCO.csv" private val CACHE = false test(s"$name evaluation") { show(s"Evaluation") Logger.getRootLogger.setLevel(Level.ERROR) // The Spark configuration has to be customize to your environment val sparkConf = new SparkConf().setMaster("local") .setAppName("Kmeans") .set("spark.executor.memory", "4096m") implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file val kmeanClustering: Option[Kmeans] = extract.map(input => { val volatilityVol = zipToSeries(input._1, input._2).take(500) val config = new KmeansConfig(K, MAXITERS, NRUNS) val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY) Kmeans(config, rddConfig, volatilityVol) }) // Wraps into a future to enforce time out in case of a straggler val ft = Future[Boolean] { predict(kmeanClustering) } whenReady(ft) { result => assert(result) } sc.stop } private def predict(kmeanClustering: Option[Kmeans]): Boolean = { kmeanClustering.map(kmeansCluster => { val obs = Array[Double](0.1, 0.9) val clusterId1 = kmeansCluster |> obs show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1") val obs2 = Array[Double](0.56, 0.11) val clusterId2 = kmeansCluster |> obs2 val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2" show(s"$name result: $result") }) true } private def extract: Option[(DblVec, DblVec)] = { import scala.util._ val extractors = List[Array[String] => Double]( YahooFinancials.volatility, YahooFinancials.volume ) DataSource(getPath(PATH).get, true).map(_.|>) match { case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption case Failure(e) => failureHandler(e) None } } } // --------------------------------- EOF -------------------------------------------------
Example 95
Source File: StreamsTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.scalability.scala

import java.lang.ref._

import org.apache.log4j.Logger
import org.scalaml.Logging
import org.scalaml.Predef._
import org.scalatest.{FlatSpec, Matchers}

import scala.math._

case class DataPoint(x: DblVec, y: Double)

final class StreamsTest extends FlatSpec with Matchers with Logging {
  import scala.util.Random

  protected[this] val name = "Scala streams"

  it should s"$name huge list" in {
    show(s"$name huge list")
    val input = (0 until 1000000000).toStream
    input(10) should be(10)
  }

  it should s"$name recursion" in {
    show(s"$name recursion")

    def mean(strm: => Stream[Double]): Double = {
      @scala.annotation.tailrec
      def mean(z: Double, count: Int, strm: Stream[Double]): (Double, Int) =
        if (strm.isEmpty) (z, count)
        else mean((1.0 - 1.0 / count) * z + strm.head / count, count + 1, strm.tail)
      mean(0.0, 1, strm)._1
    }

    val input = List[Double](2.0, 5.0, 3.5, 2.0, 5.7, 1.0, 8.0)
    val ave: Double = mean(input.toStream)
    ave should be(3.88 +- 0.05)
  }

  it should s"$name with recycled memory blocks" in {
    show(s"$name with recycled memory blocks") // fixed: the original listing omitted the s interpolator

    type DblVec = Vector[Double]
    val DATASIZE = 20000

    val dot = (s: Double, xy: (Double, Double)) => s + xy._1 * xy._2
    val diff = (x: DblVec, y: DblVec) => x.zip(y).aggregate(0.0)(dot, _ + _)
    val weights = Vector[Double](0.5, 0.7)
    val lossFunction = new LossFunction(diff, weights, DATASIZE)

    // Create a stream of weak references to 10 stream segments of size DATASIZE/10
    val stream = () => new WeakReference(
      Stream.tabulate(DATASIZE)(n => DataPoint(
        Vector[Double](n.toDouble, n * (n.toDouble)),
        n.toDouble * weights(0) + n * (n.toDouble) * weights(1) + 0.1 * Random.nextDouble
      ))
    )

    // Compute a simple distance using the dot product
    val totalLoss = sqrt(lossFunction.compute(stream))
    show(s"$name totalLoss ${totalLoss / DATASIZE}")

    val averageLoss = totalLoss / DATASIZE
    averageLoss should be(0.0 +- 0.001)
  }
}
// -------------------------- EOF --------------------------------
Example 96
Source File: Application.scala From retail_analytics with Apache License 2.0 | 5 votes |
package controllers

import scalaz._
import Scalaz._
import scalaz.EitherT._
import scalaz.Validation
//import scalaz.Validation.FlatMap._
import scalaz.NonEmptyList._

import play.api.mvc._
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level

import models._
import models.stack._

import play.api.libs.json._

object Application extends Controller {

  def index() = Action { implicit request =>
    Ok(views.html.index("Megam Analytics."))
  }

  def upload = Action(parse.multipartFormData) { implicit request =>
    request.body.file("picture").map { picture =>
      import java.io.File
      val filename = picture.filename
      val contentType = picture.contentType
      picture.ref.moveTo(new File("/tmp/" + filename))
      models.HDFSFileService.saveFile("/tmp/" + filename) match {
        case Success(succ) => {
          val fu = List(("success" -> succ))
          Redirect("/").flashing(fu: _*)
        }
        case Failure(err) => {
          val fu = List(("error" -> "File doesn't get uploaded"))
          Redirect("/").flashing(fu: _*)
        }
      }
    }.getOrElse {
      val fu = List(("error" -> "File doesn't get uploaded.."))
      Redirect("/").flashing(fu: _*)
    }
  }

  def analysis() = Action { implicit request =>
    val tuple_res = models.Retail.buyingbehaviour(MConfig.recommand_ID.toInt, MConfig.retailfile)
    println("BACK==========================>>>")
    println(tuple_res._1)
    //val finalJson = {
    //  for {
    //    product <- productList
    //  } yield Json.parse(product).as[JsObject]
    //  }
    Ok(views.html.finalProducts(tuple_res._1, tuple_res._2))
  }
}
Example 97
Source File: InductiveClassifier.scala From scala-cp with Apache License 2.0 | 5 votes |
package se.uu.it.cp

import org.apache.log4j.Logger

// Note: the enclosing classifier class declaration (and the alphas, log and
// mondrianPv members used below) is elided in this listing.

  def predict(features: Seq[Double], significance: Double) = {
    // Validate input
    require(significance > 0 && significance < 1, s"significance $significance is not in (0,1)")
    alphas.foreach { a =>
      if (a.length < 1 / significance - 1) {
        log.warn(s"too few calibration samples (${a.length}) for significance $significance")
      }
    }
    // Compute prediction set
    mondrianPv(features).zipWithIndex.map {
      case (pVal, c) =>
        if (pVal > significance) {
          Set(c.toDouble)
        } else {
          Set[Double]()
        }
    }.reduce(_ ++ _)
  }
}
Example 98
Source File: FileOutputIT.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta import java.sql.Timestamp import java.util.UUID import com.github.nscala_time.time.Imports._ import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.scalatest._ import scala.reflect.io.File class FileOutputIT extends FlatSpec with ShouldMatchers with BeforeAndAfterAll { self: FlatSpec => @transient var sc: SparkContext = _ override def beforeAll { Logger.getRootLogger.setLevel(Level.ERROR) sc = FileOutputIT.getNewLocalSparkContext(1, "test") } override def afterAll { sc.stop() System.clearProperty("spark.driver.port") } trait CommonValues { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val time = new Timestamp(DateTime.now.getMillis) val data = sc.parallelize(Seq(Person("Kevin", 18, time), Person("Kira", 21, time), Person("Ariadne", 26, time))).toDF val tmpPath: String = s"/tmp/sparta-test/${UUID.randomUUID().toString}" } trait WithEventData extends CommonValues { val properties = Map("path" -> tmpPath, "createDifferentFiles" -> "false") val output = new FileOutput("file-test", properties) } "FileOutputIT" should "save a dataframe" in new WithEventData { output.save(data, SaveModeEnum.Append, Map(Output.TimeDimensionKey -> "minute", Output.TableNameKey -> "person")) val source = new java.io.File(tmpPath).listFiles() val read = sqlContext.read.json(tmpPath).toDF read.count shouldBe(3) File("/tmp/sparta-test").deleteRecursively } } object FileOutputIT { def getNewLocalSparkContext(numExecutors: Int = 1, title: String): SparkContext = { val conf = new SparkConf().setMaster(s"local[$numExecutors]").setAppName(title) SparkContext.getOrCreate(conf) } } case class Person(name: String, age: Int, minute: Timestamp) extends Serializable
Example 99
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object GamerSparkSQLExample { def main(args:Array[String]): Unit = { if (args.length == 0) { println("{kudumaster} {runLocal}") return } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val runLocal = args(1).equals("l") println("Loading Spark Context") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } println("Loading Spark Context: Finished") println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") println("Query 1: SELECT count(*) FROM gamer") val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT * FROM gamer limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") val startTimeQ3 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") val startTimeQ4 = System.currentTimeMillis() sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) Vectors.dense(array) }) val dataCount = parsedData.count() if (dataCount > 0) { val clusters = KMeans.train(parsedData, 3, 5) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) } //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } }
Example 100
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
Example 101
Source File: CommandLineClient.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest.minicluster

import org.apache.log4j.Logger

import org.apache.gearpump.cluster.ApplicationStatus
import org.apache.gearpump.integrationtest.Docker

class CommandLineClient(host: String) {

  private val LOG = Logger.getLogger(getClass)

  def listApps(): Array[String] = {
    gearCommand(host, "gear info").split("\n").filter(
      _.startsWith("application: ")
    )
  }

  def listRunningApps(): Array[String] =
    listApps().filter(_.contains(s", status: ${ApplicationStatus.ACTIVE}"))

  def queryApp(appId: Int): String = try {
    listApps().filter(_.startsWith(s"application: $appId")).head
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      ""
  }

  def submitAppAndCaptureOutput(jar: String, executorNum: Int, args: String = ""): String = {
    gearCommand(host, s"gear app -verbose true -jar $jar -executors $executorNum $args")
  }

  def submitApp(jar: String, args: String = ""): Int = {
    LOG.debug(s"|=> Submit Application $jar...")
    submitAppUse("gear app", jar, args)
  }

  private def submitAppUse(launcher: String, jar: String, args: String = ""): Int = try {
    gearCommand(host, s"$launcher -jar $jar $args").split("\n")
      .filter(_.contains("The application id is ")).head.split(" ").last.toInt
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      -1
  }

  def killApp(appId: Int): Boolean = {
    tryGearCommand(host, s"gear kill -appid $appId")
  }

  private def gearCommand(container: String, command: String): String = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.execute(container, s"/opt/start $command")
  }

  private def tryGearCommand(container: String, command: String): Boolean = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.executeSilently(container, s"/opt/start $command")
  }
}
Example 102
Source File: Util.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

import org.apache.log4j.Logger

object Util {

  private val LOG = Logger.getLogger(getClass)

  def encodeUriComponent(s: String): String = {
    try {
      java.net.URLEncoder.encode(s, "UTF-8")
        .replaceAll("\\+", "%20")
        .replaceAll("\\%21", "!")
        .replaceAll("\\%27", "'")
        .replaceAll("\\%28", "(")
        .replaceAll("\\%29", ")")
        .replaceAll("\\%7E", "~")
    } catch {
      case ex: Throwable => s
    }
  }

  def retryUntil(
      condition: () => Boolean,
      conditionDescription: String,
      maxTries: Int = 15,
      interval: Duration = 10.seconds): Unit = {
    var met = false
    var tries = 0

    while (!met && tries < maxTries) {
      met = Try(condition()) match {
        case Success(true) => true
        case Success(false) => false
        case Failure(ex) => false
      }
      tries += 1

      if (!met) {
        LOG.error(s"Failed due to (false == $conditionDescription), " +
          s"retrying for the ${tries} times...")
        Thread.sleep(interval.toMillis)
      } else {
        LOG.info(s"Success ($conditionDescription) after ${tries} retries")
      }
    }

    if (!met) {
      throw new Exception(s"Failed after ${tries} retries, ($conditionDescription) == false")
    }
  }
}
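A hypothetical usage of retryUntil (the client and appId names are made up for illustration): poll an external condition until it holds, sleeping between attempts, and fail with an exception once maxTries is exhausted.

import scala.concurrent.duration._

// Wait up to 30 * 5 seconds for the application to report an active status.
Util.retryUntil(
  condition = () => client.queryApp(appId).contains("status: active"),
  conditionDescription = s"application $appId is active",
  maxTries = 30,
  interval = 5.seconds)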
Example 103
Source File: ShellExec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._
import scala.sys.process._

import org.apache.log4j.Logger
import org.apache.storm.shade.org.eclipse.jetty.util.QuotedStringTokenizer

// Note: the enclosing object declaration (and the LOG and PROCESS_TIMEOUT
// members referenced below) is elided in this listing.

  private def splitQuotedString(str: String): List[String] = {
    val splitter = new QuotedStringTokenizer(str, " \t\n\r")
    splitter.asInstanceOf[java.util.Enumeration[String]].asScala.toList
  }

  def exec(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT): Boolean = {
    LOG.debug(s"$sender => `$command`")

    val p = splitQuotedString(command).run()
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          LOG.error(s"timeout to execute command `$command`")
          p.destroy()
          p.exitValue()
      }
    }
    LOG.debug(s"$sender <= exit $retval")
    retval == 0
  }

  def execAndCaptureOutput(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT)
    : String = {
    LOG.debug(s"$sender => `$command`")

    val buf = new StringBuilder
    val processLogger = ProcessLogger(
      (o: String) => buf.append(o).append("\n"),
      (e: String) => buf.append(e).append("\n"))
    val p = splitQuotedString(command).run(processLogger)
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          p.destroy()
          p.exitValue()
      }
    }

    val output = buf.toString().trim
    val PREVIEW_MAX_LENGTH = 200
    val preview = if (output.length > PREVIEW_MAX_LENGTH) {
      output.substring(0, PREVIEW_MAX_LENGTH) + "..."
    } else {
      output
    }
    LOG.debug(s"$sender <= `$preview` exit $retval")

    if (retval != 0) {
      throw new RuntimeException(
        s"exited ($retval) by executing `$command`")
    }
    output
  }
}
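Assuming the elided enclosing `object ShellExec`, a usage sketch of the two entry points (the commands are arbitrary examples): exec returns whether the command exited with 0, while execAndCaptureOutput returns the combined output and throws on a non-zero exit.

// Run a command and only check its exit status.
val pulled = ShellExec.exec("docker pull alpine:latest", sender = "it-setup")

// Run a command and capture its output (throws if it exits non-zero).
val kernel = ShellExec.execAndCaptureOutput("uname -r", sender = "it-setup")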
Example 104
Source File: NumericalDataProducer.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest.kafka

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.ByteArraySerializer
import org.apache.log4j.Logger

import org.apache.gearpump.streaming.serializer.ChillSerializer

class NumericalDataProducer(topic: String, bootstrapServers: String) {

  private val LOG = Logger.getLogger(getClass)
  private val producer = createProducer
  private val WRITE_SLEEP_NANOS = 10
  private val serializer = new ChillSerializer[Int]
  var lastWriteNum = 0

  def start(): Unit = {
    produceThread.start()
  }

  def stop(): Unit = {
    if (produceThread.isAlive) {
      produceThread.interrupt()
      produceThread.join()
    }
    producer.close()
  }

  def producedNumbers: Range = {
    Range(1, lastWriteNum + 1)
  }

  private def createProducer: KafkaProducer[Array[Byte], Array[Byte]] = {
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", bootstrapServers)
    new KafkaProducer[Array[Byte], Array[Byte]](properties, new ByteArraySerializer,
      new ByteArraySerializer)
  }

  private val produceThread = new Thread(new Runnable {
    override def run(): Unit = {
      try {
        while (!Thread.currentThread.isInterrupted) {
          lastWriteNum += 1
          val msg = serializer.serialize(lastWriteNum)
          val record = new ProducerRecord[Array[Byte], Array[Byte]](topic, msg)
          producer.send(record)
          Thread.sleep(0, WRITE_SLEEP_NANOS)
        }
      } catch {
        case ex: InterruptedException =>
          LOG.error("message producing is stopped by an interrupt")
      }
    }
  })
}
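A usage sketch based on the public API above (the topic name and broker address are placeholders): start the producer, let it publish for a while, stop it, and then use producedNumbers to know which values a downstream test should verify.

val producer = new NumericalDataProducer("gearpump-it-topic", "localhost:9092")
producer.start()
Thread.sleep(5000) // let it publish for a few seconds
producer.stop()
println(s"wrote numbers ${producer.producedNumbers.start} to ${producer.producedNumbers.end - 1}")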
Example 105
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.linalg._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming.MemoryStream import org.scalatest.FunSuite import org.apache.log4j.{Level, Logger} case class TestRow(features: Vector) class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase { override def beforeAll(): Unit = { super.beforeAll() Logger.getLogger("org").setLevel(Level.OFF) } test("streaming model with one center should converge to true center") { import spark.implicits._ val k = 1 val dim = 5 val clusterSpread = 0.1 val seed = 63 // TODO: this test is very flaky. The centers do not converge for some // (most?) random seeds val (batches, trueCenters) = StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed) val inputStream = MemoryStream[TestRow] val ds = inputStream.toDS() val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01) val query = skm.evilTrain(ds.toDF()) val streamingModels = batches.map { batch => inputStream.addData(batch) query.processAllAvailable() skm.getModel } // TODO: use spark's testing suite streamingModels.last.centers.zip(trueCenters).foreach { case (center, trueCenter) => val centers = center.toArray.mkString(",") val trueCenters = trueCenter.toArray.mkString(",") println(s"${centers} | ${trueCenters}") assert(center.toArray.zip(trueCenter.toArray).forall( x => math.abs(x._1 - x._2) < 0.1)) } query.stop() } def compareBatchAndStreaming( batchModel: KMeansModel, streamingModel: StreamingKMeansModel, validationData: DataFrame): Unit = { assert(batchModel.clusterCenters === streamingModel.centers) // TODO: implement prediction comparison } } object StreamingKMeansSuite { def generateBatches( numPoints: Int, numBatches: Int, k: Int, d: Int, r: Double, seed: Int, initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = { val rand = scala.util.Random rand.setSeed(seed) val centers = initCenters match { case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))) case _ => initCenters } val data = (0 until numBatches).map { i => (0 until numPoints).map { idx => val center = centers(idx % k) val vec = Vectors.dense( Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r)) TestRow(vec) } } (data, centers) } }
Example 106
Source File: SuspiciousConnects.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.slf4j.LoggerFactory

import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
import org.opennetworkinsight.dns.DNSSuspiciousConnects
import org.opennetworkinsight.netflow.FlowSuspiciousConnects
import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis

// Note: the enclosing object declaration (and the OutputDelimiter it references)
// is elided in this listing.

  def main(args: Array[String]) {

    val parser = SuspiciousConnectsArgumentParser.parser

    parser.parse(args, SuspiciousConnectsConfig()) match {

      case Some(config) =>
        val logger = LoggerFactory.getLogger(this.getClass)
        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)

        val analysis = config.analysis
        val sparkConfig = new SparkConf().setAppName("ONI ML: " + analysis + " lda")
        val sparkContext = new SparkContext(sparkConfig)
        val sqlContext = new SQLContext(sparkContext)
        implicit val outputDelimiter = OutputDelimiter

        analysis match {
          case "flow" => FlowSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "dns" => DNSSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
          case _ => println("ERROR: unsupported (or misspelled) analysis: " + analysis)
        }

        sparkContext.stop()

      case None => println("Error parsing arguments")
    }

    System.exit(0)
  }
}
Example 107
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 108
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
Example 109
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 110
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 111
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 112
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 113
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 114
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
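Since svmPipeline only needs a SparkContext (the TSV path is hard-coded inside the method), a minimal driver might look like the following sketch; the master, app name, and object name are placeholders:

package org.sparksamples.classification.stumbleupon

import org.apache.spark.{SparkConf, SparkContext}

object SVMPipelineRunner {
  def main(args: Array[String]): Unit = {
    // Local master and app name are placeholders.
    val conf = new SparkConf().setMaster("local[2]").setAppName("SVMPipelineRunner")
    val sc = new SparkContext(conf)
    SVMPipeline.svmPipeline(sc)
    sc.stop()
  }
}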
Example 115
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 116
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
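TrainValidationSplit returns the pipeline fitted with the best parameter combination from the grid. A small sketch of how the chosen hyper-parameters could be read back from the fitted model; the object and method names are hypothetical, and the logistic regression stage is assumed to be the last pipeline stage, as in the example above:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.tuning.TrainValidationSplitModel

object TrainValidationSplitInspector {
  // Report the hyper-parameters chosen by TrainValidationSplit.
  def reportBestParams(tvsModel: TrainValidationSplitModel): Unit = {
    val bestPipeline = tvsModel.bestModel.asInstanceOf[PipelineModel]
    val bestLr = bestPipeline.stages.last.asInstanceOf[LogisticRegressionModel]
    println(s"best regParam        = ${bestLr.getRegParam}")
    println(s"best elasticNetParam = ${bestLr.getElasticNetParam}")
    println(s"best fitIntercept    = ${bestLr.getFitIntercept}")
  }
}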
Example 117
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
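Beyond accuracy, a fitted random forest exposes feature importances. A hedged sketch of printing them for the fitted PipelineModel; the helper name is mine, the classifier is assumed to be the last pipeline stage, and featureNames is assumed to follow the VectorAssembler's input column order:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.RandomForestClassificationModel

object RandomForestInspector {
  // Print the ten most important features of the fitted model.
  def printFeatureImportances(model: PipelineModel, featureNames: Array[String]): Unit = {
    val rfModel = model.stages.last.asInstanceOf[RandomForestClassificationModel]
    rfModel.featureImportances.toArray.zip(featureNames)
      .sortBy { case (importance, _) => -importance }
      .take(10)
      .foreach { case (importance, name) => println(f"$name%-30s $importance%.4f") }
  }
}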
Example 118
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 119
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 120
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
Example 121
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 122
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 123
Source File: GeneralizedLinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{SparkSession, _} object GeneralizedLinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new GeneralizedLinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def genLinearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new GeneralizedLinearRegression() .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) // Fit the model val model = lr.fit(training) // Print the coefficients and intercept for generalized linear regression model println(s"Coefficients: ${model.coefficients}") println(s"Intercept: ${model.intercept}") // Summarize the model over the training set and print out some metrics val summary = model.summary println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}") println(s"T Values: ${summary.tValues.mkString(",")}") println(s"P Values: ${summary.pValues.mkString(",")}") println(s"Dispersion: ${summary.dispersion}") println(s"Null Deviance: ${summary.nullDeviance}") println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}") println(s"Deviance: ${summary.deviance}") println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}") println(s"AIC: ${summary.aic}") println("Deviance Residuals: ") summary.residuals().show() } }
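A minimal driver for the LIBSVM variant, assuming a local SparkSession and the lsvmHours.txt path referenced inside the method; the object name is hypothetical:

package org.sparksamples.regression.bikesharing

import org.apache.spark.sql.SparkSession

object GeneralizedLinearRegressionRunner {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("GeneralizedLinearRegressionRunner")
      .getOrCreate()
    // Reads the LIBSVM path hard-coded inside the method.
    GeneralizedLinearRegressionPipeline.genLinearRegressionWithSVMFormat(spark)
    spark.stop()
  }
}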
Example 124
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, SparkSession} object LinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new LinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setRegParam(0.1) .setElasticNetParam(1.0) .setMaxIter(10) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def linearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new LinearRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) // Fit the model val lrModel = lr.fit(training) // Print the coefficients and intercept for linear regression println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // Summarize the model over the training set and print out some metrics val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") } }
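The vector-format entry point expects a VectorAssembler and a VectorIndexer whose final output column is "features". A sketch of that wiring, assuming a hypothetical CSV of numeric bike-sharing features with a "label" column; the path, object name, and maxCategories value are illustrative:

package org.sparksamples.regression.bikesharing

import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.sql.SparkSession

object LinearRegressionRunner {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("LinearRegressionRunner")
      .getOrCreate()

    // Placeholder input: numeric feature columns plus a "label" column.
    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("/path/to/bike_sharing_features.csv")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns.filter(_ != "label"))
      .setOutputCol("rawFeatures")
    val indexer = new VectorIndexer()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
      .setMaxCategories(24)

    LinearRegressionPipeline.linearRegressionWithVectorFormat(assembler, indexer, df)
    spark.stop()
  }
}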
Example 125
Source File: SparseNaiveBayes.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
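The same flow can be condensed without the scopt parser. A sketch assuming a local master and a placeholder LIBSVM input path; the object name is hypothetical:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayesSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("SparseNaiveBayesSketch"))

    // Placeholder path to a LIBSVM-formatted file.
    val data = MLUtils.loadLibSVMFile(sc, "/path/to/sample_libsvm_data.txt").cache()
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 11L)

    val model = NaiveBayes.train(training, lambda = 1.0)
    val accuracy = test.map(p => if (model.predict(p.features) == p.label) 1.0 else 0.0).mean()
    println(s"Test accuracy = $accuracy")
    sc.stop()
  }
}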
Example 126
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
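Once trained, the KMeansModel can score new points and expose its centers. A small, hypothetical helper (names and the toy three-dimensional point are mine) for inspecting the model returned by run above:

import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

object KMeansModelInspector {
  def describe(model: KMeansModel): Unit = {
    // Assign a toy three-dimensional point to its nearest cluster.
    val cluster = model.predict(Vectors.dense(1.0, 2.0, 3.0))
    println(s"Toy point assigned to cluster $cluster")

    model.clusterCenters.zipWithIndex.foreach { case (center, i) =>
      println(s"Cluster $i center: $center")
    }
  }
}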
Example 127
Source File: StreamingExamples.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
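The same idea works outside Spark's Logging trait with plain log4j calls. A minimal sketch that quietens a few noisy packages; the object name and package names are illustrative:

import org.apache.log4j.{Level, Logger}

object QuietLogs {
  def quieten(): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    Seq("org.apache.spark", "org.eclipse.jetty", "akka").foreach { name =>
      Logger.getLogger(name).setLevel(Level.WARN)
    }
  }
}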
Example 128
Source File: YarnScheduler.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 129
Source File: BeforeAndAfterWithContext.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import eleflow.uberdata.core.IUberdataContext import eleflow.uberdata.core.util.ClusterSettings import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkEnv} import org.scalatest.{BeforeAndAfterEach, Suite} object TestSparkConf { def conf = { val sconf = new SparkConf() sconf.set("spark.app.name", "teste") sconf } val separator = "," } trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite => val defaultFilePath = "src/test/resources/" import TestSparkConf._ ClusterSettings.master = Some("local[*]") conf.set("spark.driver.allowMultipleContexts", "true") @transient val context = IUberdataContext.getUC(conf) override def beforeEach() = { setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) } def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { loggers.map { loggerName => val logger = Logger.getLogger(loggerName) val prevLevel = logger.getLevel logger.setLevel(level) loggerName -> prevLevel }.toMap } override def afterEach() = { val get = SparkEnv.get val rpcEnv = if (get != null) { Some(get.rpcEnv) } else None context.clearContext() //rpcEnv.foreach( // _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown()) System.clearProperty("spark.master.port") } }
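The setLogLevels helper above changes logger levels but never restores them. A sketch of the same idea as a save-and-restore wrapper; the object name and method shape are mine, not part of the project:

import org.apache.log4j.{Level, Logger}

object LogLevelUtil {
  // Run `body` with the given loggers temporarily set to `level`, then restore
  // whatever level (possibly null, i.e. inherited) each logger had before.
  def withLogLevel[T](level: Level, loggerNames: Seq[String])(body: => T): T = {
    val saved = loggerNames.map { name =>
      val logger = Logger.getLogger(name)
      val previous = logger.getLevel
      logger.setLevel(level)
      name -> previous
    }
    try body
    finally saved.foreach { case (name, previous) => Logger.getLogger(name).setLevel(previous) }
  }
}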
Example 130
Source File: MLLibSuite.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} class MLLibSuite extends FunSuite with BeforeAndAfterAll { private var sparkSession: SparkSession = _ var savedLevels: Map[String, Level] = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate() // Travis limits the size of the log file produced by a build. Because we do run a small // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the // log level to ERROR, just for this suite, to avoid displeasing travis. savedLevels = Seq("akka", "org", "com.databricks").map { name => val logger = Logger.getLogger(name) val curLevel = logger.getLevel logger.setLevel(Level.ERROR) name -> curLevel }.toMap } override def afterAll(): Unit = { savedLevels.foreach { case (name, level) => Logger.getLogger(name).setLevel(level) } try { if (sparkSession != null) { sparkSession.stop() } // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") sparkSession = null } finally { super.afterAll() } } test("test MlLib benchmarks with mllib-small.yaml.") { val results = MLLib.run(yamlConfig = MLLib.smallConfig) val failures = results.na.drop(Seq("failure")) if (failures.count() > 0) { failures.select("name", "failure.*").collect().foreach { case Row(name: String, error: String, message: String) => println( s"""There as a failure in the benchmark for $name: | $error ${message.replace("\n", "\n ")} """.stripMargin) } fail("Unable to run all benchmarks successfully, see console output for more info.") } } test("test before benchmark methods for pipeline benchmarks.") { val benchmarks = MLLib.getBenchmarks(MLLib.getConf(yamlConfig = MLLib.smallConfig)) benchmarks.foreach { b => b.beforeBenchmark() } } }
Example 131
Source File: DeleteFromTableEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil
import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore
import org.apache.spark.sql.secondaryindex.util.SecondaryIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{DeleteFromTablePostEvent, DeleteFromTablePreEvent, Event, OperationContext, OperationEventListener}

class DeleteFromTableEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case deleteFromTablePreEvent: DeleteFromTablePreEvent =>
        LOGGER.info("Delete from table pre event listener called")
        val carbonTable = deleteFromTablePreEvent.carbonTable
        // Should not allow delete on index table
        if (carbonTable.isIndexTable) {
          sys.error(s"Delete is not permitted on Index Table " +
            s"[${carbonTable.getDatabaseName}.${carbonTable.getTableName}]")
        }
      case deleteFromTablePostEvent: DeleteFromTablePostEvent =>
        LOGGER.info("Delete from table post event listener called")
        val parentCarbonTable = deleteFromTablePostEvent.carbonTable
        val sparkSession = deleteFromTablePostEvent.sparkSession
        CarbonInternalMetastore.refreshIndexInfo(parentCarbonTable.getDatabaseName,
          parentCarbonTable.getTableName,
          parentCarbonTable)(sparkSession)
        val indexTableList = CarbonIndexUtil.getSecondaryIndexes(parentCarbonTable)
        if (!indexTableList.isEmpty) {
          val indexCarbonTableList = indexTableList.asScala.map { indexTableName =>
            CarbonEnv.getInstance(sparkSession).carbonMetaStore
              .lookupRelation(Option(parentCarbonTable.getDatabaseName), indexTableName)(
                sparkSession)
              .asInstanceOf[CarbonRelation].carbonTable
          }.toList
          SecondaryIndexUtil
            .updateTableStatusForIndexTables(parentCarbonTable, indexCarbonTableList.asJava)
        }
    }
  }
}
Example 132
Source File: AlterTableDropColumnEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.execution.command.AlterTableDropColumnModel import org.apache.spark.sql.execution.command.index.DropIndexCommand import org.apache.spark.sql.hive.CarbonRelation import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.events.{AlterTableDropColumnPreEvent, Event, OperationContext, OperationEventListener} class AlterTableDropColumnEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case alterTableDropColumnPreEvent: AlterTableDropColumnPreEvent => LOGGER.info("alter table drop column event listener called") val carbonTable = alterTableDropColumnPreEvent.carbonTable val dbName = carbonTable.getDatabaseName val tableName = carbonTable.getTableName val tablePath = carbonTable.getTablePath val sparkSession = alterTableDropColumnPreEvent.sparkSession val alterTableDropColumnModel = alterTableDropColumnPreEvent.alterTableDropColumnModel dropApplicableSITables(dbName, tableName, tablePath, alterTableDropColumnModel)(sparkSession) } } private def dropApplicableSITables(dbName: String, tableName: String, tablePath: String, alterTableDropColumnModel: AlterTableDropColumnModel) (sparkSession: SparkSession) { var indexTableToDrop: Seq[String] = Seq.empty val catalog = CarbonEnv.getInstance(sparkSession).carbonMetaStore val parentCarbonTable = catalog.lookupRelation(Some(dbName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable val secondaryIndexMap = parentCarbonTable.getIndexesMap.get(IndexType.SI.getIndexProviderName) if (null == secondaryIndexMap) { // if secondary index map is empty, return return } secondaryIndexMap.asScala.foreach(indexTable => { val indexColumns = indexTable._2.asScala(CarbonCommonConstants.INDEX_COLUMNS).split(",") val colSize = alterTableDropColumnModel.columns.intersect(indexColumns).size if (colSize > 0) { if (colSize == indexColumns.size) { indexTableToDrop ++= Seq(indexTable._1) } else { sys .error(s"Index Table [${ indexTable._1 }] exists with combination of provided drop column(s) and other columns, drop " + s"index table & retry") } } }) indexTableToDrop.foreach { indexTable => DropIndexCommand(ifExistsSet = true, Some(dbName), tableName, indexTable.toLowerCase, needLock = false).run(sparkSession) } } }
Example 133
Source File: DeleteSegmentByDateListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.api.CarbonStore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{DeleteSegmentByDatePostEvent, Event, OperationContext, OperationEventListener} class DeleteSegmentByDateListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case deleteSegmentPostEvent: DeleteSegmentByDatePostEvent => LOGGER.info("Delete segment By date post event listener called") val carbonTable = deleteSegmentPostEvent.carbonTable val loadDates = deleteSegmentPostEvent.loadDates val sparkSession = deleteSegmentPostEvent.sparkSession CarbonIndexUtil.getSecondaryIndexes(carbonTable).asScala.foreach { tableName => val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table = metastore .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable CarbonStore .deleteLoadByDate(loadDates, carbonTable.getDatabaseName, table.getTableName, table) } } } }
Example 134
Source File: DropCacheSIEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.cache.CarbonDropCacheCommand import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener} object DropCacheSIEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override protected def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case dropCacheEvent: DropTableCacheEvent => val carbonTable = dropCacheEvent.carbonTable val sparkSession = dropCacheEvent.sparkSession val internalCall = dropCacheEvent.internalCall if (carbonTable.isIndexTable && !internalCall) { throw new UnsupportedOperationException("Operation not allowed on child table.") } val allIndexTables = carbonTable.getIndexTableNames( IndexType.SI.getIndexProviderName) val dbName = carbonTable.getDatabaseName for (indexTableName <- allIndexTables.asScala) { try { val dropCacheCommandForChildTable = CarbonDropCacheCommand( TableIdentifier(indexTableName, Some(dbName)), internalCall = true) dropCacheCommandForChildTable.processMetadata(sparkSession) } catch { case e: Exception => LOGGER.error(s"Clean cache for SI table $indexTableName failed. ", e) } } } } }
Example 135
Source File: ShowCacheSIEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.CarbonEnv import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, ShowTableCacheEvent} object ShowCacheSIEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case showTableCacheEvent: ShowTableCacheEvent => val carbonTable = showTableCacheEvent.carbonTable val sparkSession = showTableCacheEvent.sparkSession val internalCall = showTableCacheEvent.internalCall if (carbonTable.isIndexTable && !internalCall) { throw new UnsupportedOperationException("Operation not allowed on index table.") } val childTables = operationContext.getProperty(carbonTable.getTableUniqueName) .asInstanceOf[List[(String, String)]] val indexTables = carbonTable.getIndexTableNames( IndexType.SI.getIndexProviderName).asScala // if there are no index tables for a given fact table do not perform any action operationContext.setProperty(carbonTable.getTableUniqueName, indexTables.map { indexTable => val indexCarbonTable = CarbonEnv.getCarbonTable(Some(carbonTable.getDatabaseName), indexTable)(sparkSession) (carbonTable.getDatabaseName + "-" + indexTable, "Secondary Index", indexCarbonTable.getTableId) }.toList ++ childTables) } } }
Example 136
Source File: UpdateTablePreEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, UpdateTablePreEvent} class UpdateTablePreEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case updateTablePreEvent: UpdateTablePreEvent => LOGGER.info("Update table pre event listener called") val carbonTable = updateTablePreEvent.carbonTable // Should not allow update on index table if (carbonTable.isIndexTable) { sys .error(s"Update is not permitted on Index Table [${ carbonTable .getDatabaseName }.${ carbonTable.getTableName }]") } else if (!carbonTable.getIndexesMap.isEmpty) { sys .error(s"Update is not permitted on table that contains secondary index [${ carbonTable .getDatabaseName }.${ carbonTable.getTableName }]. Drop all indexes and retry") } } } }
Example 137
Source File: AlterTableRenameEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive._ import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.events.{AlterTableRenamePostEvent, Event, OperationContext, OperationEventListener} class AlterTableRenameEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case alterTableRenamePreEvent: AlterTableRenamePostEvent => LOGGER.info("alter table rename Pre event listener called") val alterTableRenameModel = alterTableRenamePreEvent.alterTableRenameModel val carbonTable = alterTableRenamePreEvent.carbonTable val sparkSession = alterTableRenamePreEvent.sparkSession val oldDatabaseName = carbonTable.getDatabaseName val newTableName = alterTableRenameModel.newTableIdentifier.table val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table: CarbonTable = metastore .lookupRelation(Some(oldDatabaseName), newTableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable table.getIndexTableNames(IndexType.SI.getIndexProviderName) .asScala.map { entry => CarbonSessionCatalogUtil.getClient(sparkSession).runSqlHive( s"ALTER TABLE $oldDatabaseName.${ entry } " + s"SET SERDEPROPERTIES ('parentTableName'='$newTableName')") } } } }
Example 138
Source File: CleanFilesPostEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.optimizer.CarbonFilters import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.mutate.CarbonUpdateUtil import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.events.{CleanFilesPostEvent, Event, OperationContext, OperationEventListener} class CleanFilesPostEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case cleanFilesPostEvent: CleanFilesPostEvent => LOGGER.info("Clean files post event listener called") val carbonTable = cleanFilesPostEvent.carbonTable val indexTables = CarbonIndexUtil .getIndexCarbonTables(carbonTable, cleanFilesPostEvent.sparkSession) indexTables.foreach { indexTable => val partitions: Option[Seq[PartitionSpec]] = CarbonFilters.getPartitions( Seq.empty[Expression], cleanFilesPostEvent.sparkSession, indexTable) SegmentStatusManager.deleteLoadsAndUpdateMetadata( indexTable, true, partitions.map(_.asJava).orNull) CarbonUpdateUtil.cleanUpDeltaFiles(indexTable, true) } } } }
Example 139
Source File: DeleteSegmentByIdListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.api.CarbonStore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.util.path.CarbonTablePath import org.apache.carbondata.events.{DeleteSegmentByIdPostEvent, Event, OperationContext, OperationEventListener} class DeleteSegmentByIdListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case deleteSegmentPostEvent: DeleteSegmentByIdPostEvent => LOGGER.info("Delete segment By id post event listener called") val carbonTable = deleteSegmentPostEvent.carbonTable val loadIds = deleteSegmentPostEvent.loadIds val sparkSession = deleteSegmentPostEvent.sparkSession val siIndexesMap = carbonTable.getIndexesMap .get(IndexType.SI.getIndexProviderName) if (null != siIndexesMap) { siIndexesMap.keySet().asScala.foreach { tableName => val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table = metastore .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable val tableStatusFilePath = CarbonTablePath.getTableStatusFilePath(table.getTablePath) // this check is added to verify if the table status file for the index table exists // or not. Delete on index tables is only to be called if the table status file exists. if (FileFactory.isFileExist(tableStatusFilePath)) { CarbonStore .deleteLoadById(loadIds, carbonTable.getDatabaseName, table.getTableName, table) } } } } } }
Example 140
Source File: CreateCarbonRelationEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{CreateCarbonRelationPostEvent, Event, OperationContext, OperationEventListener} class CreateCarbonRelationEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case createCarbonRelationPostEvent: CreateCarbonRelationPostEvent => LOGGER.debug("Create carbon relation post event listener called") val carbonTable = createCarbonRelationPostEvent.carbonTable val databaseName = createCarbonRelationPostEvent.carbonTable.getDatabaseName val tableName = createCarbonRelationPostEvent.carbonTable.getTableName val sparkSession = createCarbonRelationPostEvent.sparkSession CarbonInternalMetastore .refreshIndexInfo(databaseName, tableName, carbonTable, createCarbonRelationPostEvent.needLock)(sparkSession) } } }
Example 141
Source File: SILoadEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.secondaryindex.command.IndexModel import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata import org.apache.carbondata.events._ import org.apache.carbondata.processing.loading.events.LoadEvents.LoadTablePreStatusUpdateEvent class SILoadEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case _: LoadTablePreStatusUpdateEvent => if (operationContext.getProperty("isAddLoad") != null && operationContext.getProperty("isAddLoad").toString.toBoolean) { return } LOGGER.info("Load pre status update event-listener called") val loadTablePreStatusUpdateEvent = event.asInstanceOf[LoadTablePreStatusUpdateEvent] val carbonLoadModel = loadTablePreStatusUpdateEvent.getCarbonLoadModel val sparkSession = SparkSession.getActiveSession.get // when Si creation and load to main table are parallel, get the carbonTable from the // metastore which will have the latest index Info val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val carbonTable = metaStore .lookupRelation(Some(carbonLoadModel.getDatabaseName), carbonLoadModel.getTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable val indexMetadata = carbonTable.getIndexMetadata val secondaryIndexProvider = IndexType.SI.getIndexProviderName if (null != indexMetadata && null != indexMetadata.getIndexesMap && null != indexMetadata.getIndexesMap.get(secondaryIndexProvider)) { val indexTables = indexMetadata.getIndexesMap .get(secondaryIndexProvider).keySet().asScala // if there are no index tables for a given fact table do not perform any action if (indexTables.nonEmpty) { indexTables.foreach { indexTableName => val secondaryIndex = IndexModel(Some(carbonTable.getDatabaseName), indexMetadata.getParentTableName, indexMetadata .getIndexColumns(secondaryIndexProvider, indexTableName).split(",").toList, indexTableName) val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val indexTable = metaStore .lookupRelation(Some(carbonLoadModel.getDatabaseName), indexTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable CarbonIndexUtil .LoadToSITable(sparkSession, carbonLoadModel, indexTableName, isLoadToFailedSISegments = false, secondaryIndex, carbonTable, indexTable) } } else { logInfo(s"No index tables found for table: ${carbonTable.getTableName}") } } else { logInfo(s"Index information is null for table: ${carbonTable.getTableName}") } } } }
Example 142
Source File: SIRefreshEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{Event, LookupRelationPostEvent, OperationContext, OperationEventListener} class SIRefreshEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case lookupRelationPostEvent: LookupRelationPostEvent => LOGGER.debug("SI Refresh post event listener called") val carbonTable = lookupRelationPostEvent.carbonTable val databaseName = lookupRelationPostEvent.carbonTable.getDatabaseName val tableName = lookupRelationPostEvent.carbonTable.getTableName val sparkSession = lookupRelationPostEvent.sparkSession CarbonInternalMetastore.refreshIndexInfo(databaseName, tableName, carbonTable)(sparkSession) } } }
Example 143
Source File: CarbonDropMVCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.view import org.apache.log4j.Logger import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.AtomicRunnableCommand import org.apache.spark.sql.execution.command.table.CarbonDropTableCommand import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.events.{OperationContext, OperationListenerBus} import org.apache.carbondata.view.{MVCatalogInSpark, MVManagerInSpark, UpdateMVPostExecutionEvent, UpdateMVPreExecutionEvent} case class CarbonDropMVCommand( databaseNameOption: Option[String], name: String, ifExistsSet: Boolean, forceDrop: Boolean = false) extends AtomicRunnableCommand { private val logger = CarbonDropMVCommand.LOGGER private var dropTableCommand: CarbonDropTableCommand = _ override def processMetadata(session: SparkSession): Seq[Row] = { setAuditInfo(Map("mvName" -> name)) val viewManager = MVManagerInSpark.get(session) try { logger.info("Trying to drop materialized view schema") val databaseName = databaseNameOption.getOrElse(session.sessionState.catalog.getCurrentDatabase) val schema = viewManager.getSchema(databaseName, name) if (schema != null) { // Drop mv status. val databaseLocation = viewManager.getDatabaseLocation(databaseName) val systemDirectoryPath = CarbonProperties.getInstance() .getSystemFolderLocationPerDatabase(FileFactory .getCarbonFile(databaseLocation) .getCanonicalPath) val identifier = TableIdentifier(name, Option(databaseName)) val operationContext = new OperationContext() OperationListenerBus.getInstance().fireEvent( UpdateMVPreExecutionEvent(session, systemDirectoryPath, identifier), operationContext) viewManager.onDrop(databaseName, name) OperationListenerBus.getInstance().fireEvent( UpdateMVPostExecutionEvent(session, systemDirectoryPath, identifier), operationContext) // Drop mv table. val dropTableCommand = CarbonDropTableCommand( ifExistsSet = true, Option(databaseName), name, dropChildTable = true, isInternalCall = true) dropTableCommand.processMetadata(session) // Drop mv schema. try { viewManager.deleteSchema(databaseName, name) } finally { val viewCatalog = viewManager.getCatalog() .asInstanceOf[MVCatalogInSpark] if (viewCatalog != null) { viewCatalog.deregisterSchema(schema.getIdentifier) } } this.dropTableCommand = dropTableCommand } } catch { case exception: Exception => if (!ifExistsSet) { throw exception } } Seq.empty } override def processData(sparkSession: SparkSession): Seq[Row] = { // delete the table folder if (this.dropTableCommand != null) { this.dropTableCommand.processData(sparkSession) } Seq.empty } override protected def opName: String = "DROP MATERIALIZED VIEW" } object CarbonDropMVCommand { private val LOGGER: Logger = LogServiceFactory.getLogService( classOf[CarbonDropMVCommand].getCanonicalName) }
Example 144
Source File: CacheUtil.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.cache import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.util.SparkSQLUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.index.Segment import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, IndexSchema} import org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.index.bloom.{BloomCacheKeyValue, BloomCoarseGrainIndexFactory} import org.apache.carbondata.indexserver.IndexServer import org.apache.carbondata.processing.merger.CarbonDataMergerUtil object CacheUtil { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) def getAllIndexFiles(carbonTable: CarbonTable)(sparkSession: SparkSession): List[String] = { if (carbonTable.isTransactionalTable) { val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier val validAndInvalidSegmentsInfo = new SegmentStatusManager(absoluteTableIdentifier) .getValidAndInvalidSegments(carbonTable.isMV) // Fire a job to clear the invalid segments cached in the executors. if (CarbonProperties.getInstance().isDistributedPruningEnabled(carbonTable.getDatabaseName, carbonTable.getTableName)) { val invalidSegmentIds = validAndInvalidSegmentsInfo.getInvalidSegments.asScala .map(_.getSegmentNo).toArray try { IndexServer.getClient .invalidateSegmentCache(carbonTable, invalidSegmentIds, SparkSQLUtil.getTaskGroupId(sparkSession)) } catch { case e: Exception => LOGGER.warn("Failed to clear cache from executors. ", e) } } validAndInvalidSegmentsInfo.getValidSegments.asScala.flatMap { segment => segment.getCommittedIndexFile.keySet().asScala }.map { indexFile => indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR, CarbonCommonConstants.FILE_SEPARATOR) }.toList } else { val tablePath = carbonTable.getTablePath val readCommittedScope = new LatestFilesReadCommittedScope(tablePath, FileFactory.getConfiguration) readCommittedScope.getSegmentList.flatMap { load => val seg = new Segment(load.getLoadName, null, readCommittedScope) seg.getCommittedIndexFile.keySet().asScala }.map { indexFile => indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR, CarbonCommonConstants.FILE_SEPARATOR) }.toList } } def getBloomCacheKeys(carbonTable: CarbonTable, indexSchema: IndexSchema): List[String] = { val segments = CarbonDataMergerUtil.getValidSegmentList(carbonTable).asScala // Generate shard Path for the indexSchema val shardPaths = segments.flatMap { segment => BloomCoarseGrainIndexFactory.getAllShardPaths(carbonTable.getTablePath, segment.getSegmentNo, indexSchema.getIndexName).asScala } // get index columns val indexColumns = carbonTable.getIndexedColumns(indexSchema.getIndexColumns).asScala.map { entry => entry.getColName } // generate cache key using shard path and index columns on which bloom was created. val indexKeys = shardPaths.flatMap { shardPath => indexColumns.map { indexCol => new BloomCacheKeyValue.CacheKey(shardPath, indexCol).toString } } indexKeys.toList } }
Example 145
Source File: DropCacheEventListeners.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.listeners import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.log4j.Logger import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.cache.{CacheUtil, CarbonDropCacheCommand} import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.table.IndexSchema import org.apache.carbondata.core.view.MVSchema import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener} import org.apache.carbondata.view.MVManagerInSpark object DropCacheMVEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override protected def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case dropCacheEvent: DropTableCacheEvent => val carbonTable = dropCacheEvent.carbonTable val cache = CacheProvider.getInstance().getCarbonCache val indexProviderMap = carbonTable.getIndexesMap val bloomIndexProvider = IndexType.BLOOMFILTER.getIndexProviderName if (!indexProviderMap.isEmpty && null != indexProviderMap.get(bloomIndexProvider)) { val bloomIndexes = indexProviderMap.get(bloomIndexProvider) val bloomIndexIterator = bloomIndexes.entrySet().iterator() while (bloomIndexIterator.hasNext) { val bloomIndexEntry = bloomIndexIterator.next() val index = new IndexSchema(bloomIndexEntry.getKey, bloomIndexProvider) index.setProperties(bloomIndexEntry.getValue) try { // Get index keys val indexKeys = CacheUtil.getBloomCacheKeys(carbonTable, index) // remove index keys from cache cache.removeAll(indexKeys.asJava) } catch { case e: Exception => LOGGER.warn( s"Clean cache for Bloom index ${ index.getIndexName } failed.", e) } } } } } }
Example 146
Source File: CompactionTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import java.util import org.apache.log4j.Logger import org.apache.spark.TaskContext import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonCompactionTaskCompletionListener import org.apache.spark.sql.execution.command.management.CommonLoadUtils import org.apache.spark.util.CollectionAccumulator import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.scan.result.iterator.RawResultIterator import org.apache.carbondata.core.segmentmeta.SegmentMetaDataInfo import org.apache.carbondata.processing.loading.TableProcessingOperations import org.apache.carbondata.processing.loading.model.CarbonLoadModel import org.apache.carbondata.processing.merger.{AbstractResultProcessor, CarbonCompactionExecutor, CarbonCompactionUtil} class CompactionTaskCompletionListener( carbonLoadModel: CarbonLoadModel, exec: CarbonCompactionExecutor, processor: AbstractResultProcessor, rawResultIteratorMap: util.Map[String, util.List[RawResultIterator]], segmentMetaDataAccumulator: CollectionAccumulator[Map[String, SegmentMetaDataInfo]], queryStartTime: Long) extends CarbonCompactionTaskCompletionListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getName) override def onTaskCompletion(context: TaskContext): Unit = { deleteLocalDataFolders() // close all the query executor service and clean up memory acquired during query processing if (null != exec) { LOGGER.info("Cleaning up query resources acquired during compaction") exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.UNSORTED_IDX), queryStartTime) exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.SORTED_IDX), queryStartTime) } // clean up the resources for processor if (null != processor) { LOGGER.info("Closing compaction processor instance to clean up loading resources") processor.close() } // fill segment metadata to accumulator CommonLoadUtils.fillSegmentMetaDataInfoToAccumulator(carbonLoadModel.getTableName, carbonLoadModel.getSegmentId, segmentMetaDataAccumulator) } private def deleteLocalDataFolders(): Unit = { try { LOGGER.info("Deleting local folder store location") val isCompactionFlow = true TableProcessingOperations .deleteLocalDataLoadFolderLocation(carbonLoadModel, isCompactionFlow, false) } catch { case e: Exception => LOGGER.error(e) } } }
Example 147
Source File: QueryExecutorWithLogging.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark.utils import java.io.{File, FileOutputStream, PrintWriter} import java.util.Calendar import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.{ConnectDriver, EngineConnection, QueryResult} import net.jcazevedo.moultingyaml._ import net.jcazevedo.moultingyaml.DefaultYamlProtocol import net.jcazevedo.moultingyaml.DefaultYamlProtocol._ import org.apache.log4j.Logger import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.ConnectDriver.Value import pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryType.QueryType case class Query(queryId:String, queryType:String, queryEngine:String, storageFormat:String,queryDesc:String, statement:String) object QueryType extends Enumeration { type QueryType = Value val SELECT, CREATE, UPDATE = Value } object QueryExecutorWithLogging { val log = Logger.getLogger("pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryExecutorWithLogging") object QueryYamlProtocol extends DefaultYamlProtocol { implicit val queryFormat = yamlFormat6(Query) } def runStatement(query: Query, conn:EngineConnection, logFile:String, dryRun: Boolean) = { log.info(s"Running ${query.queryId} ... using ${query.queryEngine} engine") log.debug(s"Executing query: ${query.statement}") query.queryType.toLowerCase() match { case "select" => logQuery(conn, query, logFile, dryRun) case _ => conn.executeUpdate(query.statement.toLowerCase) } } def parseQueryYAML(file:String,storageType:String,connString:String, kuduMaster:String, dbName:String, ifExplain:Boolean = false) : Query ={ log.info(s"Parsing ${file}") val lines = scala.io.Source.fromFile(file).mkString val yml = lines.stripMargin.parseYaml import QueryYamlProtocol._ queryPreprocess(yml.convertTo[Query], storageType, connString, kuduMaster, dbName, ifExplain) } private def logQuery(conn:EngineConnection, query: Query, logFile:String, dryRun:Boolean) ={ val rs = conn.executeQuery(query.statement.toLowerCase,true) //rs.rs.next() val result = s"${Calendar.getInstance().getTime().toString},${query.queryId}," + s"${query.queryEngine},${query.storageFormat},${rs.timing.get.getTiming()},${dryRun.toString}\n" log.info(s"Result: ${result}") val writer = new PrintWriter(new FileOutputStream(new File(logFile),true)) writer.write(result) writer.flush() writer.close() } private def queryPreprocess(query: Query, storageType: String, connString: String, kuduMaster: String, dbName: String, ifExplain: Boolean) = { def replaceVars(property:String) ={ property .replaceAll("\\{\\{DATA_FORMAT\\}\\}",storageType.toLowerCase) .replaceAll("\\{\\{DB_NAME\\}\\}",dbName.toLowerCase) .replaceAll("\\{\\{KUDU_MASTER\\}\\}",kuduMaster ) .replaceAll("\\{\\{IF_EXPLAIN\\}\\}", if(ifExplain) "EXPLAIN " else "") .replaceAll("\\{\\{PERCENTILE_APPROX\\}\\}", if(query.queryEngine.toLowerCase=="presto") "approx_percentile" else "percentile_approx") } query.copy( queryId = replaceVars(query.queryId), queryDesc = replaceVars(query.queryDesc), storageFormat = replaceVars(query.storageFormat), statement = replaceVars(query.statement.replaceAll(",",",\n").replaceAll("\\(","\\( ")) ) } }
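The logQuery helper above appends one comma-separated timing line per query to a results file, in addition to the normal log4j output. A stripped-down sketch of that append-and-flush pattern (file name and fields are illustrative only, not the benchmark's exact format):

import java.io.{File, FileOutputStream, PrintWriter}
import java.util.Calendar
import org.apache.log4j.Logger

object TimingLogSketch {
  val log = Logger.getLogger(getClass.getName)

  def appendTiming(logFile: String, queryId: String, millis: Long): Unit = {
    val line = s"${Calendar.getInstance().getTime},$queryId,$millis\n"
    log.info(s"Result: $line")
    // Open in append mode so earlier results are preserved.
    val writer = new PrintWriter(new FileOutputStream(new File(logFile), true))
    try writer.write(line) finally { writer.flush(); writer.close() }
  }
}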
Example 148
Source File: SomeSQLOnTitanic.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.machinelearning.titanic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SomeSQLOnTitanic { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main (args:Array[String]): Unit = { val testFile = args(0) val trainFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } import sparkSession.implicits._ //Load Data val trainDs = sparkSession.read.option("header", "true") .option("charset", "UTF8") .option("delimiter",",") .csv(trainFile) trainDs.createOrReplaceTempView("train") println("Sex -> Servived") sparkSession.sql("select Sex, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Sex").collect().foreach(println) println("Cabin -> Servived") sparkSession.sql("select substring(Cabin,1,1), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("Age -> Servived") sparkSession.sql("select round(cast(Age as Int) / 10) as age_block, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("PClass -> Servived") sparkSession.sql("select pclass, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by pclass order by 1").collect().foreach(println) println("Embarked -> Servived") sparkSession.sql("select Embarked, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Embarked order by 1").collect().foreach(println) println("Fare -> Servived") sparkSession.sql("select round((Fare / 10)), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("Servived -> Servived") sparkSession.sql("select sum(Survived), count(*) from train order by 1").collect().foreach(println) sparkSession.stop() } }
Example 149
Source File: ManyToManyNormalJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNormalJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[((Long, Long), Int)] actions.foreach(a => { val aValue = a.getAs[Long]("action") actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { resultList.+=(((aValue, bValue), 1)) } }) }) resultList.toSeq }).reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } }
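The flatMap above turns every record's action list into unordered pairs (keeping only a < b so each pair is emitted once) and then counts pairs with reduceByKey. The pair-generation and counting steps can be checked locally with plain Scala collections; the sketch below uses a toy list of action ids rather than the JSON rows from the example:

object PairCountSketch {
  def main(args: Array[String]): Unit = {
    val records: Seq[Seq[Long]] = Seq(Seq(1L, 2L, 3L), Seq(2L, 3L), Seq(1L, 3L))

    val pairCounts: Map[(Long, Long), Int] = records
      .flatMap { actions =>
        for {
          a <- actions
          b <- actions
          if a < b            // emit each unordered pair exactly once
        } yield ((a, b), 1)
      }
      .groupBy(_._1)
      .map { case (pair, hits) => (pair, hits.map(_._2).sum) }

    pairCounts.toSeq.sortBy(_._1).foreach(println)
    // ((1,2),1)  ((1,3),2)  ((2,3),2)
  }
}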
Example 150
Source File: ManyToManyNestedJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNestedJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[(Long, NestedCount)] actions.foreach(a => { val aValue = a.getAs[Long]("action") val aNestedCount = new NestedCount actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { aNestedCount.+=(bValue, 1) } }) resultList.+=((aValue, aNestedCount)) }) resultList.toSeq }).reduceByKey((a, b) => a + b) //.reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } } //1,2 //1,3 //1,4 //1 (2, 3, 4) class NestedCount() extends Serializable{ val map = new mutable.HashMap[Long, Long]() def += (key:Long, count:Long): Unit = { val currentValue = map.getOrElse(key, 0l) map.put(key, currentValue + count) } def + (other:NestedCount): NestedCount = { val result = new NestedCount other.map.foreach(r => { result.+=(r._1, r._2) }) this.map.foreach(r => { result.+=(r._1, r._2) }) result } override def toString(): String = { val stringBuilder = new StringBuilder map.foreach(r => { stringBuilder.append("(" + r._1 + "," + r._2 + ")") }) stringBuilder.toString() } }
Example 151
Source File: SaltedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.salted import java.util.Random import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SaltedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDfLeft = sparkSession.read.json(jsonPath) val saltedLeft = jsonDfLeft.rdd.flatMap(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") Seq((group + "_" + 0, value),(group + "_" + 1, value)) }) val jsonDfRight = sparkSession.read.json(jsonPath) val saltedRight = jsonDfRight.rdd.mapPartitions(it => { val random = new Random() it.map(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") (group + "_" + random.nextInt(2), value) }) }) jsonDfLeft.join(jsonDfRight).collect().foreach(r => { println("Normal.result:" + r) }) println("----") saltedLeft.join(saltedRight).collect().foreach(r => { println("Salted.result:" + r) }) } }
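The salting trick above spreads a skewed join key: every left-side record is replicated once per salt value (0 and 1 here), while each right-side record appends a single random salt, so matching rows for a hot key are split across two reducer keys instead of piling onto one. A minimal sketch of just the key transformation on plain Scala pairs, assuming a salt factor of 2 and invented sample data:

import scala.util.Random

object SaltedKeySketch {
  val SaltFactor = 2

  def main(args: Array[String]): Unit = {
    val left  = Seq(("hotKey", 1L), ("hotKey", 2L), ("coldKey", 3L))
    val right = Seq(("hotKey", 10L), ("coldKey", 30L))

    // Left side: replicate every record under each possible salt.
    val saltedLeft = left.flatMap { case (k, v) =>
      (0 until SaltFactor).map(salt => (s"${k}_$salt", v))
    }

    // Right side: each record picks one salt at random.
    val random = new Random()
    val saltedRight = right.map { case (k, v) => (s"${k}_${random.nextInt(SaltFactor)}", v) }

    // A toy hash join on the salted keys; in Spark this would be an RDD join.
    val rightByKey = saltedRight.groupBy(_._1)
    val joined = saltedLeft.flatMap { case (k, lv) =>
      rightByKey.getOrElse(k, Nil).map { case (_, rv) => (k, (lv, rv)) }
    }
    joined.foreach(println)
  }
}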
Example 152
Source File: SmallWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.small import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SmallWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val timeDifRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") //(key , value) (group, (time, value)) }).groupByKey().flatMap{case (group, records) => var lastValue = 0l val localList = records.toSeq println("localList.size:" + localList.size) localList.sortBy(_._1).map{case (time, value) => val dif = value - lastValue lastValue = value (group, time, value, dif) } } timeDifRdd.take(10).foreach(r => { println(r) }) sparkSession.stop() } }
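The groupByKey/flatMap above computes, per group, the difference between each value and the previous one after sorting by time; it only works when a single group's records fit comfortably in one executor's memory, which is why the package calls it "small" windowing. The per-group logic can be sketched with plain collections (sample data below is made up):

object RunningDiffSketch {
  def main(args: Array[String]): Unit = {
    val records = Seq(("g1", 1L, 10L), ("g1", 2L, 15L), ("g1", 3L, 12L), ("g2", 1L, 7L))

    val withDiffs = records.groupBy(_._1).toSeq.flatMap { case (group, rows) =>
      var lastValue = 0L
      rows.sortBy(_._2).map { case (_, time, value) =>
        val dif = value - lastValue   // difference against the previous value in time order
        lastValue = value
        (group, time, value, dif)
      }
    }
    withDiffs.foreach(println)
  }
}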
Example 153
Source File: SuperBigWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.superbig import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SuperBigWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val jsonPath = args(0) val pageSize = args(1).toInt val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = spark.read.json(jsonPath) import spark.implicits._ val diffDs = jsonDf.flatMap(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") val timePage = time / pageSize if (time % pageSize == 0) { //Am I on the edge of the page Seq((timePage, (time, value)), (timePage + 1, (time, value))) } else { Seq((timePage, (time, value))) } }).groupByKey(r => r._1).flatMapGroups((k, it) => { var lastValue = 0l it.toSeq. sortBy{case (page, (time, value)) => time}. map{case (page, (time, value)) => val dif = value - lastValue lastValue = value (time, value, dif) } }) diffDs.collect().foreach(r => println(" - " + r)) spark.stop() } }
Example 154
Source File: SessionWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import scala.collection.mutable object SessionWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val sessionJson = args(0) val timeGap = args(1).toInt val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val sessionDs = sparkSession.read.json(sessionJson).as[JsonLeadLag] sessionDs.createOrReplaceTempView("session_table") sparkSession.sql("select * from session_table").collect().foreach(println) val sessionDefinitinonDf = sessionDs.rdd.map(r => { (r.group, r) }).groupByKey().flatMap{ case (group, jsonObjIt) => var lastStart:Long = -1 var lastEnd:Long = -1 var sessionCount = 1 var eventsInASession = 0 val sessionList = new mutable.MutableList[SessionDefinition] jsonObjIt.toSeq.sortBy(r => r.ts).foreach(record => { val ts = record.ts eventsInASession += 1 if (lastStart == -1) { lastStart = ts } else if (ts > lastEnd + timeGap) { sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession) lastStart = ts eventsInASession = 0 } lastEnd = ts }) sessionList } sessionDefinitinonDf.collect().foreach(println) } } case class SessionDefinition(group:String, sessionStart:Long, sessionEnd:Long, sessionLength:Long, sessionEvents:Int)
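The sessionization above walks each group's events in timestamp order and closes a session whenever the gap to the previous event exceeds timeGap (note that, as written, the listing only appends a SessionDefinition when the next session begins, so each group's final in-progress session never reaches sessionList). A compact sketch of the gap-splitting step over one already-sorted list of timestamps, with a hypothetical gap of 30 time units:

object SessionGapSketch {
  case class Session(start: Long, end: Long, events: Int)

  // Split a sorted list of timestamps into sessions separated by gaps larger than `gap`.
  def sessionize(sortedTs: Seq[Long], gap: Long): Seq[Session] =
    sortedTs.foldLeft(List.empty[Session]) {
      case (Nil, ts) => List(Session(ts, ts, 1))
      case (current :: done, ts) if ts - current.end <= gap =>
        current.copy(end = ts, events = current.events + 1) :: done
      case (acc, ts) => Session(ts, ts, 1) :: acc
    }.reverse

  def main(args: Array[String]): Unit = {
    val ts = Seq(100L, 110L, 125L, 200L, 205L, 400L)
    sessionize(ts, gap = 30L).foreach(println)
    // Session(100,125,3), Session(200,205,2), Session(400,400,1)
  }
}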
Example 155
Source File: LeadLagExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object LeadLagExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "LEAD(value) OVER (PARTITION BY group ORDER BY ts) as v_after, " + "LAG(value) OVER (PARTITION BY group ORDER BY ts) as v_before " + "FROM leadlag") leadLagDf.collect().foreach(println) leadLagDf.createOrReplaceTempView("leadlag_stage2") leadLagDf.printSchema() sparkSession.sql("select " + "group, ts, v_now, v_after, v_before, " + "case " + " when v_now < v_after and v_now < v_before then 'valley'" + " when v_now > v_after and v_now > v_before then 'peak'" + " else 'n/a' " + "end " + "from leadlag_stage2").collect().foreach(println) } } case class JsonLeadLag(group:String, ts:Long, value:Long)
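The second query above classifies each row as a peak or a valley by comparing v_now against the LEAD and LAG of value within its group. The same classification can be sketched without Spark by sliding a window of three over the time-ordered values; the series below is invented for illustration:

object PeakValleySketch {
  def classify(before: Long, now: Long, after: Long): String =
    if (now < before && now < after) "valley"
    else if (now > before && now > after) "peak"
    else "n/a"

  def main(args: Array[String]): Unit = {
    val series = Seq(1L, 5L, 2L, 2L, 8L, 3L)   // already ordered by ts
    series.sliding(3).foreach {
      case Seq(before, now, after) => println(s"$now -> ${classify(before, now, after)}")
      case _                       => ()        // windows shorter than 3 at the tail
    }
  }
}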
Example 156
Source File: TumblingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object TumblingWindows { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, " + "round(ts / 3), " + "avg(value), " + "max(value), " + "min(value) " + "FROM leadlag " + "group by 1,2") leadLagDf.collect().foreach(println) } }
Example 157
Source File: InfectionPointWindow.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object InfectionPointWindow { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val inflectionPointJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val inflectionPointDs = sparkSession.read.json(inflectionPointJson).as[JsonInfectionPoint] inflectionPointDs.createOrReplaceTempView("inflection_point") sparkSession.sql("select * from inflection_point").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " + "FROM inflection_point " + "where event_type = 'inflection'") leadLagDf.collect().foreach(println) } } case class JsonInfectionPoint(group:String, ts:Long, value:Long, event_type:String)
Example 158
Source File: SplidingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SplidingWindows { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " + "FROM leadlag") leadLagDf.collect().foreach(println) } }
Example 159
Source File: JsonNestedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} import scala.collection.mutable object JsonNestedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val isLocal = args(0).equalsIgnoreCase("l") val jsonPath = args(1) val outputTableName = args(2) val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") val jsonDf = sparkSession.read.json(jsonPath) val localJsonDf = jsonDf.collect() println("--Df") jsonDf.foreach(row => { println("row:" + row) }) println("--local") localJsonDf.foreach(row => { println("row:" + row) }) jsonDf.createOrReplaceTempView("json_table") println("--Tree Schema") jsonDf.schema.printTreeString() println("--") jsonDf.write.saveAsTable(outputTableName) sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println) println("--") sparkSession.stop() } def populatedFlattedHashMap(row:Row, schema:StructType, fields:Array[StructField], flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]], parentFieldName:String): Unit = { fields.foreach(field => { println("field:" + field.dataType) if (field.dataType.isInstanceOf[ArrayType]) { val elementType = field.dataType.asInstanceOf[ArrayType].elementType if (elementType.isInstanceOf[StructType]) { val childSchema = elementType.asInstanceOf[StructType] val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq) populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".") } } else { val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any]) fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name))) } }) } }
Example 160
Source File: NestedTableExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} import org.apache.spark.sql.{Row, SparkSession} object NestedTableExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 161
Source File: PopulateHiveTable.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} object PopulateHiveTable { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) populated1Df.repartition(2).write.saveAsTable("nested_populated") println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 162
Source File: CountingInAStreamExpUpdateStateByKey.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpUpdateStateByKey { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(word => (word, 1)) .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => { var value = state.getOrElse(0) values.foreach(i => { value += i }) Some(value) }) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
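The heart of the DStream example above is the state function passed to updateStateByKey: it receives the current batch's counts for a word plus the previous running total and returns the new total. That function is ordinary Scala and can be exercised on its own; a sketch with the same Seq[Int]/Option[Int] shape as the listing:

object UpdateStateSketch {
  // Same shape as the function handed to updateStateByKey above.
  def updateCount(newValues: Seq[Int], state: Option[Int]): Option[Int] =
    Some(state.getOrElse(0) + newValues.sum)

  def main(args: Array[String]): Unit = {
    println(updateCount(Seq(1, 1, 1), None))    // Some(3)  first batch
    println(updateCount(Seq(1), Some(3)))       // Some(4)  running total carried forward
    println(updateCount(Seq.empty, Some(4)))    // Some(4)  word absent from this batch
  }
}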
Example 163
Source File: CountingInAStreamExpBatchCounting.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpBatchCounting { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(line => line.toLowerCase.split(" ")) val wordCounts = words.map(word => (word, 1)) .reduceByKey((a,b) => a + b) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
Example 164
Source File: CountingInAStreamMapWithState.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} object CountingInAStreamMapWithState { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")). map(word => WordCountEvent(word, 1)) // Generate running word count val wordCounts = messageDs.groupByKey(tuple => tuple.word). mapGroupsWithState[WordCountInMemory, WordCountReturn](GroupStateTimeout.ProcessingTimeTimeout) { case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) => var newCount = if (state.exists) state.get.countOfWord else 0 events.foreach(tuple => { newCount += tuple.countOfWord }) state.update(WordCountInMemory(newCount)) WordCountReturn(word, newCount) } // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("update") .format("console") .start() query.awaitTermination() } } case class WordCountEvent(word:String, countOfWord:Int) extends Serializable { } case class WordCountInMemory(countOfWord: Int) extends Serializable { } case class WordCountReturn(word:String, countOfWord:Int) extends Serializable { }
Example 165
Source File: CountingInAStreamExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.functions._ object CountingInAStreamExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")) // Generate running word count val wordCounts = messageDs.groupBy("value").count() // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("complete") .format("console") .start() query.awaitTermination() } }
Example 166
Source File: CountingInAStreamDatasetExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.functions._ object CountingInAStreamDatasetExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String].map(line => { MessageBuilder.build(line) }).as[Message] val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price")) val ticketOutput = tickerCount.writeStream .format("Console") .trigger(Trigger.ProcessingTime("5 seconds")) .option("checkpointLocation", checkpointFolder) .outputMode("complete") .format("console") .start() ticketOutput.awaitTermination() } }
Example 167
Source File: CountingInAStreamExpWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp import org.apache.spark.sql.functions._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} object CountingInAStreamExpWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[5]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[5]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .option("includeTimestamp", true) .load() val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => { MessageBuilder.build(line._1, line._2) }).filter(r => r != null).as[Message] val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp")) .withWatermark("eventTime", "30 seconds") .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker") .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window") val ticketOutput = tickerCount.writeStream .format("Console") .option("checkpointLocation", checkpointFolder) .outputMode("update") //.outputMode("complete") .format("console") .option("truncate", false) .option("numRows", 40) .start() ticketOutput.awaitTermination() } }
Example 168
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object ZombieExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)( (vertexId, zombieState, message) => { if (message > 0 && !zombieState.isZombie) { new ZombieStats(true, message) } else { zombieState } }, triplet => { if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) { Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l)) } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) { Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l)) } else { Iterator.empty } }, (a, b) => Math.min(a, b)) println("ZombieBite") zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",ZobmieStat:" + r._2) }) sparkSession.stop() } } case class ZombieStats (isZombie:Boolean, lengthOfLife:Long)
Example 169
Source File: TrianglesExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object TrianglesExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) println("TriangleCount") graph.triangleCount().vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",triangleCount:" + r._2) }) graph.pageRank(1.1, 1.1) sparkSession.stop() } }
Example 170
Source File: RegressionMetricsSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation import breeze.linalg.DenseVector import io.github.mandar2812.dynaml.graphics.charts.Highcharts._ import org.apache.log4j.{Priority, Logger} import org.apache.spark.Accumulator import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart} histogram(residuals, numBins = 20) title("Histogram of Regression Residuals") } } object RegressionMetricsSpark { def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long) : (Double, Double, Double, Double) = { val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean") val err:DenseVector[Double] = scoresAndLabels.map((sc) => { val diff = sc._1 - sc._2 mean += sc._2 val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1), 2) DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog) }).reduce((a,b) => a+b) val SS_res = err(1) val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value/size.toDouble) val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum() val rmse = math.sqrt(SS_res/size.toDouble) val mae = err(0)/size.toDouble val rsq = if(1/SS_tot != Double.NaN) 1 - (SS_res/SS_tot) else 0.0 val rmsle = err(2)/size.toDouble (mae, rmse, rsq, rmsle) } }
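computeKPIs above accumulates absolute error, squared error and squared log error in a single pass over (score, label) pairs, then derives MAE, RMSE, R-squared and RMSLE. The same quantities on a local collection, written out with the usual formulas (a plain-Scala sketch, not DynaML's API; it also takes the final square root for RMSLE, which the listing leaves out):

object RegressionKpiSketch {
  // scoresAndLabels: (predicted, actual) pairs.
  def kpis(scoresAndLabels: Seq[(Double, Double)]): (Double, Double, Double, Double) = {
    val n = scoresAndLabels.length.toDouble
    val meanLabel = scoresAndLabels.map(_._2).sum / n

    val mae   = scoresAndLabels.map { case (s, y) => math.abs(s - y) }.sum / n
    val ssRes = scoresAndLabels.map { case (s, y) => math.pow(s - y, 2) }.sum
    val ssTot = scoresAndLabels.map { case (_, y) => math.pow(y - meanLabel, 2) }.sum

    val rmse  = math.sqrt(ssRes / n)
    val rsq   = if (ssTot > 0.0) 1.0 - ssRes / ssTot else 0.0
    val rmsle = math.sqrt(scoresAndLabels.map { case (s, y) =>
      math.pow(math.log1p(math.abs(s)) - math.log1p(math.abs(y)), 2)
    }.sum / n)

    (mae, rmse, rsq, rmsle)
  }

  def main(args: Array[String]): Unit =
    println(kpis(Seq((1.1, 1.0), (1.9, 2.0), (3.2, 3.0))))
}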
Example 171
Source File: RejectionSamplingScheme.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability import breeze.stats.distributions.{Density, Rand} import io.github.mandar2812.dynaml.pipes._ import io.github.mandar2812.dynaml.probability.distributions.GenericDistribution import scala.util.Random import org.apache.log4j.Logger abstract class RejectionSamplingScheme[ ConditioningSet, Domain, Dist <: Density[ConditioningSet] with Rand[ConditioningSet], DistL <: Density[Domain] with Rand[Domain], JointDist <: Density[(ConditioningSet, Domain)] with Rand[(ConditioningSet, Domain)], Likelihood <: RandomVarWithDistr[Domain, DistL]]( p: RandomVarWithDistr[ConditioningSet, Dist], c: DataPipe[ConditioningSet, Likelihood]) extends RandomVarWithDistr[(ConditioningSet, Domain), JointDist] with BayesJointProbabilityScheme[ ConditioningSet, Domain, RandomVarWithDistr[ConditioningSet, Dist], Likelihood] { self => override val prior: RandomVarWithDistr[ConditioningSet, Dist] = p override val likelihood: DataPipe[ConditioningSet, Likelihood] = c var Max_Candidates: Int = 1000 var Max_Estimations: Int = 10000 override val sample = prior.sample > BifurcationPipe[ConditioningSet, ConditioningSet, Domain]( (c: ConditioningSet) => (c, likelihood(c).draw) ) override val posterior: DataPipe[Domain, RandomVariable[ConditioningSet]] = DataPipe((data: Domain) => { val sampl = this.prior.sample val q = this.prior.underlyingDist val M = (1 to Max_Estimations).map(i => { likelihood(sampl()).underlyingDist(data) }).sum/Max_Estimations.toDouble new RandomVarWithDistr[ConditioningSet, GenericDistribution[ConditioningSet]] { innerself => val logger = Logger.getLogger(this.getClass) override val sample: DataPipe[Unit, ConditioningSet] = DataPipe(() => { val iterations = 0 var accepted = false var accepted_sample: ConditioningSet = sampl() while(!accepted && iterations < Max_Candidates) { // generate a candidate val candidate = sampl() val a = underlyingDist(candidate)/(M*q(candidate)) if(Random.nextDouble() <= a) { logger.info("... Sample Accepted ...") accepted = true accepted_sample = candidate } } accepted_sample }) override val underlyingDist: GenericDistribution[ConditioningSet] = new GenericDistribution[ConditioningSet] { override def apply(x: ConditioningSet): Double = prior.underlyingDist(x)*likelihood(x).underlyingDist(data) override def draw() = innerself.sample() } } }) }
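The posterior pipe above is a rejection sampler: it estimates an envelope constant M by averaging the likelihood over prior draws, then repeatedly proposes a candidate from the prior and accepts it with probability target(candidate) / (M * proposal(candidate)) (note that the listing's iterations counter is a val that never increments, so in practice its loop only exits on acceptance). A self-contained sketch of plain rejection sampling; the target and proposal densities here are simple stand-ins, not the DynaML classes:

import scala.util.Random
import org.apache.log4j.Logger

object RejectionSamplingSketch {
  val logger = Logger.getLogger(getClass.getName)

  // Target density f(x) = 2x on [0, 1]; proposal q is uniform on [0, 1], so q(x) = 1.
  def target(x: Double): Double = 2.0 * x
  def proposal(): Double = Random.nextDouble()
  val M = 2.0                                   // envelope constant with f(x) <= M * q(x)

  @annotation.tailrec
  def draw(candidatesLeft: Int = 1000): Double = {
    val candidate = proposal()
    val acceptProb = target(candidate) / (M * 1.0)   // q(candidate) = 1 for the uniform proposal
    if (Random.nextDouble() <= acceptProb) candidate
    else if (candidatesLeft <= 1) { logger.warn("giving up, keeping the last proposal"); candidate }
    else draw(candidatesLeft - 1)
  }

  def main(args: Array[String]): Unit = {
    val samples = Seq.fill(10000)(draw())
    println(s"sample mean ~ ${samples.sum / samples.length} (expected 2/3 for f(x) = 2x)")
  }
}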
Example 172
Source File: MOGPRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp import breeze.linalg.{DenseMatrix, DenseVector} import io.github.mandar2812.dynaml.kernels.LocalScalarKernel import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2} import io.github.mandar2812.dynaml.probability.distributions.MatrixNormal import org.apache.log4j.Logger override def dataAsSeq(data: Stream[(I, DenseVector[Double])]): Seq[((I, Int), Double)] = data.map((patternAndLabel) => patternAndLabel._2.mapPairs((i, label) => ((patternAndLabel._1, i), label) ).toArray.toSeq).reduceLeft((s1, s2) => s1 ++ s2) } class KroneckerMOGPModel[I]( covFunc: LocalScalarKernel[I], noiseCovFunc: LocalScalarKernel[I], coRegCov: LocalScalarKernel[Int], data: Stream[(I, DenseVector[Double])], num: Int, numOutputs: Int, meanFunc: DataPipe[(I, Int), Double] = DataPipe((_: (I, Int)) => 0.0)) extends MOGPRegressionModel[I](covFunc:*coRegCov, noiseCovFunc:* coRegCov, data, num, numOutputs, meanFunc) { val (covFPipe, noiseCovPipe, coRegCovPipe) = (covFunc.asPipe, noiseCovFunc.asPipe, coRegCov.asPipe) override def energy(h: Map[String, Double], options: Map[String, String]): Double = { setState(h) val (features, targets) = data.unzip val covMatrix: DenseMatrix[Double] = covFunc .buildKernelMatrix(features, features.length) .getKernelMatrix() val noiseMatrix: DenseMatrix[Double] = noiseCovFunc .buildKernelMatrix(features, features.length) .getKernelMatrix() val colCovMatrix = coRegCov .buildKernelMatrix(0 until noutputs, noutputs) .getKernelMatrix() val meanMat: DenseMatrix[Double] = DenseMatrix.vertcat( features.map(instance => DenseVector.tabulate[Double](noutputs)(o => mean((instance, o))).asDenseMatrix):_* ) val mvn = MatrixNormal(meanMat, covMatrix+noiseMatrix, colCovMatrix) -mvn.logPdf(DenseMatrix.vertcat(targets.map(_.asDenseMatrix):_*)) } }
Example 173
Source File: GPBasisFuncRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp import breeze.linalg.{DenseMatrix, DenseVector, cholesky, trace, inv} import breeze.numerics.{log, sqrt} import io.github.mandar2812.dynaml.algebra._ import io.github.mandar2812.dynaml.analysis._ import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._ import io.github.mandar2812.dynaml.algebra.PartitionedMatrixSolvers._ import io.github.mandar2812.dynaml.kernels._ import io.github.mandar2812.dynaml.models.{ContinuousProcessModel, SecondOrderProcessModel} import io.github.mandar2812.dynaml.optimization.GloballyOptWithGrad import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2} import io.github.mandar2812.dynaml.probability.{MultGaussianPRV, MultGaussianRV} import org.apache.log4j.Logger import scala.reflect.ClassTag abstract class GPBasisFuncRegressionModel[T, I: ClassTag]( cov: LocalScalarKernel[I], n: LocalScalarKernel[I], data: T, num: Int, basisFunc: DataPipe[I, DenseVector[Double]], basis_param_prior: MultGaussianRV) extends AbstractGPRegressionModel[T, I]( cov, n, data, num) { val MultGaussianRV(b, covB) = basis_param_prior implicit val vf = VectorField(b.length) private lazy val lowB = cholesky(covB) private lazy val covBsolveb = lowB.t \ (lowB \ b) private lazy val h: PartitionedMatrix = PartitionedMatrix.horzcat(_blockSize)(trainingData.map(basisFunc(_)):_*) override val mean: DataPipe[I, Double] = basisFunc > DataPipe((h: DenseVector[Double]) => h.t * b) private val basisFeatureMap: DataPipe[I, DenseVector[Double]] = basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x) val feature_map_cov = CovarianceFunction(basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x)) override protected def getTrainKernelMatrix[U <: Seq[I]] = { SVMKernel.buildPartitionedKernelMatrix(trainingData, trainingData.length, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y) + noiseModel.evaluate(x, y)} ) } override protected def getCrossKernelMatrix[U <: Seq[I]](test: U) = SVMKernel.crossPartitonedKernelMatrix( trainingData, test, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)} ) override protected def getTestKernelMatrix[U <: Seq[I]](test: U) = SVMKernel.buildPartitionedKernelMatrix( test, test.length.toLong, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)} ) }
Example 174
Source File: QuasiNewtonOptimizer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.{DenseMatrix, DenseVector, inv} import io.github.mandar2812.dynaml.pipes.DataPipe import org.apache.log4j.Logger import spire.implicits._ override def optimize( nPoints: Long, ParamOutEdges: Stream[(DenseVector[Double], Double)], initialP: DenseVector[Double]): DenseVector[Double] = QuasiNewtonOptimizer.run( nPoints, this.regParam, this.numIterations, updater, gradient, this.stepSize, initialP, ParamOutEdges, DataPipe(identity[Stream[(DenseVector[Double], Double)]] _) ) } object QuasiNewtonOptimizer { private val logger = Logger.getLogger(this.getClass) def run[T]( nPoints: Long, regParam: Double, numIterations: Int, updater: HessianUpdater, gradient: Gradient, stepSize: Double, initial: DenseVector[Double], POutEdges: T, transform: DataPipe[T, Stream[(DenseVector[Double], Double)]], logging: Boolean = true, logging_rate: Int = 100): DenseVector[Double] = { var oldW: DenseVector[Double] = initial var newW = oldW val hessian = transform(POutEdges) .map(_._1) .map(x => DenseVector(x.toArray ++ Array(1.0))) .map(x => x*x.t) .reduce((x: DenseMatrix[Double], y: DenseMatrix[Double]) => x + y) var regInvHessian = inv(hessian + DenseMatrix.eye[Double](initial.length)*regParam) var oldCumGradient = DenseVector.zeros[Double](initial.length) println("Performing Quasi-Newton Optimization") cfor(1)(iter => iter < numIterations, iter => iter + 1)( iter => { val cumGradient: DenseVector[Double] = DenseVector.zeros(initial.length) var cumLoss: Double = 0.0 transform(POutEdges).foreach(ed => { val x = DenseVector(ed._1.toArray ++ Array(1.0)) val y = ed._2 cumLoss += gradient.compute(x, y, oldW, cumGradient) }) if(logging && iter % logging_rate == 0) RegularizedOptimizer.prettyPrint(iter, cumLoss/nPoints.toDouble) //Find the search direction p = inv(H)*grad(J) //perform update x_new = x + step*p val searchDirection = regInvHessian*cumGradient*(-1.0) newW = updater.compute(oldW, searchDirection, stepSize, iter, regParam)._1 regInvHessian = updater.hessianUpdate(regInvHessian, newW-oldW, cumGradient-oldCumGradient) oldW = newW oldCumGradient = cumGradient }) newW } }
Example 175
Source File: GradientDescentSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.DenseVector import org.apache.log4j.{Logger, Priority} import org.apache.spark.AccumulatorParam import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD override def optimize(nPoints: Long, ParamOutEdges: RDD[LabeledPoint], initialP: DenseVector[Double]) : DenseVector[Double] = GradientDescentSpark.runBatchSGD( nPoints, this.regParam, this.numIterations, this.updater, this.gradient, this.stepSize, initialP, ParamOutEdges, this.miniBatchFraction ) } object GradientDescentSpark { private val logger = Logger.getLogger(this.getClass) def runBatchSGD( nPoints: Long, regParam: Double, numIterations: Int, updater: Updater, gradient: Gradient, stepSize: Double, initial: DenseVector[Double], POutEdges: RDD[LabeledPoint], miniBatchFraction: Double): DenseVector[Double] = { var count = 1 var oldW: DenseVector[Double] = initial var newW = oldW val sc = POutEdges.context val gradb = sc.broadcast(gradient) logger.log(Priority.INFO, "Training model using SGD") while(count <= numIterations) { val cumGradient = sc.accumulator(DenseVector.zeros[Double](initial.length))(new VectorAccumulator()) val wb = sc.broadcast(oldW) POutEdges sample(withReplacement = false, fraction = miniBatchFraction) foreach ((ed) => { val features = DenseVector(ed.features.toArray) val label = ed.label val (g, _) = gradb.value.compute(features, label, wb.value) cumGradient += g }) newW = updater.compute(oldW, cumGradient.value / nPoints.toDouble, stepSize, count, regParam)._1 oldW = newW count += 1 } newW } } class VectorAccumulator extends AccumulatorParam[DenseVector[Double]] { override def addInPlace(r1: DenseVector[Double], r2: DenseVector[Double]): DenseVector[Double] = r1 + r2 override def zero(initialValue: DenseVector[Double]): DenseVector[Double] = DenseVector.zeros(initialValue.length) }
Example 176
Source File: GradBasedGlobalOptimizer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.{DenseVector, norm} import org.apache.log4j.Logger class GradBasedGlobalOptimizer[M <: GloballyOptWithGrad](model: M) extends GlobalOptimizer[M] { override val system: M = model override protected val logger: Logger = Logger.getLogger(this.getClass) override def optimize(initialConfig: Map[String, Double], options: Map[String, String] = Map("tolerance" -> "0.0001", "step" -> "0.005", "maxIterations" -> "50")) : (M, Map[String, Double]) = { logger.info("Starting Maximum Likelihood based optimization: ML-II") println( "-----------------------------------------------------"+ "-----------------------------------------------------") //Carry out gradient descent with step size alpha and //for a specified number of maximum iterations val tolerance = options("tolerance").toDouble val alpha = options("step").toDouble val maxit = options("maxIterations").toInt var count = 1 var gradNorm = 1.0 var working_solution = initialConfig logger.info("Starting state: \n"+GlobalOptimizer.prettyPrint(working_solution)) do { val gradient = system.gradEnergy(working_solution) println( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") print("\n") logger.info("Gradient at "+count+" iterations is: \n"+GlobalOptimizer.prettyPrint(gradient)) print("\n") gradNorm = norm(DenseVector(gradient.values.toArray), 2) working_solution = working_solution.zip(gradient).map((confAndGrad) => { val hyp = confAndGrad._1._1 val gr:Double = if(confAndGrad._2._2 == Double.PositiveInfinity){ 1.0 } else if(confAndGrad._2._2 == Double.NegativeInfinity){ -1.0 } else if(confAndGrad._2._2 == Double.NaN){ 1.0 } else { confAndGrad._2._2 } val newValue = math.abs(confAndGrad._1._2 - alpha*gr) (hyp, newValue) }) logger.info("Updated state : \n"+GlobalOptimizer.prettyPrint(working_solution)) print("\n") println( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") count += 1 } while (count < maxit && gradNorm >= tolerance) logger.info("Stopped ML-II at "+count+" iterations") logger.info("Final state : \n"+GlobalOptimizer.prettyPrint(working_solution)) //Persist the current configuration to the model memory if(options.contains("persist") && (options("persist") == "true" || options("persist") == "1")) system.persist(working_solution) (system, working_solution) } }
Example 177
Source File: SVMKernelMatrix.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.kernels import breeze.linalg.{DenseMatrix, DenseVector, eig, max, min} import org.apache.log4j.{Logger, Priority} override def eigenDecomposition(dimensions: Int = this.dimension.toInt): (DenseVector[Double], DenseMatrix[Double]) = { logger.log(Priority.INFO, "Eigenvalue decomposition of the kernel matrix using JBlas.") val decomp = eig(this.kernel) logger.log(Priority.INFO, "Eigenvalue stats: " +min(decomp.eigenvalues) +" =< lambda =< " +max(decomp.eigenvalues) ) (decomp.eigenvalues, decomp.eigenvectors) } }
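The SVMKernelMatrix snippet above logs through the log4j 1.x Priority API. Because Level extends Priority, the same call can be written with Level, which is the form the rest of these examples use. A minimal standalone sketch of the two equivalent call styles (the object name is illustrative, not part of DynaML):

import org.apache.log4j.{Level, Logger}

object PriorityVsLevelSketch {
  private val logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    // Same call shape as the example above, but passing a Level instead of a Priority:
    logger.log(Level.INFO, "Eigenvalue decomposition of the kernel matrix.")
    // The shorthand used by most examples in this collection:
    logger.info("Eigenvalue decomposition of the kernel matrix.")
  }
}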
Example 178
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum }
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    // 1. Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2. Load the training data
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))

    // 3. Set the training options and train the CNN model
    // opts: training options passed to CNNtrain
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4. Test the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"CNNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }
}
Example 179
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
Example 180
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
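MnistExample silences Spark's own chatter by targeting the "org" and "akka" logger hierarchies, while other examples in this collection raise the threshold on the root logger instead. A minimal sketch contrasting the two approaches (the object name is illustrative):

import org.apache.log4j.{BasicConfigurator, Level, Logger}

object QuietSparkLoggingSketch {
  def main(args: Array[String]): Unit = {
    BasicConfigurator.configure() // simple console appender so messages are visible

    // Option 1: raise the threshold globally via the root logger.
    Logger.getRootLogger.setLevel(Level.WARN)

    // Option 2: silence only the Spark ("org") and Akka ("akka") hierarchies.
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    // Application logging can still be enabled explicitly on its own logger.
    val log = Logger.getLogger(getClass)
    log.setLevel(Level.INFO)
    log.info("application message")
  }
}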
Example 181
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 182
Source File: MannWhitneyUTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import org.apache.commons.math3.stat.inference.MannWhitneyUTest import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} object MannWhitneyUTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { testMannWhitneyU testMannWhitneyUTest } private def testMannWhitneyU(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyU(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyU(rdd1, rdd2) assert(result == result2) } private def testMannWhitneyUTest(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyUTest(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyUTest(rdd1, rdd2) println(result) println(result2) assert(result == result2) } }
Example 183
Source File: TTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} object TTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneSampleTTest twoIndependentSampleTTest pairedTwoSampleTTest } def OneSampleTTest(): Unit ={ val observed = Array(100d, 200d, 300d, 400d) val mu = 2.5d assert(TestUtils.tTest(mu, observed, 0.05) == new OneSampleTTest().tTest(mu, sc.parallelize(observed), 0.05)) assert(TestUtils.tTest(mu, observed) == new OneSampleTTest().tTest(mu, sc.parallelize(observed))) } def twoIndependentSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 205d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.tTest(sample1, sample2, 0.05) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.tTest(sample1, sample2) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2)) } def pairedTwoSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 202d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.pairedTTest(sample1, sample2, 0.05) == new PairTwoSampleTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.pairedTTest(sample1, sample2) == new PairTwoSampleTTest().tTest(rdd1, rdd2)) } }
Example 184
Source File: ANOVASuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import java.util import main.ANOVA.OneWayANOVA import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.stat.OneSampleTTest import org.apache.spark.{SparkContext, SparkConf} object ANOVASuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneWayANOVA } def OneWayANOVA(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 200d, 300d, 400d) val sample3 = Array(102d, 200d, 300d, 400d) val data = new util.ArrayList[Array[Double]]() data.add(sample1) data.add(sample2) data.add(sample3) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val rdd3 = sc.parallelize(sample3) val rddData = Seq(rdd1, rdd2, rdd3) assert(TestUtils.oneWayAnovaFValue(data) == new OneWayANOVA().anovaFValue(rddData)) assert(TestUtils.oneWayAnovaPValue(data) == new OneWayANOVA().anovaPValue(rddData)) } }
Example 185
Source File: ProxyPlugin.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.plugin

import java.io.File

import com.brsanthu.googleanalytics.GoogleAnalytics
import com.testerhome.appcrawler.URIElement
import com.testerhome.appcrawler.Plugin
import net.lightbody.bmp.BrowserMobProxyServer
import net.lightbody.bmp.proxy.CaptureType
import org.apache.log4j.{BasicConfigurator, Level, Logger}

import scala.util.Try

class ProxyPlugin extends Plugin {
  private var proxy: BrowserMobProxyServer = _
  val port = 7777

  // TODO: proxy support
  override def start(): Unit = {
    BasicConfigurator.configure()
    Logger.getRootLogger.setLevel(Level.INFO)
    Logger.getLogger("ProxyServer").setLevel(Level.WARN)

    proxy = new BrowserMobProxyServer()
    proxy.setHarCaptureTypes(CaptureType.getNonBinaryContentCaptureTypes)
    proxy.setTrustAllServers(true)
    proxy.start(port)
    //proxy.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes)
    //proxy.setHarCaptureTypes(CaptureType.getHeaderCaptureTypes)
    log.info(s"proxy server listen on ${port}")
    proxy.newHar("start")
  }

  override def beforeElementAction(element: URIElement): Unit = {
    log.info("clear har")
    proxy.endHar()
    // create a new har
    val harFileName = getCrawler().getBasePathName() + ".har"
    proxy.newHar(harFileName)
  }

  override def afterElementAction(element: URIElement): Unit = {
    log.info("save har")
    val harFileName = getCrawler().getBasePathName() + ".har"
    val file = new File(harFileName)
    try {
      log.info(proxy.getHar)
      log.info(proxy.getHar.getLog)
      log.info(proxy.getHar.getLog.getEntries.size())
      log.info(s"har entry size = ${proxy.getHar.getLog.getEntries.size()}")
      if (proxy.getHar.getLog.getEntries.size() > 0) {
        proxy.getHar.writeTo(file)
      }
    } catch {
      case e: Exception => {
        log.error("read har error")
        log.error(e.getCause)
        log.error(e.getMessage)
        e.getStackTrace.foreach(log.error)
      }
    }
  }

  override def stop(): Unit = {
    log.info("proxy stop")
    proxy.stop()
  }
}
Example 186
Source File: TestGA.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.ut import com.brsanthu.googleanalytics.{GoogleAnalytics, PageViewHit} import org.apache.log4j.{BasicConfigurator, Level, Logger} import org.scalatest.FunSuite class TestGA extends FunSuite{ test("google analyse"){ println("ga start") BasicConfigurator.configure() Logger.getRootLogger().setLevel(Level.WARN) val ga = new GoogleAnalytics("UA-74406102-1") 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/demo${x}", "test")) }) Thread.sleep(10000) 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem1${x}", "test")) }) Thread.sleep(10000) 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem2${x}", "test")) }) //ga.post(new PageViewHit("http://appcrawler.io/test2", "test")) println("ga end") } }
Example 187
Source File: TestMacaca.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.it import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.apache.log4j.Logger import com.alibaba.fastjson.JSONObject import macaca.client.MacacaClient class TestMacaca extends FunSuite with BeforeAndAfterAll{ val driver=new MacacaClient() override def beforeAll(): Unit = { val porps = new JSONObject() porps.put("autoAcceptAlerts", true) porps.put("browserName", "") porps.put("platformName", "android") porps.put("package", "com.gotokeep.keep") porps.put("activity", ".activity.SplashActivity") porps.put("reuse", 3) val desiredCapabilities = new JSONObject() desiredCapabilities.put("desiredCapabilities", porps) driver.initDriver(desiredCapabilities) } test("macaca android"){ println(driver.source()) } test("macaca chrome"){ val porps = new JSONObject() porps.put("autoAcceptAlerts", true) porps.put("browserName", "Chrome") porps.put("platformName", "desktop") // android or ios porps.put("javascriptEnabled", true) porps.put("platform", "ANY") val desiredCapabilities = new JSONObject() desiredCapabilities.put("desiredCapabilities", porps) driver.initDriver(desiredCapabilities) driver.get("http://www.baidu.com/") } }
Example 188
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 189
Source File: ZeroMQWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming.zeromq import scala.language.implicitConversions import scala.util.Random import org.apache.log4j.{Level, Logger} import org.zeromq.ZContext import org.zeromq.ZMQ import org.zeromq.ZMQException import org.zeromq.ZMsg import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.zeromq.ZeroMQUtils object ZeroMQWordCount { def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>") // scalastyle:on println System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath). Logger.getRootLogger.setLevel(Level.WARN) val Seq(url, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("ZeroMQWordCount") // Check Spark configuration for master URL, set it to local if not present. if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } // Create the context and set the batch size. val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = ZeroMQUtils.createTextStream( ssc, url, true, Seq(topic.getBytes) ) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 190
Source File: TwitterLocations.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import twitter4j.FilterQuery import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ object TwitterLocations { def main(args: Array[String]) { if (args.length < 4 || args.length % 4 != 0) { System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " + "<access token> <access token secret> " + "[<latitude-south-west> <longitude-south-west>" + " <latitude-north-east> <longitude-north-east> ...]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) // Get bounding boxes of locations for which to retrieve Tweets from command line val locationArgs = args.takeRight(args.length - 4) val boundingBoxes = if (locationArgs.length == 0) { System.out.println("No location bounding boxes specified, using defaults for New York City") val nycSouthWest = Array(-74.0, 40.0) val nycNorthEast = Array(-73.0, 41.0) Array(nycSouthWest, nycNorthEast) } else { locationArgs.map(_.toDouble).sliding(2, 2).toArray } val sparkConf = new SparkConf().setAppName("TwitterLocations") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val locationsQuery = new FilterQuery().locations(boundingBoxes : _*) // Print Tweets from the specified coordinates // This includes Tweets geo-tagged in the bounding box defined by the coordinates // As well as Tweets tagged in places inside of the bounding box TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery)) .map(tweet => { val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}") val place = Option(tweet.getPlace).map(_.getName) val location = latitude.getOrElse(place.getOrElse("(no location)")) val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') s"$location\t$text" }) .print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 191
Source File: TwitterAlgebirdHLL.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._

// scalastyle:off
object TwitterAlgebirdHLL {
  def main(args: Array[String]) {

    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll.create(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 192
Source File: TwitterPopularTags.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf object TwitterPopularTags { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " + "<access token> <access token secret> [<filters>]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) val filters = args.takeRight(args.length - 4) // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) val sparkConf = new SparkConf().setAppName("TwitterPopularTags") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val stream = TwitterUtils.createStream(ssc, None, filters) val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) // Print popular hashtags topCounts60.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) topCounts10.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 193
Source File: TrainNewsClassWithDTDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object TrainNewsClassWithDTDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with DT Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the decision tree model
    val params = new ClassParams
    val dtClassifier = new DecisionTreeClassifier()
      .setMinInfoGain(params.minInfoGain)
      .setMaxDepth(params.maxDepth) // Spark currently supports a maximum depth of 30
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed labels back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(dtClassifier.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.DTModelPath)

    data.unpersist()
    spark.stop()
  }
}
Example 194
Source File: PredictNewsClassDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}

object PredictNewsClassDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")
    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams
    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("Unsupported model type!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // Load the model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    // === Model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map {
      case Row(prediction: Double, label: Double) => (prediction, label)
    }

    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= Evaluation results ==========")
    println(s"\nWeighted precision: $precision")
    println(s"Weighted recall: $recall")
    println(s"F1 score: $f1")

    // predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()
    spark.stop()
  }
}
Example 195
Source File: TrainNewsClassWithLRDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object TrainNewsClassWithLRDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with LR Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the logistic regression model
    val params = new ClassParams
    val logisticRegression = new LogisticRegression()
      .setTol(params.converTol)
      .setMaxIter(params.maxIteration)
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed labels back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(logisticRegression.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.LRModelPath)

    data.unpersist()
    spark.stop()
  }
}
Example 196
Source File: StarsAnalysisDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object StarsAnalysisDemo {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"

    // Load the data, keep the year and content fields, and clean up the content field
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }
        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop terms of length 1, and keep the part-of-speech tag of each term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map {
      case Row(year: String, terms: Seq[String]) => (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Count the stars mentioned most often in 2016 news
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))
        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()
    stars2016.take(100).foreach(println)
    spark.stop()
  }
}
Example 197
Source File: NLPPreprocessTest.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package nlp

import com.hankcs.hanlp.utility.Predefine
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.junit.Test

import scala.reflect.io.File

class NLPPreprocessTest {

  @Test
  def testSegmenter(): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Segment Demo")
      .getOrCreate()

    val text = Seq(
      (0, "这段文本是用来做分词测试的!This text is for test!"),
      (1, "江州市长江大桥参加长江大桥通车仪式"),
      (2, "他邀请了不少于10个明星,有:范冰冰、赵薇、周杰伦等,还有20几位商业大佬")
    )
    val sentenceData = spark.createDataFrame(text).toDF("id", "sentence")

    // Set the HanLP configuration file path; by default it is located on the classpath
    val path = this.getClass.getClassLoader.getResource("").getPath
    Predefine.HANLP_PROPERTIES_PATH = path + File.separator + "hanlp.properties"

    val segmenter = new Segmenter()
      .isDelEn(true)
      .isDelNum(true)
      .isAddNature(true)
      .setSegType("StandardSegment")
      .setMinTermLen(2)
      .setMinTermNum(3)
      .setInputCol("sentence")
      .setOutputCol("segmented")

    segmenter.transform(sentenceData).show(false)

    spark.stop()
  }
}
Example 198
Source File: Schema.scala From osmesa with Apache License 2.0 | 5 votes |
package osmesa.analytics.updater import java.sql.Timestamp import java.time.Instant import geotrellis.vectortile.Layer import org.apache.log4j.Logger import osmesa.analytics.updater.Implicits._ trait Schema { val layer: Layer val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)] val newFeatures: Seq[VTFeature] lazy val replacementFeatures: Seq[VTFeature] = Seq.empty[VTFeature] lazy val retainedFeatures: Seq[VTFeature] = Seq.empty[VTFeature] protected lazy val logger: Logger = Logger.getLogger(getClass) protected lazy val touchedFeatures: Map[String, Seq[VTFeature]] = Map.empty[String, Seq[VTFeature]] protected lazy val versionInfo: Map[String, (Int, Int, Timestamp)] = touchedFeatures .mapValues(_.last) .mapValues( f => ( f.data("__version").toInt, f.data("__minorVersion").toInt, Timestamp.from(Instant.ofEpochMilli(f.data("__updated"))) )) protected lazy val minorVersions: Map[String, Int] = features .mapValues { case (_, curr) => curr.data } .map { case (id, f) => versionInfo.get(id) match { case Some((prevVersion, _, _)) if prevVersion < f.version => (id, 0) case Some((prevVersion, prevMinorVersion, _)) if prevVersion == f.version => (id, prevMinorVersion + 1) case _ => (id, 0) } } } trait SchemaBuilder { val layerName: String def apply(layer: Layer, features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]): Schema }
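The Schema trait above obtains its logger lazily from getClass, so every concrete schema logs under its own class name rather than the trait's. A minimal standalone sketch of that pattern (LoggingSchema and WaySchema are illustrative names, not part of the osmesa codebase):

import org.apache.log4j.{BasicConfigurator, Logger}

trait LoggingSchema {
  // Resolved against the runtime class, so each subclass gets its own logger name.
  protected lazy val logger: Logger = Logger.getLogger(getClass)
}

class WaySchema extends LoggingSchema {
  def build(): Unit = logger.info("log lines are attributed to WaySchema, not LoggingSchema")
}

object LoggingSchemaSketch {
  def main(args: Array[String]): Unit = {
    BasicConfigurator.configure() // simple console appender for the sketch
    new WaySchema().build()
  }
}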
Example 199
Source File: printMatrix.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}
import breeze.linalg.{DenseMatrix => BDM, kron}

object printMatrix {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if (v > 200) 1.0 else 0)))(0)))

    val lines2 = sc.textFile("dataset/train.format", 8)
    val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0)))

    data2.take(10).foreach(record => {
      println("label: " + record._1)
      val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt))
      // Render the 28x28 digit as ASCII art: '.' for background (0) pixels, '*' for foreground (1) pixels.
      val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*')
      println(str)
    })
  }
}
Example 200
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }