org.apache.log4j.Level Scala Examples

The following examples show how to use org.apache.log4j.Level. Vote up the examples you find useful or vote down the ones you don't, and follow the link above each example to the original project or source file.
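Most of the examples below follow the same pattern: obtain a Logger (either the root logger or one for a specific namespace) and assign it a Level before Spark starts producing output. A minimal, self-contained sketch of that pattern (the object name is illustrative, not taken from any example below):

import org.apache.log4j.{Level, Logger}

object LoggingSetup {  // illustrative name
  def quietSpark(): Unit = {
    // Lower the root logger to WARN so INFO-level chatter is suppressed.
    Logger.getRootLogger.setLevel(Level.WARN)
    // Or target specific namespaces, as several of the examples below do.
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)
  }
}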
Example 1
Source File: DenseKMeans.scala    From drizzle-spark    with Apache License 2.0    6 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: LinearRegression.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils

      // NOTE: this excerpt omits the Params definition and most of the
      // OptionParser setup; it resumes mid-way through the parser's usage note.
spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 3
Source File: BinaryClassification.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

      // NOTE: this excerpt omits the Params definition and most of the
      // OptionParser setup; it resumes mid-way through the parser's usage note.
spark-examples-*.jar \
          |  --algorithm LR --regType L2 --regParam 1.0 \
          |  data/mllib/sample_binary_classification_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"BinaryClassification with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val model = params.algorithm match {
      case LR =>
        val algorithm = new LogisticRegressionWithLBFGS()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
      case SVM =>
        val algorithm = new SVMWithSGD()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setStepSize(params.stepSize)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
    }

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val metrics = new BinaryClassificationMetrics(predictionAndLabel)

    println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.")
    println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 4
Source File: SparseNaiveBayes.scala    From drizzle-spark    with Apache License 2.0    5 votes
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache the examples because they are used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 5
Source File: StreamingExamples.scala    From drizzle-spark    with Apache License 2.0    5 votes
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
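In the Spark streaming examples this helper is typically called once at the start of main, before the StreamingContext is created, so that a user-supplied log4j configuration is respected. A minimal sketch of such a call site (the app name and batch interval are illustrative, not taken from a specific example):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyStreamingApp {  // hypothetical example app
  def main(args: Array[String]): Unit = {
    // Quiet the logs unless the user has already configured log4j.
    StreamingExamples.setStreamingLogLevels()
    val conf = new SparkConf().setAppName("MyStreamingApp")
    val ssc = new StreamingContext(conf, Seconds(1))
    // ... define input streams and transformations here ...
    ssc.start()
    ssc.awaitTermination()
  }
}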
Example 6
Source File: YarnScheduler.scala    From drizzle-spark    with Apache License 2.0    5 votes
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 7
Source File: ClientArguments.scala    From drizzle-spark    with Apache License 2.0    5 votes
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level

import org.apache.spark.util.{IntParam, MemoryParam, Utils}


private[deploy] class ClientArguments(args: Array[String]) {
  import ClientArguments._

  var logLevel = Level.WARN

  // (the remaining command-line parsing members are not shown in this excerpt)

  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
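A quick illustration of what isValidJarUrl accepts: the URI must carry a scheme and a path that ends in ".jar". The values below are made up for illustration; ClientArguments is the object defined above.

object JarUrlCheck {  // illustrative harness
  def main(args: Array[String]): Unit = {
    println(ClientArguments.isValidJarUrl("hdfs://namenode:8020/jars/app.jar"))  // true: scheme + .jar path
    println(ClientArguments.isValidJarUrl("file:///tmp/app.jar"))                // true
    println(ClientArguments.isValidJarUrl("/tmp/app.jar"))                       // false: no URI scheme
    println(ClientArguments.isValidJarUrl("http://host/app.zip"))                // false: path does not end in .jar
  }
}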
Example 8
Source File: DLClassifierLeNet.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.MLPipeline

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _}
import com.intel.analytics.bigdl.dlframes.DLClassifier
import com.intel.analytics.bigdl.models.lenet.LeNet5
import com.intel.analytics.bigdl.models.lenet.Utils._
import com.intel.analytics.bigdl.nn.ClassNLLCriterion
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext


object DLClassifierLeNet {

  LoggerFilter.redirectSparkInfoLogs()

  def main(args: Array[String]): Unit = {
    val inputs = Array[String]("Feature data", "Label data")
    trainParser.parse(args, new TrainParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("MLPipeline Example")
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      val sqLContext = SQLContext.getOrCreate(sc)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val trainSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1)

      val trainingRDD : RDD[Data[Float]] = trainSet.
        asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => {
          val feature = batch.getInput().asInstanceOf[Tensor[Float]]
          val label = batch.getTarget().asInstanceOf[Tensor[Float]]
          Data[Float](feature.storage().array(), label.storage().array())
        })
      val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*)

      val model = LeNet5(classNum = 10)
      val criterion = ClassNLLCriterion[Float]()
      val featureSize = Array(28, 28)
      val estimator = new DLClassifier[Float](model, criterion, featureSize)
        .setFeaturesCol(inputs(0))
        .setLabelCol(inputs(1))
        .setBatchSize(param.batchSize)
        .setMaxEpoch(param.maxEpoch)
      val transformer = estimator.fit(trainingDF)

      val validationSet = DataSet.array(load(validationData, validationLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1)

      val validationRDD: RDD[Data[Float]] = validationSet.
        asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch =>
          val feature = batch.getInput().asInstanceOf[Tensor[Float]]
          val label = batch.getTarget().asInstanceOf[Tensor[Float]]
          Data[Float](feature.storage().array(), label.storage().array())
        }
      val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*)
      val transformed = transformer.transform(validationDF)
      transformed.show()
      sc.stop()
    })
  }
}

private case class Data[T](featureData : Array[T], labelData : Array[T]) 
Example 9
Source File: ImagePredictor.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.imageclassification

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.dlframes.DLClassifierModel
import com.intel.analytics.bigdl.example.imageclassification.MlUtils._
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext


object ImagePredictor {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO)

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).map(param => {
      val conf = Engine.createSparkConf()
      conf.setAppName("Predict with trained model")
      val sc = new SparkContext(conf)
      Engine.init
      val sqlContext = new SQLContext(sc)

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val model = loadModel(param)
      val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize))
        .setBatchSize(param.batchSize)
        .setFeaturesCol("features")
        .setPredictionCol("predict")

      val valRDD = if (param.isHdfs) {
        // load image set from hdfs
        imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true)
      } else {
        // load image set from local
        val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false)
        sc.parallelize(imagesLoad(paths, 256), partitionNum)
      }

      val transf = RowToByteRecords() ->
          BytesToBGRImg() ->
          BGRImgCropper(imageSize, imageSize) ->
          BGRImgNormalizer(testMean, testStd) ->
          BGRImgToImageVector()

      val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf)

      valTrans.transform(valDF)
          .select("imageName", "predict")
          .collect()
          .take(param.showNum)
          .foreach(println)
      sc.stop()
    })
  }
} 
Example 10
Source File: ImageNetInference.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext


object ImageNetInference {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger: Logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Test model on ImageNet2012 with Int8")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)

      val model = Module.loadModule[Float](param.model).quantize()
      model.evaluate()

      val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float],
        new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
} 
Example 11
Source File: GenerateInt8Scales.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch}
import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.{Graph, Module}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


object GenerateInt8Scales {
  val logger: Logger = Logger.getLogger(getClass)
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def genereateInt8Scales(model: Graph[Float], modelName: String,
    evaluationSet: RDD[MiniBatch[Float]]): Unit = {
    model.evaluate()

    model.setInputDimMask(0, true)
    model.setOutputDimMask(0, true)
    model.setWeightDimMask(1, true)

    logger.info(s"Generate the scales for $modelName ...")
    val samples = evaluationSet
      .repartition(1) // repartitioning (a shuffle) gives better accuracy for the generated scales
      .take(1) // take only one batch as the calibration sample
      .map(_.getInput().toTensor[Float])

    samples.foreach { sample =>
      model.forward(sample)
      model.calcScales(sample)
    }

    // clear cached state, such as the module's output
    model.clearState()

    logger.info(s"Generate the scales for $modelName done.")
  }

  def saveQuantizedModel(model: Graph[Float], modelName: String): Unit = {
    val suffix = ".bigdl"
    val prefix = modelName.stripSuffix(suffix)
    val name = prefix.concat(".quantized").concat(suffix)
    logger.info(s"Save the quantized model $name ...")
    // this force-overwrites any existing model file
    model.saveModule(name, overWrite = true)
    logger.info(s"Save the quantized model $name done.")
  }

  def main(args: Array[String]): Unit = {
    genInt8ScalesParser.parse(args, GenInt8ScalesParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Quantize the model")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber()
      val imageFrame = DataSet.SeqFileFolder.filesToImageFrame(param.folder, sc, 1000,
        partitionNum = Option(partitionNum))

      // the transformer is the same as the one used for validation during training
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)
      // Currently only the graph model is supported, so we convert with `toGraph`;
      // if the model is already a graph, the conversion is unnecessary.
      val model = Module.loadModule[Float](param.model).toGraph()
      genereateInt8Scales(model, param.model, evaluationSet)
      saveQuantizedModel(model, param.model)
    }
  }
} 
Example 12
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.{DataSet, SampleToBatch}
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val evaluationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) ->
        GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToSample() -> SampleToBatch(
        batchSize = param.batchSize, None, None, None,
        partitionNum = Some(1))

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet.toLocal(),
        Array(new Top1Accuracy[Float].asInstanceOf[ValidationMethod[Float]]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
    }
  }
} 
Example 13
Source File: Predict.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.dataset.Sample
import com.intel.analytics.bigdl.optim.LocalPredictor
import org.apache.log4j.{Level, Logger}

import scala.collection.mutable.ArrayBuffer

object Predict {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).foreach { param =>

      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", (param.coreNumber.toString))
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val rawData = load(validationData, validationLabel)
      val iter = rawData.iterator
      val sampleIter = GreyImgToSample()(
          GreyImgNormalizer(trainMean, trainStd)(
          BytesToGreyImg(28, 28)(iter)))
      var samplesBuffer = ArrayBuffer[Sample[Float]]()
      while (sampleIter.hasNext) {
        val elem = sampleIter.next().clone()
        samplesBuffer += elem
      }
      val samples = samplesBuffer.toArray

      val model = Module.load[Float](param.model)
      val localPredictor = LocalPredictor(model)
      val result = localPredictor.predict(samples)
      val result_class = localPredictor.predictClass(samples)
      result_class.foreach(r => println(s"${r}"))
    }
  }
} 
Example 14
Source File: Train.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
import com.intel.analytics.bigdl.models.lenet.LeNet5
import org.apache.log4j.{Level, Logger}


object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {

      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        LeNet5(classNum = 10)
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate,
          learningRateDecay = param.learningRateDecay)
      }

      val trainSet = DataSet.array(load(trainData, trainLabel)) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(
        param.batchSize)

      val optimizer = Optimizer(
        model = model,
        dataset = trainSet,
        criterion = ClassNLLCriterion[Float]())
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      if(param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      val validationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(
        param.batchSize)

      optimizer
        .setValidation(
          trigger = Trigger.everyEpoch,
          dataset = validationSet,
          vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()
    })
  }
} 
Example 15
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.Module
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.models.resnet.Utils._
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod, ValidationResult}
import com.intel.analytics.bigdl.dataset.image.{BGRImgNormalizer, BGRImgToSample, BytesToBGRImg}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test ResNet on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)

      Engine.init
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()

      val rddData = sc.parallelize(loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() -> BGRImgNormalizer(Cifar10DataSet.trainMean,
          Cifar10DataSet.trainStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      println(model)
      val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]),
        Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
} 
Example 16
Source File: TestImageNet.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.CropCenter
import com.intel.analytics.bigdl.models.resnet.ResNet.DatasetType
import com.intel.analytics.bigdl.nn.{Module, StaticGraph}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.transform.vision.image.{ImageFeature, MTImageFeatureToBatch, MatToTensor, PixelBytesToMat}
import com.intel.analytics.bigdl.transform.vision.image.augmentation.{ChannelScaledNormalizer, RandomCropper, RandomResize}
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext


object TestImageNet {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)
  val logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Test model on ImageNet2012")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val model = Module.loadModule[Float](param.model)
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder,
        sc, 224, param.batchSize).toDistributed().data(train = false)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
} 
Example 17
Source File: TrainCIFAR10.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Module}
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.models.resnet.ResNet.{DatasetType, ShortcutType}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._

object TrainCIFAR10 {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def cifar10Decay(epoch: Int): Double =
    if (epoch >= 122) 2.0 else if (epoch >= 81) 1.0 else 0.0

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train ResNet on Cifar10")
        // Will throw an exception without this config when there is only one executor
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val batchSize = param.batchSize
      val (imageSize, lrSchedule, maxEpoch, dataSet) =
        (32, DatasetType.CIFAR10, param.nepochs, Cifar10DataSet)

      val trainDataSet = dataSet.trainDataSet(param.folder, sc, imageSize, batchSize)

      val validateSet = dataSet.valDataSet(param.folder, sc, imageSize, batchSize)

      val shortcut: ShortcutType = param.shortcutType match {
        case "A" => ShortcutType.A
        case "B" => ShortcutType.B
        case _ => ShortcutType.C
      }

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        val curModel = if (param.graphModel) {
          ResNet.graph(param.classes,
            T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet))
        } else {
          ResNet(param.classes,
            T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet))
        }
        if (param.optnet) {
          ResNet.shareGradInput(curModel)
        }
        ResNet.modelInit(curModel)
        curModel
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = param.momentum, dampening = param.dampening,
          nesterov = param.nesterov, learningRateSchedule = SGD.EpochDecay(cifar10Decay))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new CrossEntropyCriterion[Float]()
      )
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      optimizer
        .setOptimMethod(optimMethod)
        .setValidation(Trigger.everyEpoch,
          validateSet, Array(new Top1Accuracy[Float]))
        .setEndWhen(Trigger.maxEpoch(maxEpoch))
        .optimize()
      sc.stop()

    })
  }
} 
Example 18
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.inception

import com.intel.analytics.bigdl.dataset.{ByteRecord, DataSet}
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Top5Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.hadoop.io.Text
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Options._

  val imageSize = 224

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val batchSize = param.batchSize.getOrElse(128)
      val conf = Engine.createSparkConf().setAppName("Test Inception on ImageNet")
      val sc = new SparkContext(conf)
      Engine.init

      // Set the partition number to nodeNumber * coreNumber; other values can also be used.
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rawData = sc.sequenceFile(param.folder, classOf[Text], classOf[Text], partitionNum)
        .map(image => {
          ByteRecord(image._2.copyBytes(), DataSet.SeqFileFolder.readLabel(image._1).toFloat)
        }).coalesce(partitionNum, true)

      val rddData = DataSet.SeqFileFolder.filesToRdd(param.folder, sc, 1000)
      val transformer = BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize, CropCenter) ->
        HFlip(0.5) -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]), param.batchSize)

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
Example 19
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.lenet

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.Top1Accuracy
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Lenet on MNIST")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(load(validationData, validationLabel), partitionNum)
      val transformer =
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
Example 20
Source File: Train.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.lenet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, CrossEntropyCriterion, Module}
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Train Lenet on MNIST")
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val trainData = param.folder + "/train-images-idx3-ubyte"
      val trainLabel = param.folder + "/train-labels-idx1-ubyte"
      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) {
          LeNet5.graph(classNum = 10)
        } else {
          Engine.getEngineType() match {
            case MklBlas => LeNet5(10)
            case MklDnn => LeNet5.dnnGraph(param.batchSize / Engine.nodeNumber(), 10)
          }
        }
      }
      val criterion = Engine.getEngineType() match {
        case MklBlas => ClassNLLCriterion()
        case MklDnn => CrossEntropyCriterion()
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate,
          learningRateDecay = param.learningRateDecay)
      }

      val trainSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(
        param.batchSize)

      val optimizer = Optimizer(
        model = model,
        dataset = trainSet,
        criterion = criterion)
      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      if(param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      val validationSet = DataSet.array(load(validationData, validationLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(
        param.batchSize)

      optimizer
        .setValidation(
          trigger = Trigger.everyEpoch,
          dataset = validationSet,
          vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
} 
Example 21
Source File: Train.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.autoencoder

import java.nio.file.Paths

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch, Transformer}
import com.intel.analytics.bigdl.nn.{MSECriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, OptimizerV1, OptimizerV2, T, Table}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

import scala.reflect.ClassTag

object toAutoencoderBatch {
  def apply(): toAutoencoderBatch[Float] = new toAutoencoderBatch[Float]()
}

class toAutoencoderBatch[T: ClassTag](implicit ev: TensorNumeric[T]
      )extends Transformer[MiniBatch[T], MiniBatch[T]] {
  override def apply(prev: Iterator[MiniBatch[T]]): Iterator[MiniBatch[T]] = {
    prev.map(batch => {
      MiniBatch(batch.getInput().toTensor[T], batch.getInput().toTensor[T])
    })
  }
}

object Train {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Autoencoder on MNIST")

      val sc = new SparkContext(conf)
      Engine.init

      val trainData = Paths.get(param.folder, "/train-images-idx3-ubyte")
      val trainLabel = Paths.get(param.folder, "/train-labels-idx1-ubyte")

      val trainDataSet = DataSet.array(load(trainData, trainLabel), sc) ->
        BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToBatch(param.batchSize) -> toAutoencoderBatch()

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) Autoencoder.graph(classNum = 32) else Autoencoder(classNum = 32)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new Adagrad[Float](learningRate = 0.01, learningRateDecay = 0.0, weightDecay = 0.0005)
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new MSECriterion[Float]()
      )

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }
      optimizer
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()
      sc.stop()
    })
  }
} 
Example 22
Source File: Test.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.vgg

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.models.lenet.Utils._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)


  import Utils._

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Vgg on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(Utils.loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
} 
Example 23
Source File: Train.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.models.vgg

import java.text.SimpleDateFormat
import java.util.Date

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()


  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Vgg on Cifar10")
        // Will throw an exception without this config when there is only one executor
          .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val trainDataSet = DataSet.array(Utils.loadTrain(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(trainMean, trainStd) ->
        BGRImgToBatch(param.batchSize)

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) VggForCifar10.graph(classNum = 10) else VggForCifar10(classNum = 10)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = 0.9, dampening = 0.0, nesterov = false,
          learningRateSchedule = SGD.EpochStep(25, 0.5))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new ClassNLLCriterion[Float]()
      )

      val validateSet = DataSet.array(Utils.loadTest(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) ->
        BGRImgToBatch(param.batchSize)

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      if (param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      if (param.summaryPath.isDefined) {
        val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val timeStamp = sdf.format(new Date())
        val trainSummry = new TrainSummary(param.summaryPath.get,
          s"vgg-on-cifar10-train-$timeStamp")
        optimizer.setTrainSummary(trainSummry)
        val validationSummary = new ValidationSummary(param.summaryPath.get,
          s"vgg-on-cifar10-val-$timeStamp")
        optimizer.setValidationSummary(validationSummary)
      }

      optimizer
        .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
} 
Example 24
Source File: ParallelOptimizerSpec.scala    From BigDL    with Apache License 2.0    5 votes
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch}
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, MSECriterion}
import com.intel.analytics.bigdl.optim.DistriOptimizerSpecModel.mse
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.utils.{Engine, T}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}

@com.intel.analytics.bigdl.tags.Serial
class ParallelOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)

  private var sc: SparkContext = _

  before {
    val conf = Engine.createSparkConf()
      .setMaster("local[1]").setAppName("ParallelOptimizerSpec")
    sc = new SparkContext(conf)
    Engine.init
    Engine.setCoreNumber(1)
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }

  "Train with parallel" should "work properly" in {
    val input = Tensor[Float](1, 10).fill(1.0f)
    val target = Tensor[Float](1).fill(1.0f)
    val miniBatch = MiniBatch(input, target)
    val model = Linear[Float](10, 2)
    model.getParameters()._1.fill(1.0f)
    val optimMethod = new SGD[Float]()

    val dataSet = DataSet.array(Array(miniBatch), sc)

    val optimizer = new DistriOptimizer[Float](model, dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    optimizer.optimize()

  }

  "Train with parallel" should "have same results as DistriOptimizer" in {

    val input = Tensor[Float](1, 10).fill(1.0f)
    val target = Tensor[Float](1).fill(1.0f)
    val miniBatch = MiniBatch(input, target)
    val model1 = Linear[Float](10, 2)
    model1.getParameters()._1.fill(1.0f)

    val model2 = Linear[Float](10, 2)
    model2.getParameters()._1.fill(1.0f)

    val dataSet = DataSet.array(Array(miniBatch), sc)

    val parallelOptimizer = new DistriOptimizer[Float](model1,
      dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    parallelOptimizer.optimize

    val distriOptimizer = new DistriOptimizer[Float](model2,
      dataSet, new ClassNLLCriterion[Float]())
      .setState(T("learningRate" -> 1.0))
      .setEndWhen(Trigger.maxIteration(10))

    distriOptimizer.optimize

    model1.getParameters()._1 should be (model2.getParameters()._1)

  }

} 
Example 25
Source File: SparkFunSuite.scala    From spark-alchemy    with Apache License 2.0    5 votes
package org.apache.spark

// scalastyle:off
import java.io.File

import scala.annotation.tailrec
import org.apache.log4j.{Appender, Level, Logger}
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.Tests.IS_TESTING
import org.apache.spark.util.{AccumulatorContext, Utils}


abstract class SparkFunSuite extends FunSuite with BeforeAndAfterAll with Logging {

  // (the mixin list above is abbreviated, and other members of this test base
  //  class are not shown in this excerpt)

  protected def withLogAppender(
    appender: Appender,
    loggerName: Option[String] = None,
    level: Option[Level] = None)(
    f: => Unit): Unit = {
    val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger)
    val restoreLevel = logger.getLevel
    logger.addAppender(appender)
    if (level.isDefined) {
      logger.setLevel(level.get)
    }
    try f finally {
      logger.removeAppender(appender)
      if (level.isDefined) {
        logger.setLevel(restoreLevel)
      }
    }
  }
} 
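The withLogAppender helper above temporarily attaches an Appender (optionally forcing a Level) and restores the logger afterwards. A minimal sketch of how a test built on this base class might use it to capture warnings; BufferAppender is a hypothetical helper written here for illustration on top of the standard log4j 1.x AppenderSkeleton:

import org.apache.log4j.{AppenderSkeleton, Level, Logger}
import org.apache.log4j.spi.LoggingEvent
import scala.collection.mutable.ArrayBuffer

// Collects every logging event it receives so a test can inspect the messages.
class BufferAppender extends AppenderSkeleton {
  val events = new ArrayBuffer[LoggingEvent]()
  override def append(event: LoggingEvent): Unit = events += event
  override def close(): Unit = {}
  override def requiresLayout(): Boolean = false
}

// Inside a test extending the suite above:
// val appender = new BufferAppender
// withLogAppender(appender, level = Some(Level.WARN)) {
//   Logger.getLogger(getClass).warn("something noteworthy")
// }
// assert(appender.events.exists(_.getRenderedMessage.contains("something noteworthy")))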
Example 26
Source File: DenseKMeans.scala    From AI    with Apache License 2.0    5 votes
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)

  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // Return the K-means cost (sum of squared distances of points to their nearest center) for this model
    val cost = model.computeCost(examples)
    // Get the k cluster centers
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")
    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 27
Source File: SparkSqlUtils.scala    From HadoopLearning   with MIT License 5 votes vote down vote up
package com.c503.utils

import java.io.{BufferedInputStream, BufferedReader, FileInputStream, InputStreamReader}
import java.nio.file.Path

import com.google.common.io.Resources
import org.apache.log4j.{Level, Logger}
import org.apache.mesos.Protos.Resource
import org.apache.spark.sql.SparkSession

import scala.io.Source


  def readSqlByPath(sqlPath: String) = {
    val buf = new StringBuilder
    val path = this.getPathByName(sqlPath)
    val file = Source.fromFile(path)
    for (line <- file.getLines) {
      buf ++= line + "\n"
    }
    file.close
    buf.toString()
  }


} 
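readSqlByPath above delegates to this.getPathByName, which the excerpt does not show. Judging from the com.google.common.io.Resources import, it most likely resolves a resource name on the classpath to a filesystem path; a minimal sketch under that assumption (the object name is made up for illustration):

import com.google.common.io.Resources

object ResourcePathSketch {
  // Resolve a classpath resource name (e.g. "report.sql") to a local filesystem path.
  def getPathByName(name: String): String =
    Resources.getResource(name).getPath
}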
Example 28
Source File: Streaming.scala    From scala-spark-cab-rides-predictions   with MIT License 5 votes vote down vote up
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils
import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter
import com.amazonaws.services.kinesis.model.Record
import com.google.gson.Gson
import org.apache.spark.sql._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}

object Trials extends App {

  import org.apache.log4j.{Level, Logger}

  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  //session setup
  System.setProperty("hadoop.home.dir", "C:\\winutils")
  val sparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("test")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val ssc = new StreamingContext(sc, Seconds(10))
  val sqlContext = sparkSession.sqlContext

  //creates an array of strings from raw byte array
  def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",")

  //converts records to map of key value pair and then json
  def recordHandler = (record: Record) => {
    val gson = new Gson
    val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject
    val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage)
    gson.toJson(map)
  }

  case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String)

  val stream_cab = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("cab_rides")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)


  val stream_weather = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("weather")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)


  //creating dataframe, can be stored as temp view
  val cabSchema = Encoders.product[CabPrice].schema
  stream_cab.foreachRDD(rdd => {
    import sqlContext.implicits._
    //val xx: Dataset[String] = rdd.toDS()

    val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS())
    df.show()

  })
  ssc.start()
  ssc.awaitTermination()

} 
Example 29
Source File: SocialGraphJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pregel.jobs.social

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object SocialGraphJob {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "GraphX")

    val graph = new SocialGraph(sc)

    println("Top 10 most-connected users:")
    graph.getMostConnectedUsers(10) foreach println

    println("Computing degrees of separation for user Arch")
    graph.degreeOfSeparationSingleUser(5306) foreach println

    println("Computing degrees of separation for user Arch and Fred")
    graph.degreeOfSeparationTwoUser(5306, 14) foreach println

    println("Connected component")
    graph.connectedComponentGroupedByUsers
      .sortBy ( {case (_, lowestVertexId) => lowestVertexId},
        ascending = false).take(10) foreach println

    sc.stop()
  }
} 
Example 30
Source File: ShortestPathProblemJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pregel.jobs.ssp

import com.github.graphx.pregel.ssp.ShortestPathProblem
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexId

object ShortestPathProblemJob extends App {
  Logger.getLogger("org").setLevel(Level.ERROR)
  val sc = new SparkContext("local[*]", "ShortestPathProblemDemo")
  val ssp = new ShortestPathProblem(sc)

  val sourceIdForTest: VertexId = 3
  val sourceIdForRandom: VertexId = 75

  val testGraph = ssp.testGraph
  val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest)
  println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" +
    s"Distances on the test graph $resultOnTestGraph\n")

  val randomGraph = ssp.randomGraph
  val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom)
  println(s"Distances on the random graph $resultOnRandomGraph\n")
} 
Example 31
Source File: SocialPageRankJob.scala    From spark-graphx   with GNU General Public License v3.0 5 votes vote down vote up
package com.github.graphx.pagerank

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexRDD

object SocialPageRankJob {

  
  def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.staticPageRank(numIter = 20).vertices

  def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = {
    socialGraph.verts.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy({ case (_, rank) => rank }, ascending = false).take(10)
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "PageRank")

    val socialGraph: SocialGraph = new SocialGraph(sc)
    val TOLERANCE: Double = 0.0001

    import scala.compat.Platform.{EOL => D}
    val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D)
    val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D)

    println(s"Top 10 users in network counted with TOLERANCE until convergence $TOLERANCE - $D $topUsersDynamically")
    println(s"Top 10 users in the network counted iteratively - $D $topUsersIterative")

    sc.stop()
  }
} 
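The ranks method used in main is not included in the excerpt. Given that static above wraps staticPageRank, ranks presumably wraps GraphX's convergence-based pageRank, which iterates until the per-vertex change drops below the supplied tolerance. A sketch under that assumption (placed in a standalone object here only for illustration):

import com.github.graphx.pregel.social.SocialGraph
import org.apache.spark.graphx.VertexRDD

object PageRankSketch {
  // Dynamic PageRank: iterate until ranks change by less than `tolerance` per vertex.
  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tol = tolerance).vertices
}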
Example 32
Source File: AkkaUtils.scala    From DataXServer   with Apache License 2.0 5 votes vote down vote up
package org.tianlangstudio.data.hamal.yarn.util

import akka.actor.{ActorSystem, ExtendedActorSystem}
import com.typesafe.config.ConfigFactory
import org.apache.log4j.{Level, Logger}
import org.tianlangstudio.data.hamal.core.{Constants, HamalConf}


  def maxFrameSizeBytes(conf: HamalConf): Int = {
    val frameSizeInMB = conf.getInt("datax.akka.frameSize", 128)
    if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) {
      throw new IllegalArgumentException(
        s"spark.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB")
    }
    frameSizeInMB * 1024 * 1024
  }


  def protocol(actorSystem: ActorSystem): String = {
    val akkaConf = actorSystem.settings.config
    val sslProp = "akka.remote.netty.tcp.enable-ssl"
    protocol(akkaConf.hasPath(sslProp) && akkaConf.getBoolean(sslProp))
  }

  def protocol(ssl: Boolean = false): String = {
    if (ssl) {
      "akka.ssl.tcp"
    } else {
      "akka.tcp"
    }
  }

  def address(
      protocol: String,
      systemName: String,
      host: String,
      port: Int,
      actorName: String): String = {

        address(protocol,
          systemName,
          s"$host:$port",
          actorName
        )
  }
  def address(
               protocol: String,
               systemName: String,
               hostPort: String,
               actorName: String): String = {
    s"$protocol://$systemName@$hostPort/user/$actorName"
  }
} 
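AKKA_MAX_FRAME_SIZE_IN_MB is referenced in maxFrameSizeBytes but defined in a part of the file the excerpt omits. Akka's maximum-frame-size setting is an Int number of bytes, so the natural upper bound is Int.MaxValue expressed in megabytes; a sketch of the constant under that assumption, as it would sit inside the enclosing object:

  // Largest frame size Akka can represent, in MB (the underlying setting is an Int of bytes).
  val AKKA_MAX_FRAME_SIZE_IN_MB: Int = Int.MaxValue / 1024 / 1024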
Example 33
Source File: ModelSerialization.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.features.FeatureEngineering
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object ModelSerialization {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)


    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    samples.printSchema()
    samples.show(5, false)


    //model training
    println("Neural Network Ctr Prediction Model:")
    val innModel = new InnerProductNNCtrModel()
    innModel.train(samples)
    val transformedData = innModel.transform(samples)

    transformedData.show(1,false)

    //model serialization by mleap
    val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer()
    mleapModelSerializer.serializeModel(innModel._pipelineModel, "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData)

    //model serialization by JPMML
    val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer()
    jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData)
  }
} 
Example 34
Source File: ModelSelection.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.evaluation.Evaluator
import com.ggstar.features.FeatureEngineering
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}

object ModelSelection {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)
    rawSamples.printSchema()
    rawSamples.show(10)

    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    //split samples into training samples and validation samples
    val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3))
    val evaluator = new Evaluator

    
  }
} 
Example 35
Source File: GenerateVerticesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD


object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the configuration values from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions)
  : Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Display 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Display 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

  }
}
// scalastyle:on println 
Example 36
Source File: ReduceExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ReduceExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.reduce((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 37
Source File: StatsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object StatsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("StatsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11))
    val stats = nums.stats()

    println(s"""nums:   ${nums.collect().mkString(", ")}""")
    println(s"""count:  ${stats.count}""")
    println(s"""mean:   ${stats.mean}""")
    println(s"""stdev:  ${stats.stdev}""")
  }
}

// scalastyle:on println 
Example 38
Source File: FoldExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FoldExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.fold(0)((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 39
Source File: OrderExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object OrderExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums:          ${nums.collect().mkString(", ")}""")
    println(s"""top3:          ${nums.top(3).mkString(", ")}""")
    println(s"""takeOredered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 40
Source File: AggregateExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object AggregateExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  private[basic_action]
  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)

    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum:  ${nums.fold(0)((x, y) => x + y)}""")
  }
}

// scalastyle:on println 
Example 41
Source File: CollectAsMapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CollectAsMapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CollectAsMapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(
        ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)
      ), 3
    )
    val fruitsAsMap = fruits.collectAsMap()

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitsAsMap: $fruitsAsMap""")
  }
}

// scalastyle:on println 
Example 42
Source File: PersistExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {
  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
    persistedLines.collect()
  }
} 
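persist() with no argument caches at the default MEMORY_ONLY level. The run method above could also request an explicit storage level; a small variation, assuming the same sc and inputFile that run receives:

    import org.apache.spark.storage.StorageLevel

    // Keep partitions in memory and spill the remainder to disk instead of recomputing them.
    val spillableLines = sc.textFile(inputFile).persist(StorageLevel.MEMORY_AND_DISK)
    spillableLines.count()
    spillableLines.unpersist()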
Example 43
Source File: CustomPartitionerExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CustomPartitionerExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CustomPartitionerExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))

    val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _)
    val customPartitioned = fruits.map((_, 1)).reduceByKey(
      new FirstLetterPartitioner(sc.defaultParallelism), _ + _)

    println(s"""fruits:\n  ${fruits.collect().mkString(", ")}""")
    println()

    println("partitioned by default partitioner")
    defaultPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    println("partitioned by first letter partitioner")
    customPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

private[partition]
class FirstLetterPartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
    key.toString.charAt(0).hashCode % numPartitions match {
      case p if p < 0 => p + numPartitions
      case p => p
    }
  }

  override def equals(other: Any): Boolean = {
    other match {
      case p: FirstLetterPartitioner => p.numPartitions == numPartitions
      case _ => false
    }
  }
}

// scalastyle:on println 
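FirstLetterPartitioner overrides equals but not hashCode; Spark only compares partitioners with equals when deciding whether RDDs are co-partitioned, but keeping the equals/hashCode contract intact is cheap. A one-line addition that could sit in the class above (an assumption, not part of the original example):

  override def hashCode: Int = numPartitions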
Example 44
Source File: PartitionExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object PartitionExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("Partition")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1)
    println(s"""nums:\n  ${nums.collect().mkString(", ")}""")
    println()

    println("original:")
    nums.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar3 = nums.repartition(3)
    println("repartition to 3:")
    numsPar3.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar2 = numsPar3.coalesce(2)
    println("coalesce to 2:")
    numsPar2.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s"""  Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

// scalastyle:on println 
Example 45
Source File: WordCountExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object WordCountExample {
  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts:     ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}

// scalastyle:on println 
Example 46
Source File: AggregateByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object AggregateByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))(
      seqOp = (partAcc, n) => partAcc += n,
      combOp = (acc1, acc2) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits:         ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 47
Source File: MapValuesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapValuesExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapValuesExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1)))
    val plusOnes = fruits.mapValues(v => v + 1)

    println(s"""fruits:   ${fruits.collect().mkString(", ")}""")
    println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 48
Source File: SortByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SortByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyDesc = fruits.sortByKey(ascending = false)

    println(s"""fruits:          ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyDesc: ${sortedByKeyDesc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums:              ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 49
Source File: CoGroupExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CoGroupExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CoGroupExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))
    val grouped = persons.map(_.swap).cogroup(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities:  ${cities.collect().mkString(", ")}""")
    println()
    println(s"""grouped:\n${grouped.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 50
Source File: JoinExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object JoinExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("JoinExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

    val innerJoined = persons.map(_.swap).join(cities)
    val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities)
    val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities)
    val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities:  ${cities.collect().mkString(", ")}""")
    println()
    println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""")
    println()
    println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 51
Source File: GroupByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object GroupByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("GroupByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val grouped = fruits.groupByKey()

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""grouped: ${grouped.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 52
Source File: ReduceByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ReduceByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.reduceByKey((x, y) => x + y)

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 53
Source File: CombineByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object CombineByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CombineByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val fruitCountAvgs = fruits.combineByKey(
      createCombiner = (v: Int) => Acc(v.toDouble, 1),
      mergeValue = (partAcc: Acc, n: Int) => partAcc += n,
      mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits:         ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 54
Source File: FoldByKeyExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FoldByKeyExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.foldByKey(0)((x, y) => x + y)

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 55
Source File: MapPartitionsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapPartitionsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple",  "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple",  "num": 2}""",
      """{"name": "Peach",  "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}

// scalastyle:on println 
Example 56
Source File: FlatMapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FlatMapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))

    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 57
Source File: SetOperationsExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SetOperationsExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 58
Source File: MapExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object MapExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 59
Source File: ZipExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object ZipExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped:  ${zipped.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 60
Source File: DistinctExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object DistinctExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 61
Source File: SampleExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object SampleExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)

    println(s"""fruits:  ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 62
Source File: FilterExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println

object FilterExample {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))

    println(s"""fruits:      ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}

// scalastyle:on println 
Example 63
Source File: SparkFunSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark

// scalastyle:off
import org.apache.log4j.{Level, Logger}
import org.scalatest.{FunSuite, Outcome}

import org.apache.spark.Logging


  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)

      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }

} 
Example 64
Source File: Logging.scala    From spark-distcp   with Apache License 2.0 5 votes vote down vote up
package com.coxautodata.objects

import org.apache.log4j.{Level, LogManager, Logger}

trait Logging {

  // Method to get the logger name for this object
  protected def logName: String = {
    // Ignore trailing $'s in the class names for Scala objects
    this.getClass.getName.stripSuffix("$")
  }

  private val log: Logger = LogManager.getLogger(logName)

  // Set logger level
  protected def setLogLevel(level: Level): Unit = log.setLevel(level)

  // Log methods that take only a String
  protected def logInfo(msg: => String) {
    if (log.isInfoEnabled) log.info(msg)
  }

  protected def logDebug(msg: => String) {
    if (log.isDebugEnabled) log.debug(msg)
  }

  protected def logTrace(msg: => String) {
    if (log.isTraceEnabled) log.trace(msg)
  }

  protected def logWarning(msg: => String) {
    log.warn(msg)
  }

  protected def logError(msg: => String) {
    log.error(msg)
  }

  // Log methods that take Throwables (Exceptions/Errors) too
  protected def logInfo(msg: => String, throwable: Throwable) {
    if (log.isInfoEnabled) log.info(msg, throwable)
  }

  protected def logDebug(msg: => String, throwable: Throwable) {
    if (log.isDebugEnabled) log.debug(msg, throwable)
  }

  protected def logTrace(msg: => String, throwable: Throwable) {
    if (log.isTraceEnabled) log.trace(msg, throwable)
  }

  protected def logWarning(msg: => String, throwable: Throwable) {
    log.warn(msg, throwable)
  }

  protected def logError(msg: => String, throwable: Throwable) {
    log.error(msg, throwable)
  }

  protected def isTraceEnabled: Boolean = {
    log.isTraceEnabled
  }


} 
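A short usage sketch of the trait above (the class name and messages are made up): mix Logging into any class, optionally tighten the logger's level, and call the level-specific helpers.

import org.apache.log4j.Level
import com.coxautodata.objects.Logging

class CopyJobSketch extends Logging {
  def run(): Unit = {
    setLogLevel(Level.INFO)   // adjust the threshold for this class's logger only
    logInfo("starting copy")
    try {
      // ... do the work ...
    } catch {
      case e: Exception => logError("copy failed", e)
    }
  }
}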
Example 65
Source File: MCLModelSuite.scala    From MCL_spark   with MIT License 5 votes vote down vote up
package org.apache.spark.mllib.clustering

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.util.Utils


class MCLModelSuite extends MCLFunSuite{
  // Disable Spark messages when running program
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  test("model save/load", UnitTest){

    val users: RDD[(VertexId, String)] =
      sc.parallelize(Array((0L,"Node1"), (1L,"Node2"),
        (2L,"Node3"), (3L,"Node4"),(4L,"Node5"),
        (5L,"Node6"), (6L,"Node7"), (7L, "Node8"),
        (8L, "Node9"), (9L, "Node10"), (10L, "Node11")))

    val relationships: RDD[Edge[Double]] =
      sc.parallelize(
        Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0),
          Edge(0, 2, 1.0), Edge(2, 0, 1.0),
          Edge(0, 3, 1.0), Edge(3, 0, 1.0),
          Edge(1, 2, 1.0), Edge(2, 1, 1.0),
          Edge(1, 3, 1.0), Edge(3, 1, 1.0),
          Edge(2, 3, 1.0), Edge(3, 2, 1.0),
          Edge(4, 5, 1.0), Edge(5, 4, 1.0),
          Edge(4, 6, 1.0), Edge(6, 4, 1.0),
          Edge(4, 7, 1.0), Edge(7, 4, 1.0),
          Edge(5, 6, 1.0), Edge(6, 5, 1.0),
          Edge(5, 7, 1.0), Edge(7, 5, 1.0),
          Edge(6, 7, 1.0), Edge(7, 6, 1.0),
          Edge(3, 8, 1.0), Edge(8, 3, 1.0),
          Edge(9, 8, 1.0), Edge(8, 9, 1.0),
          Edge(9, 10, 1.0), Edge(10, 9, 1.0),
          Edge(4, 10, 1.0), Edge(10, 4, 1.0)
        ))

    val graph = Graph(users, relationships)

    val model: MCLModel = MCL.train(graph)

    // Check number of clusters
    model.nbClusters shouldEqual 3

    // Check save and load methods
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString

    Array(true, false).foreach { case selector =>
      // Save model, load it back, and compare.
      try {
        model.save(sc, path)
        val sameModel = MCLModel.load(sc, path)
        assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id"))
      } finally {
        Utils.deleteRecursively(tempDir)
      }
    }

  }

  test("nodes assignments", UnitTest) {
    val nodeId = 1.0.toLong
    val cluster = 2.0.toLong
    val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster))

    newAssignment.id shouldEqual nodeId
    newAssignment.cluster shouldEqual cluster
  }

} 
Example 66
Source File: KmeansTest.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.spark.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
import org.scalaml.{Logging, Resource}
import org.scalaml.Predef._
import org.scalaml.stats.TSeries._
import org.scalaml.trading.YahooFinancials
import org.scalaml.workflow.data.DataSource
import org.scalatest.FunSuite
import org.scalatest.concurrent.ScalaFutures

import scala.concurrent.Future


final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource {
  import scala.concurrent.ExecutionContext.Implicits.global

  protected[this] val name = "Spark MLlib K-Means"
  private val K = 8
  private val NRUNS = 4
  private val MAXITERS = 60
  private val PATH = "spark/CSCO.csv"
  private val CACHE = false

  test(s"$name evaluation") {
    show(s"Evaluation")

    Logger.getRootLogger.setLevel(Level.ERROR)
    // The Spark configuration has to be customize to your environment
    val sparkConf = new SparkConf().setMaster("local")
      .setAppName("Kmeans")
      .set("spark.executor.memory", "4096m")

    implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file

    val kmeanClustering: Option[Kmeans] = extract.map(input => {
      val volatilityVol = zipToSeries(input._1, input._2).take(500)

      val config = new KmeansConfig(K, MAXITERS, NRUNS)
      val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY)
      Kmeans(config, rddConfig, volatilityVol)
    })

      // Wraps into a future to enforce time out in case of a straggler
    val ft = Future[Boolean] { predict(kmeanClustering) }
    whenReady(ft) { result => assert(result) }
    sc.stop
  }

   private def predict(kmeanClustering: Option[Kmeans]): Boolean = {
     kmeanClustering.map(kmeansCluster => {
       val obs = Array[Double](0.1, 0.9)
       val clusterId1 = kmeansCluster |> obs
       show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1")

       val obs2 = Array[Double](0.56, 0.11)
       val clusterId2 = kmeansCluster |> obs2
       val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2"
       show(s"$name result: $result")
     })
     true
   }

  private def extract: Option[(DblVec, DblVec)] = {
    import scala.util._
    val extractors = List[Array[String] => Double](
      YahooFinancials.volatility,
      YahooFinancials.volume
    )

    DataSource(getPath(PATH).get, true).map(_.|>) match {
      case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption
      case Failure(e) =>
        failureHandler(e)
        None
    }
  }
}


// ---------------------------------  EOF ------------------------------------------------- 
Example 67
Source File: Application.scala    From retail_analytics   with Apache License 2.0 5 votes vote down vote up
package controllers

import scalaz._
import Scalaz._
import scalaz.EitherT._
import scalaz.Validation
//import scalaz.Validation.FlatMap._
import scalaz.NonEmptyList._
import play.api.mvc._
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import models._
import models.stack._

import play.api.libs.json._

object Application extends Controller {

  def index() = Action { implicit request =>
    Ok(views.html.index("Megam Analytics."))
  }

  def upload = Action(parse.multipartFormData) { implicit request =>
    request.body.file("picture").map { picture =>
      val filename = picture.filename
      val contentType = picture.contentType
      picture.ref.moveTo(new File("/tmp/" + filename))

      models.HDFSFileService.saveFile("/tmp/" + filename) match {
        case Success(succ) => {
          val fu = List(("success" -> succ))
          Redirect("/").flashing(fu: _*)
        }
        case Failure(err) => {
          val fu = List(("error" -> "File doesn't get uploaded"))
          Redirect("/").flashing(fu: _*)
        }
      }
    }.getOrElse {
      val fu = List(("error" -> "File doesn't get uploaded.."))
      Redirect("/").flashing(fu: _*)
    }
  }

  def analysis() = Action { implicit request =>
    val tuple_res = models.Retail.buyingbehaviour(MConfig.recommand_ID.toInt, MConfig.retailfile)

    println("BACK==========================>>>")
    println(tuple_res._1)

    //val finalJson = {
    //  for {
    //    product <- productList
    //  } yield Json.parse(product).as[JsObject]
    //}
    Ok(views.html.finalProducts(tuple_res._1, tuple_res._2))
  }

} 
Example 68
Source File: FileOutputIT.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta

import java.sql.Timestamp
import java.util.UUID

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest._

import scala.reflect.io.File


class FileOutputIT extends FlatSpec with ShouldMatchers with BeforeAndAfterAll {
  self: FlatSpec =>

  @transient var sc: SparkContext = _

  override def beforeAll {
    Logger.getRootLogger.setLevel(Level.ERROR)
    sc = FileOutputIT.getNewLocalSparkContext(1, "test")
  }

  override def afterAll {
    sc.stop()
    System.clearProperty("spark.driver.port")
  }

  trait CommonValues {

    val sqlContext = SQLContext.getOrCreate(sc)

    import sqlContext.implicits._

    val time = new Timestamp(DateTime.now.getMillis)

    val data =
      sc.parallelize(Seq(Person("Kevin", 18, time), Person("Kira", 21, time), Person("Ariadne", 26, time))).toDF

    val tmpPath: String = s"/tmp/sparta-test/${UUID.randomUUID().toString}"
  }

  trait WithEventData extends CommonValues {
    val properties = Map("path" -> tmpPath, "createDifferentFiles" -> "false")
    val output = new FileOutput("file-test", properties)
  }

  "FileOutputIT" should "save a dataframe" in new WithEventData {
    output.save(data, SaveModeEnum.Append, Map(Output.TimeDimensionKey -> "minute", Output.TableNameKey -> "person"))

    val source = new java.io.File(tmpPath).listFiles()
    val read = sqlContext.read.json(tmpPath).toDF
    read.count shouldBe(3)
    File("/tmp/sparta-test").deleteRecursively
  }
}

object FileOutputIT {

  def getNewLocalSparkContext(numExecutors: Int = 1, title: String): SparkContext = {
    val conf = new SparkConf().setMaster(s"local[$numExecutors]").setAppName(title)
    SparkContext.getOrCreate(conf)
  }
}

case class Person(name: String, age: Int, minute: Timestamp) extends Serializable 
Example 69
Source File: GamerSparkSQLExample.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.aggregates

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object GamerSparkSQLExample {
  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("{kudumaster} {runLocal}")
      return
    }

    Logger.getRootLogger.setLevel(Level.ERROR)

    val kuduMaster = args(0)
    val runLocal = args(1).equals("l")

    println("Loading Spark Context")
    var sc:SparkContext = null

    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      sc = new SparkContext(sparkConfig)
    }
    println("Loading Spark Context: Finished")

    println("Setting up Tables")
    val sqlContext = new SQLContext(sc)
    sqlContext.load("org.kududb.spark",
      Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer")

    println("Query 1: SELECT count(*) FROM gamer")
    val startTimeQ1 = System.currentTimeMillis()
    sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))

    println("Query 2: SELECT * FROM gamer limit 100")
    val startTimeQ2 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))

    println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100")
    val startTimeQ3 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))

    println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer")
    val startTimeQ4 = System.currentTimeMillis()
    sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4))

    println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" )
    val startTimeQ5 = System.currentTimeMillis()
    val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer")

    val parsedData = resultDf.map(r => {
      val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble)
      Vectors.dense(array)
    })

    val dataCount = parsedData.count()

    if (dataCount > 0) {
      val clusters = KMeans.train(parsedData, 3, 5)
      clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))

    }
    //TODO add Mllib here
    println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))

  }
} 
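The Kudu-backed example combines two independent ideas: raising the root log4j level to ERROR before Spark starts logging, and feeding a SQL result into KMeans.train. A minimal local sketch of the same combination, using an in-memory RDD instead of a Kudu table (all names below are hypothetical), might look like this:

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object TinyKMeansSketch {
  def main(args: Array[String]): Unit = {
    // Quiet everything below ERROR before the SparkContext starts logging.
    Logger.getRootLogger.setLevel(Level.ERROR)

    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("TinyKMeansSketch"))
    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.2),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 8.8)))

    // k = 2 clusters, 10 iterations, same call shape as in the example above.
    val model = KMeans.train(points, 2, 10)
    model.clusterCenters.foreach(c => println(" Vector Center:" + c))

    sc.stop()
  }
}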
Example 70
Source File: BasicSparkSQLExamples.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object BasicSparkSQLExamples {
  def main(args:Array[String]): Unit = {
    if (args.length < 3) {
      println("<kuduMaster> <tablename> <runLocal>")
      return
    }

    Logger.getRootLogger.setLevel(Level.ERROR)

    val kuduMaster = args(0)
    val tableName = args(1)
    val runLocal = args(2).equals("l")

    println("starting")
    var sc:SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      sc = new SparkContext(sparkConfig)
    }

    try {
      println("Setting up Tables")
      val sqlContext = new SQLContext(sc)
      sqlContext.load("org.kududb.spark",
        Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName)

      println("Query 1: SELECT count(*) FROM " + tableName)
      val startTimeQ1 = System.currentTimeMillis()
      sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))

      println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100")
      val startTimeQ2 = System.currentTimeMillis()
      sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))

      val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)"
      println("Query 3: " + q3)
      val startTimeQ3 = System.currentTimeMillis()
      sqlContext.sql(q3).take(100).foreach(r => {
        println(" - (" + r + ")")
      })
      println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))


      println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName )
      val startTimeQ5 = System.currentTimeMillis()
      val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000")

      val parsedData = resultDf.map(r => {
        val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble)
        Vectors.dense(array)
      })
      val clusters = KMeans.train(parsedData, 3, 4)
      clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))

      //TODO add Mllib here
      println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))

    } finally {
      sc.stop()
    }
  }
} 
Example 71
Source File: StreamingKMeansSuite.scala    From spark-structured-streaming-ml   with Apache License 2.0 5 votes vote down vote up
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

case class TestRow(features: Vector)

class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())
    val streamingModels = batches.map { batch =>
      inputStream.addData(batch)
      query.processAllAvailable()
      skm.getModel
    }
    // TODO: use spark's testing suite
    streamingModels.last.centers.zip(trueCenters).foreach {
      case (center, trueCenter) =>
        val centers = center.toArray.mkString(",")
        val trueCenters = trueCenter.toArray.mkString(",")
        println(s"${centers} | ${trueCenters}")
        assert(center.toArray.zip(trueCenter.toArray).forall(
          x => math.abs(x._1 - x._2) < 0.1))
    }
    query.stop()
  }

  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }

}

object StreamingKMeansSuite {

  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    val rand = scala.util.Random
    rand.setSeed(seed)
    val centers = initCenters match {
      case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian())))
      case _ => initCenters
    }
    val data = (0 until numBatches).map { i =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
} 
Example 72
Source File: SuspiciousConnects.scala    From oni-ml   with Apache License 2.0 5 votes vote down vote up
package org.opennetworkinsight

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.slf4j.LoggerFactory
import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
import org.opennetworkinsight.dns.DNSSuspiciousConnects
import org.opennetworkinsight.netflow.FlowSuspiciousConnects
import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis


object SuspiciousConnects {
  def main(args: Array[String]) {

    val parser = SuspiciousConnectsArgumentParser.parser

    parser.parse(args, SuspiciousConnectsConfig()) match {
      case Some(config) =>
        val logger = LoggerFactory.getLogger(this.getClass)
        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)

        val analysis = config.analysis
        val sparkConfig = new SparkConf().setAppName("ONI ML:  " + analysis + " lda")
        val sparkContext = new SparkContext(sparkConfig)
        val sqlContext = new SQLContext(sparkContext)
        implicit val outputDelimiter = OutputDelimiter

        analysis match {
          case "flow" => FlowSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "dns" => DNSSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
          case _ => println("ERROR:  unsupported (or misspelled) analysis: " + analysis)
        }

        sparkContext.stop()

      case None => println("Error parsing arguments")
    }

    System.exit(0)
  }


} 
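Instead of touching the root logger, this example mutes only the "org" and "akka" logger hierarchies so that its own slf4j logger keeps working. A stand-alone sketch of just that technique (names are illustrative):

import org.apache.log4j.{Level, Logger}
import org.slf4j.LoggerFactory

object QuietSparkSketch {
  def main(args: Array[String]): Unit = {
    // Everything logged under these prefixes (Spark, Akka) is switched off.
    Seq("org", "akka").foreach(name => Logger.getLogger(name).setLevel(Level.OFF))

    // The application's own logger is outside the muted prefixes, so it is
    // still governed by the normal root level.
    val logger = LoggerFactory.getLogger(this.getClass)
    logger.info("application logging still works")
  }
}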
Example 73
Source File: SparseNaiveBayes.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
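The examples in this collection hard-code Level.WARN or Level.ERROR. Since they already use scopt for options, the level itself can be made configurable; below is a hedged sketch (the option and object names are made up here) using Level.toLevel, which falls back to DEBUG for unrecognised names.

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

object ConfigurableLogLevel {

  case class Params(logLevel: String = "WARN")

  def main(args: Array[String]): Unit = {
    val parser = new OptionParser[Params]("ConfigurableLogLevel") {
      opt[String]("logLevel")
        .text("log4j level to apply to the root logger, e.g. OFF, ERROR, WARN, INFO (default: WARN)")
        .action((x, c) => c.copy(logLevel = x))
    }

    parser.parse(args, Params()).foreach { params =>
      // Level.toLevel parses the name; unknown names default to DEBUG.
      Logger.getRootLogger.setLevel(Level.toLevel(params.logLevel))
      println(s"root log level is now ${Logger.getRootLogger.getLevel}")
    }
  }
}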
Example 74
Source File: DenseKMeans.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 75
Source File: StreamingExamples.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


object StreamingExamples extends Logging {
  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
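setStreamingLogLevels only overrides the level when log4j has not been configured yet, so a user-supplied log4j.properties still wins. A minimal usage sketch, assuming it sits in the same package as the helper above (the object name and the host/port values are placeholders):

package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SetLogLevelsUsage {
  def main(args: Array[String]): Unit = {
    // Quiet the default console logging before the streaming job starts.
    StreamingExamples.setStreamingLogLevels()

    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("SetLogLevelsUsage"), Seconds(1))

    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}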
Example 76
Source File: YarnScheduler.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
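The guard here only raises the RackResolver level when no explicit level has been configured, so user settings are respected. The same idea works for any chatty third-party class; a generic sketch (the helper name is made up):

import org.apache.log4j.{Level, Logger}

object QuietClassLogger {
  // Raise the threshold for a specific class logger, but only if the user has
  // not already configured a level for it (getLevel is null in that case).
  // e.g. quietIfUnset(classOf[org.apache.hadoop.yarn.util.RackResolver])
  def quietIfUnset(clazz: Class[_], level: Level = Level.WARN): Unit = {
    val logger = Logger.getLogger(clazz)
    if (logger.getLevel == null) {
      logger.setLevel(level)
    }
  }
}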
Example 77
Source File: ClientArguments.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level

import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 78
Source File: BeforeAndAfterWithContext.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rpc.netty

import eleflow.uberdata.core.IUberdataContext
import eleflow.uberdata.core.util.ClusterSettings
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkEnv}
import org.scalatest.{BeforeAndAfterEach, Suite}

object TestSparkConf {
  def conf = {
    val sconf = new SparkConf()
    sconf.set("spark.app.name", "teste")
    sconf
  }

  val separator = ","

}


trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite =>

  val defaultFilePath = "src/test/resources/"
  import TestSparkConf._
  ClusterSettings.master = Some("local[*]")
  conf.set("spark.driver.allowMultipleContexts", "true")
  @transient val context = IUberdataContext.getUC(conf)

  override def beforeEach() = {
    setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka"))
  }

  def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = {
    loggers.map { loggerName =>
      val logger = Logger.getLogger(loggerName)
      val prevLevel = logger.getLevel
      logger.setLevel(level)
      loggerName -> prevLevel
    }.toMap
  }

  override def afterEach() = {
    val get = SparkEnv.get
    val rpcEnv =
      if (get != null) {
        Some(get.rpcEnv)
      } else None
    context.clearContext()
    //rpcEnv.foreach(
    //  _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown())


    System.clearProperty("spark.master.port")
  }
} 
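setLogLevels above returns the previous level of every logger it touches, but the result is discarded, so the INFO setting leaks past the test. A sketch of the matching restore step (the utility names here are hypothetical):

import org.apache.log4j.{Level, Logger}

object LogLevelUtil {

  // Apply one level to several loggers and remember what each had before.
  def setLogLevels(level: Level, loggers: TraversableOnce[String]): Map[String, Level] =
    loggers.map { name =>
      val logger = Logger.getLogger(name)
      val previous = logger.getLevel
      logger.setLevel(level)
      name -> previous
    }.toMap

  // Put the remembered levels back; a null level simply re-inherits from the
  // parent logger, which is log4j's default behaviour.
  def restoreLogLevels(saved: Map[String, Level]): Unit =
    saved.foreach { case (name, level) => Logger.getLogger(name).setLevel(level) }
}

Calling restoreLogLevels from afterEach with the map returned in beforeEach keeps the rest of the build's logging unchanged.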
Example 79
Source File: ClientArguments.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level

import org.apache.spark.util.{IntParam, MemoryParam}


  def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    System.err.println(usage)
    System.exit(exitCode)
  }
}

object ClientArguments {
  private[spark] val DEFAULT_CORES = 1
  private[spark] val DEFAULT_MEMORY = 512 // MB
  private[spark] val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 80
Source File: MLLibSuite.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib

import org.scalatest.{BeforeAndAfterAll, FunSuite}

import org.apache.log4j.{Level, Logger}

import org.apache.spark.sql.{Row, SparkSession}

class MLLibSuite extends FunSuite with BeforeAndAfterAll {

  private var sparkSession: SparkSession = _
  var savedLevels: Map[String, Level] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()

    // Travis limits the size of the log file produced by a build. Because we do run a small
    // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
    // log level to ERROR, just for this suite, to avoid displeasing travis.
    savedLevels = Seq("akka", "org", "com.databricks").map { name =>
      val logger = Logger.getLogger(name)
      val curLevel = logger.getLevel
      logger.setLevel(Level.ERROR)
      name -> curLevel
    }.toMap
  }

  override def afterAll(): Unit = {
    savedLevels.foreach { case (name, level) =>
      Logger.getLogger(name).setLevel(level)
    }
    try {
      if (sparkSession != null) {
        sparkSession.stop()
      }
      // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown
      System.clearProperty("spark.driver.port")
      sparkSession = null
    } finally {
      super.afterAll()
    }
  }

  test("test MlLib benchmarks with mllib-small.yaml.") {
    val results = MLLib.run(yamlConfig = MLLib.smallConfig)
    val failures = results.na.drop(Seq("failure"))
    if (failures.count() > 0) {
      failures.select("name", "failure.*").collect().foreach {
        case Row(name: String, error: String, message: String) =>
          println(
            s"""There as a failure in the benchmark for $name:
               |  $error ${message.replace("\n", "\n  ")}
             """.stripMargin)
      }
      fail("Unable to run all benchmarks successfully, see console output for more info.")
    }
  }

  test("test before benchmark methods for pipeline benchmarks.") {
    val benchmarks = MLLib.getBenchmarks(MLLib.getConf(yamlConfig = MLLib.smallConfig))
    benchmarks.foreach { b =>
      b.beforeBenchmark()
    }
  }
} 
Example 81
Source File: SomeSQLOnTitanic.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.machinelearning.titanic

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SomeSQLOnTitanic {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

    def main (args:Array[String]): Unit = {
      val testFile = args(0)
      val trainFile = args(1)

      val isLocal = true

      val sparkSession = if (isLocal) {
        SparkSession.builder
          .master("local")
          .appName("my-spark-app")
          .config("spark.some.config.option", "config-value")
          .config("spark.driver.host","127.0.0.1")
          .config("spark.sql.parquet.compression.codec", "gzip")
          .enableHiveSupport()
          .getOrCreate()
      } else {
        SparkSession.builder
          .appName("my-spark-app")
          .config("spark.some.config.option", "config-value")
          .enableHiveSupport()
          .getOrCreate()
      }
      import sparkSession.implicits._

      //Load Data
      val trainDs = sparkSession.read.option("header", "true")
        .option("charset", "UTF8")
        .option("delimiter",",")
        .csv(trainFile)

      trainDs.createOrReplaceTempView("train")

      println("Sex -> Servived")
      sparkSession.sql("select Sex, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Sex").collect().foreach(println)

      println("Cabin -> Servived")
      sparkSession.sql("select substring(Cabin,1,1), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("Age -> Servived")
      sparkSession.sql("select round(cast(Age as Int) / 10) as age_block, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("PClass -> Servived")
      sparkSession.sql("select pclass, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by pclass order by 1").collect().foreach(println)

      println("Embarked -> Servived")
      sparkSession.sql("select Embarked, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Embarked order by 1").collect().foreach(println)

      println("Fare -> Servived")
      sparkSession.sql("select round((Fare / 10)), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

      println("Servived -> Servived")
      sparkSession.sql("select sum(Survived), count(*) from train order by 1").collect().foreach(println)

      sparkSession.stop()
  }
} 
Example 82
Source File: ManyToManyNormalJoin.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.manytomany

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable

object ManyToManyNormalJoin {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val nGramWordCount = jsonDf.rdd.flatMap(r => {
      val actions = r.getAs[mutable.WrappedArray[Row]]("actions")

      val resultList = new mutable.MutableList[((Long, Long), Int)]

      actions.foreach(a => {
        val aValue = a.getAs[Long]("action")
        actions.foreach(b => {
          val bValue = b.getAs[Long]("action")
          if (aValue < bValue) {
            resultList.+=(((aValue, bValue), 1))
          }
        })
      })
      resultList.toSeq
    }).reduceByKey(_ + _)

    nGramWordCount.collect().foreach(r => {
      println(" - " + r)
    })
  }
} 
Example 83
Source File: ManyToManyNestedJoin.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.manytomany

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable

object ManyToManyNestedJoin {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val nGramWordCount = jsonDf.rdd.flatMap(r => {
      val actions = r.getAs[mutable.WrappedArray[Row]]("actions")

      val resultList = new mutable.MutableList[(Long, NestedCount)]

      actions.foreach(a => {
        val aValue = a.getAs[Long]("action")
        val aNestedCount = new NestedCount
        actions.foreach(b => {
          val bValue = b.getAs[Long]("action")
          if (aValue < bValue) {
            aNestedCount.+=(bValue, 1)
          }
        })
        resultList.+=((aValue, aNestedCount))
      })
      resultList.toSeq
    }).reduceByKey((a, b) => a + b)

      //.reduceByKey(_ + _)

    nGramWordCount.collect().foreach(r => {
      println(" - " + r)
    })
  }
}


//1,2
//1,3
//1,4

//1 (2, 3, 4)

class NestedCount() extends Serializable{

  val map = new mutable.HashMap[Long, Long]()

  def += (key:Long, count:Long): Unit = {
    val currentValue = map.getOrElse(key, 0l)
    map.put(key, currentValue + count)
  }

  def + (other:NestedCount): NestedCount = {
    val result = new NestedCount

    other.map.foreach(r => {
      result.+=(r._1, r._2)
    })
    this.map.foreach(r => {
      result.+=(r._1, r._2)
    })
    result
  }

  override def toString(): String = {
    val stringBuilder = new StringBuilder
    map.foreach(r => {
      stringBuilder.append("(" + r._1 + "," + r._2 + ")")
    })
    stringBuilder.toString()
  }
} 
Example 84
Source File: SaltedExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.salted

import java.util.Random

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SaltedExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .getOrCreate()

    val jsonDfLeft = sparkSession.read.json(jsonPath)

    val saltedLeft = jsonDfLeft.rdd.flatMap(r => {
      val group = r.getAs[String]("group")
      val value = r.getAs[Long]("value")

      Seq((group + "_" + 0, value),(group + "_" + 1, value))
    })

    val jsonDfRight = sparkSession.read.json(jsonPath)

    val saltedRight = jsonDfRight.rdd.mapPartitions(it => {

      val random = new Random()

      it.map(r => {
        val group = r.getAs[String]("group")
        val value = r.getAs[Long]("value")

        (group + "_" + random.nextInt(2), value)
      })
    })

    jsonDfLeft.join(jsonDfRight).collect().foreach(r => {
      println("Normal.result:" + r)
    })
    println("----")
    saltedLeft.join(saltedRight).collect().foreach(r => {
      println("Salted.result:" + r)
    })
  }
} 
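The point of the salting is to spread a skewed key over several reduce tasks: the replicated side is emitted once per salt value, while the other side picks a salt at random, so matching rows still meet. A stripped-down sketch of just that join (all values below are toy data):

import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

object SaltedJoinSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("SaltedJoinSketch"))
    val salts = 2 // matches the two copies / nextInt(2) used in the example above

    // Small (replicated) side: emit one copy per salt value.
    val small = sc.parallelize(Seq(("hot", 1L), ("cold", 2L)))
      .flatMap { case (k, v) => (0 until salts).map(i => (s"${k}_$i", v)) }

    // Large (skewed) side: pick a salt at random per record.
    val large = sc.parallelize((1 to 10).map(i => ("hot", i.toLong)))
      .map { case (k, v) => (s"${k}_${Random.nextInt(salts)}", v) }

    small.join(large).collect().foreach(r => println("Salted.result:" + r))
    sc.stop()
  }
}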
Example 85
Source File: SmallWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.windowing.small

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession


object SmallWindowing {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val timeDifRdd = jsonDf.rdd.map(row => {
      val group = row.getAs[String]("group")
      val time = row.getAs[Long]("time")
      val value = row.getAs[Long]("value")
      //(key  , value)
      (group, (time, value))
    }).groupByKey().flatMap{case (group, records) =>

      var lastValue = 0l

      val localList = records.toSeq
      println("localList.size:" + localList.size)
      localList.sortBy(_._1).map{case (time, value) =>
        val dif = value - lastValue
        lastValue = value
        (group, time, value, dif)
      }
    }

    timeDifRdd.take(10).foreach(r => {
      println(r)
    })

    sparkSession.stop()
  }
} 
Example 86
Source File: SuperBigWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.windowing.superbig

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.sql.SparkSession

object SuperBigWindowing {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val jsonPath = args(0)
    val pageSize = args(1).toInt

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = spark.read.json(jsonPath)

    import spark.implicits._

    val diffDs = jsonDf.flatMap(row => {
      val group = row.getAs[String]("group")
      val time = row.getAs[Long]("time")
      val value = row.getAs[Long]("value")

      val timePage = time / pageSize

      if (time %  pageSize == 0) { //Am I on the edge of the page
        Seq((timePage, (time, value)), (timePage + 1, (time, value)))
      } else {
        Seq((timePage, (time, value)))
      }
    }).groupByKey(r => r._1).flatMapGroups((k, it) => {
      var lastValue = 0l

      it.toSeq.
        sortBy{case (page, (time, value)) => time}.
        map{case (page, (time, value)) =>
        val dif = value - lastValue
        lastValue = value
        (time, value, dif)
      }
    })

    diffDs.collect().foreach(r => println(" - " + r))

    spark.stop()

  }
} 
Example 87
Source File: SessionWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable

object SessionWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val sessionJson = args(0)
    val timeGap = args(1).toInt

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val sessionDs = sparkSession.read.json(sessionJson).as[JsonLeadLag]

    sessionDs.createOrReplaceTempView("session_table")

    sparkSession.sql("select * from session_table").collect().foreach(println)

    val sessionDefinitinonDf = sessionDs.rdd.map(r => {
      (r.group, r)
    }).groupByKey().flatMap{ case (group, jsonObjIt) =>

      var lastStart:Long = -1
      var lastEnd:Long = -1
      var sessionCount = 1
      var eventsInASession = 0

      val sessionList = new mutable.MutableList[SessionDefinition]

      jsonObjIt.toSeq.sortBy(r => r.ts).foreach(record => {
        val ts = record.ts
        eventsInASession += 1

        if (lastStart == -1) {
          lastStart = ts
        } else if (ts > lastEnd + timeGap) {
          sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession)
          lastStart = ts
          eventsInASession = 0
        }
        lastEnd = ts
      })
      sessionList
    }

    sessionDefinitinonDf.collect().foreach(println)

  }
}

case class SessionDefinition(group:String, sessionStart:Long, sessionEnd:Long, sessionLength:Long, sessionEvents:Int) 
Example 88
Source File: LeadLagExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object LeadLagExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "LEAD(value) OVER (PARTITION BY group ORDER BY ts) as v_after, " +
      "LAG(value)  OVER (PARTITION BY group ORDER BY ts) as v_before " +
      "FROM leadlag")

    leadLagDf.collect().foreach(println)

    leadLagDf.createOrReplaceTempView("leadlag_stage2")

    leadLagDf.printSchema()

    sparkSession.sql("select " +
      "group, ts, v_now, v_after, v_before, " +
      "case " +
      " when v_now < v_after and v_now < v_before then 'valley'" +
      " when v_now > v_after and v_now > v_before then 'peak'" +
      " else 'n/a' " +
      "end " +
      "from leadlag_stage2").collect().foreach(println)
  }
}

case class JsonLeadLag(group:String, ts:Long, value:Long) 
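The same LEAD/LAG query can be written with the DataFrame window API instead of a concatenated SQL string. A minimal sketch with inline toy data (the column values are invented):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{lag, lead}

object LeadLagDataFrameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local").appName("lead-lag-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1L, 10L), ("a", 2L, 30L), ("a", 3L, 20L)).toDF("group", "ts", "value")
    val byTsWithinGroup = Window.partitionBy("group").orderBy("ts")

    df.select($"group", $"ts",
        $"value".as("v_now"),
        lead("value", 1).over(byTsWithinGroup).as("v_after"),
        lag("value", 1).over(byTsWithinGroup).as("v_before"))
      .show()

    spark.stop()
  }
}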
Example 89
Source File: TumblingWindows.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object TumblingWindows {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, " +
      "round(ts / 3), " +
      "avg(value), " +
      "max(value), " +
      "min(value) " +
      "FROM leadlag " +
      "group by 1,2")

    leadLagDf.collect().foreach(println)

  }
} 
Example 90
Source File: InfectionPointWindow.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object InfectionPointWindow {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val inflectionPointJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val inflectionPointDs = sparkSession.read.json(inflectionPointJson).as[JsonInfectionPoint]

    inflectionPointDs.createOrReplaceTempView("inflection_point")

    sparkSession.sql("select * from inflection_point").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " +
      "FROM inflection_point " +
      "where event_type = 'inflection'")

    leadLagDf.collect().foreach(println)

  }
}

case class JsonInfectionPoint(group:String, ts:Long, value:Long, event_type:String) 
Example 91
Source File: SplidingWindows.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SplidingWindows {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " +
      "FROM leadlag")

    leadLagDf.collect().foreach(println)

  }
} 
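The moving-window query also maps onto the DataFrame API, using rowsBetween(-3, 0) for "3 preceding and current row" and giving each aggregate its own alias. A hedged sketch with toy data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{avg, max, min}

object SlidingWindowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local").appName("sliding-window-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1L, 5L), ("a", 2L, 7L), ("a", 3L, 3L), ("a", 4L, 9L))
      .toDF("group", "ts", "value")

    // Rows between 3 preceding and the current row, ordered by timestamp.
    val last4Rows = Window.orderBy("ts").rowsBetween(-3, 0)

    df.select($"group", $"ts", $"value".as("v_now"),
        avg("value").over(last4Rows).as("v_moving_avg"),
        min("value").over(last4Rows).as("v_moving_min"),
        max("value").over(last4Rows).as("v_moving_max"))
      .show()

    spark.stop()
  }
}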
Example 92
Source File: JsonNestedExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType}

import scala.collection.mutable

object JsonNestedExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val isLocal = args(0).equalsIgnoreCase("l")
    val jsonPath = args(1)
    val outputTableName = args(2)

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    val jsonDf = sparkSession.read.json(jsonPath)

    val localJsonDf = jsonDf.collect()

    println("--Df")
    jsonDf.foreach(row => {
      println("row:" + row)
    })
    println("--local")
    localJsonDf.foreach(row => {
      println("row:" + row)
    })

    jsonDf.createOrReplaceTempView("json_table")

    println("--Tree Schema")
    jsonDf.schema.printTreeString()
    println("--")
    jsonDf.write.saveAsTable(outputTableName)

    sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println)

    println("--")
    
    sparkSession.stop()
  }

  def populatedFlattedHashMap(row:Row,
                              schema:StructType,
                              fields:Array[StructField],
                              flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]],
                              parentFieldName:String): Unit = {
    fields.foreach(field => {

      println("field:" + field.dataType)
      if (field.dataType.isInstanceOf[ArrayType]) {
        val elementType = field.dataType.asInstanceOf[ArrayType].elementType
        if (elementType.isInstanceOf[StructType]) {
          val childSchema = elementType.asInstanceOf[StructType]

          val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq)

          populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".")
        }
      } else {
        val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any])
        fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name)))
      }

    })
  }
} 
Example 93
Source File: NestedTableExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object NestedTableExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)
    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
} 
Example 94
Source File: PopulateHiveTable.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}


object PopulateHiveTable {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .config("spark.sql.parquet.compression.codec", "gzip")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    populated1Df.repartition(2).write.saveAsTable("nested_populated")

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
} 
Example 95
Source File: CountingInAStreamExpUpdateStateByKey.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.dstream

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CountingInAStreamExpUpdateStateByKey {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1))
    ssc.checkpoint(checkpointFolder)

    val lines = ssc.socketTextStream(host, port.toInt)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(word => (word, 1))
      .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => {
        var value = state.getOrElse(0)
        values.foreach(i => {
          value += i
        })
        Some(value)
    })

    wordCounts.foreachRDD(rdd => {
      println("{")
      val localCollection = rdd.collect()
      println("  size:" + localCollection.length)
      localCollection.foreach(r => println("  " + r))
      println("}")
    })
    ssc.start()


    ssc.awaitTermination()


  }
} 
Example 96
Source File: CountingInAStreamExpBatchCounting.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.dstream

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CountingInAStreamExpBatchCounting {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2))
    ssc.checkpoint(checkpointFolder)

    val lines = ssc.socketTextStream(host, port.toInt)
    val words = lines.flatMap(line => line.toLowerCase.split(" "))
    val wordCounts = words.map(word => (word, 1))
      .reduceByKey((a,b) => a + b)

    wordCounts.foreachRDD(rdd => {
      println("{")
      val localCollection = rdd.collect()
      println("  size:" + localCollection.length)
      localCollection.foreach(r => println("  " + r))
      println("}")
    })

    ssc.start()

    ssc.awaitTermination()


  }
} 
Example 97
Source File: CountingInAStreamMapWithState.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

object CountingInAStreamMapWithState {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].
      flatMap(line => line.toLowerCase().split(" ")).
      map(word => WordCountEvent(word, 1))

    // Generate running word count
    val wordCounts = messageDs.groupByKey(tuple => tuple.word).
      mapGroupsWithState[WordCountInMemory, WordCountReturn](GroupStateTimeout.ProcessingTimeTimeout) {

      case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) =>
        var newCount = if (state.exists) state.get.countOfWord else 0

        events.foreach(tuple => {
          newCount += tuple.countOfWord
        })

        state.update(WordCountInMemory(newCount))

        WordCountReturn(word, newCount)
    }

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("update")
      .format("console")
      .start()

    query.awaitTermination()
  }
}

case class WordCountEvent(word:String, countOfWord:Int) extends Serializable {

}

case class WordCountInMemory(countOfWord: Int) extends Serializable {
}

case class WordCountReturn(word:String, countOfWord:Int) extends Serializable {

} 
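Example 97 passes GroupStateTimeout.ProcessingTimeTimeout to mapGroupsWithState but never calls state.setTimeoutDuration, so no group ever actually times out and idle words keep their state indefinitely. The fragment below is a hedged sketch, not part of the original project, of how the same update function could arm and handle a processing-time timeout; the one-hour duration is an arbitrary choice.

// Sketch: a timeout-aware variant of the update function used in Example 97.
case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) =>
  if (state.hasTimedOut) {
    // The group saw no new data within the timeout; drop its state and emit a zero count.
    state.remove()
    WordCountReturn(word, 0)
  } else {
    var newCount = if (state.exists) state.get.countOfWord else 0
    events.foreach(event => newCount += event.countOfWord)
    state.update(WordCountInMemory(newCount))
    // Re-arm the timeout so idle words are eventually evicted (duration chosen arbitrarily).
    state.setTimeoutDuration("1 hour")
    WordCountReturn(word, newCount)
  }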
Example 98
Source File: CountingInAStreamExpGroupBy.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.functions._

object CountingInAStreamExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].
      flatMap(line => line.toLowerCase().split(" "))

    // Generate running word count
    val wordCounts = messageDs.groupBy("value").count()

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
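Example 98 imports OutputMode and Trigger but never uses them, relying on the string output mode and the default trigger. As a hedged illustration of what those imports allow, the same console sink could be configured with a typed output mode and an explicit micro-batch interval; the 10-second trigger below is only an example value.

// Sketch: the same console sink with a typed output mode and a fixed micro-batch trigger.
val query = wordCounts.writeStream
  .outputMode(OutputMode.Complete())
  .trigger(Trigger.ProcessingTime("10 seconds"))
  .format("console")
  .start()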
Example 99
Source File: CountingInAStreamDatasetExpGroupBy.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

object CountingInAStreamDatasetExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].map(line => {
      MessageBuilder.build(line)
    }).as[Message]

    val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price"))

    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", checkpointFolder)
      .outputMode("complete")
      .format("console")
      .start()

    ticketOutput.awaitTermination()
  }
} 
Example 100
Source File: CountingInAStreamExpWindowing.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}


object CountingInAStreamExpWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[5]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[5]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => {
      MessageBuilder.build(line._1, line._2)
    }).filter(r => r != null).as[Message]


    val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp"))
      .withWatermark("eventTime", "30 seconds")
      .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker")
      .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window")


    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .option("checkpointLocation", checkpointFolder)
      .outputMode("update")
      //.outputMode("complete")
      .format("console")
      .option("truncate", false)
      .option("numRows", 40)
      .start()

    ticketOutput.awaitTermination()
  }

} 
Example 101
Source File: ZombieExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object ZombieExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)(
      (vertexId, zombieState, message) => {
        if (message > 0 && !zombieState.isZombie) {
          new ZombieStats(true, message)
        } else {
          zombieState
        }
      }, triplet => {
        if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) {
          Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l))
        } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) {
          Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l))
        } else {
          Iterator.empty
        }
      }, (a, b) => Math.min(a, b))

    println("ZombieBite")
    zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",ZobmieStat:" + r._2)
    })

    sparkSession.stop()
  }
}

case class ZombieStats (isZombie:Boolean, lengthOfLife:Long) 
Example 102
Source File: TrianglesExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object TrianglesExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    println("TriangleCount")
    graph.triangleCount().vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",triangleCount:" + r._2)
    })

    graph.pageRank(1.1, 1.1)

    sparkSession.stop()
  }
} 
Example 103
Source File: Test_example_CNN.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd,
  max => Bmax,
  min => Bmin,
  sum => Bsum
}
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    //1 Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    //2 Load the training data
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))
    
    //3 Set the training options and train the CNN model
    // opts: training options passed to CNNtrain
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    //4 Evaluate the model on the training data
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data,  f.predict_label.data)).take(200)
    println("Ԥ��ֵ")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }

  }
} 
Example 104
Source File: LRAccuracyTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package MLlib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}


object LRAccuracyTest {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
      l => LabeledPoint(l.label, l.features.toSparse))

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new SparseLogisticRegressionWithLBFGS()
      .setNumClasses(5)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)

    val precision = metrics.precision
    println("Precision = " + precision)


  }

} 
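Example 104 only prints metrics.precision, which is deprecated in newer Spark releases in favour of accuracy. MulticlassMetrics exposes several other summaries computed from the same predictionAndLabels RDD; a hedged sketch follows (availability of accuracy depends on the Spark version).

// Sketch: further evaluation output from the same MulticlassMetrics instance.
println("Accuracy = " + metrics.accuracy)                    // Spark 2.0+
println("Weighted precision = " + metrics.weightedPrecision)
println("Weighted recall = " + metrics.weightedRecall)
println("Weighted F1 = " + metrics.weightedFMeasure)
println("Confusion matrix:\n" + metrics.confusionMatrix)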
Example 105
Source File: MnistExample.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object MnistExample {


  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate()

    val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8)
      .map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr =>  Vectors.dense(arr.slice(1, 785)))

    val model = new KMeans()
      .setK(10)
      .setInitializationMode("random")
      .setMaxIterations(10)
      .run(trainRDD)

    println("final clusters:")
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))
  }

} 
Example 106
Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random


//spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

//guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

object ScalableKMeanTest {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
      vec
    }).cache()
    println(args.mkString(", "))
    println(data.count() + " records generated")

    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    } else {
      println("running mllib kmeans")
      val model = new KMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    }

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))

    sc.stop()
  }

} 
Example 107
Source File: MannWhitneyUTestSuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package test

import org.apache.commons.math3.stat.inference.MannWhitneyUTest
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}


object MannWhitneyUTestSuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    testMannWhitneyU
    testMannWhitneyUTest
  }

  private def testMannWhitneyU(): Unit ={
    val sample1 = Array(1d, 3d, 5, 7)
    val sample2 = Array(2, 4, 6, 8d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    val result = new MannWhitneyUTest()
      .mannWhitneyU(sample1, sample2)
    val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyU(rdd1, rdd2)
    assert(result == result2)
  }

  private def testMannWhitneyUTest(): Unit ={
    val sample1 = Array(1d, 3d, 5, 7)
    val sample2 = Array(2, 4, 6, 8d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    val result = new MannWhitneyUTest()
      .mannWhitneyUTest(sample1, sample2)
    val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyUTest(rdd1, rdd2)
    println(result)
    println(result2)
    assert(result == result2)
  }



} 
Example 108
Source File: TTestSuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.stat.inference.TestUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}



object TTestSuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    OneSampleTTest
    twoIndependentSampleTTest
    pairedTwoSampleTTest
  }

  def OneSampleTTest(): Unit ={
    val observed = Array(100d, 200d, 300d, 400d)
    val mu = 2.5d

    assert(TestUtils.tTest(mu, observed, 0.05) == new OneSampleTTest().tTest(mu, sc.parallelize(observed), 0.05))
    assert(TestUtils.tTest(mu, observed) == new OneSampleTTest().tTest(mu, sc.parallelize(observed)))
  }

  def twoIndependentSampleTTest(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 205d, 300d, 400d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    assert(TestUtils.tTest(sample1, sample2, 0.05) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2, 0.05))
    assert(TestUtils.tTest(sample1, sample2) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2))
  }

  def pairedTwoSampleTTest(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 202d, 300d, 400d)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)

    assert(TestUtils.pairedTTest(sample1, sample2, 0.05) == new PairTwoSampleTTest().tTest(rdd1, rdd2, 0.05))
    assert(TestUtils.pairedTTest(sample1, sample2) == new PairTwoSampleTTest().tTest(rdd1, rdd2))
  }

} 
Example 109
Source File: ANOVASuite.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package test

import java.util

import main.ANOVA.OneWayANOVA
import org.apache.commons.math3.stat.inference.TestUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.stat.OneSampleTTest
import org.apache.spark.{SparkContext, SparkConf}


object ANOVASuite {

  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local")
  val sc = new SparkContext(conf)

  def main(args: Array[String]) {
    OneWayANOVA
  }

  def OneWayANOVA(): Unit ={
    val sample1 = Array(100d, 200d, 300d, 400d)
    val sample2 = Array(101d, 200d, 300d, 400d)
    val sample3 = Array(102d, 200d, 300d, 400d)
    val data = new util.ArrayList[Array[Double]]()
    data.add(sample1)
    data.add(sample2)
    data.add(sample3)

    val rdd1 = sc.parallelize(sample1)
    val rdd2 = sc.parallelize(sample2)
    val rdd3 = sc.parallelize(sample3)
    val rddData = Seq(rdd1, rdd2, rdd3)

    assert(TestUtils.oneWayAnovaFValue(data) == new OneWayANOVA().anovaFValue(rddData))
    assert(TestUtils.oneWayAnovaPValue(data) == new OneWayANOVA().anovaPValue(rddData))
  }

} 
Example 110
Source File: ProxyPlugin.scala    From AppCrawler   with Apache License 2.0 5 votes vote down vote up
package com.testerhome.appcrawler.plugin

import java.io.File

import com.brsanthu.googleanalytics.GoogleAnalytics
import com.testerhome.appcrawler.URIElement
import com.testerhome.appcrawler.Plugin
import net.lightbody.bmp.BrowserMobProxyServer
import net.lightbody.bmp.proxy.CaptureType
import org.apache.log4j.{BasicConfigurator, Level, Logger}

import scala.util.Try


class ProxyPlugin extends Plugin {
  private var proxy: BrowserMobProxyServer = _
  val port = 7777

  //todo: proxy support
  override def start(): Unit = {
    BasicConfigurator.configure()
    Logger.getRootLogger.setLevel(Level.INFO)
    Logger.getLogger("ProxyServer").setLevel(Level.WARN)

    proxy = new BrowserMobProxyServer()
    proxy.setHarCaptureTypes(CaptureType.getNonBinaryContentCaptureTypes)
    proxy.setTrustAllServers(true)
    proxy.start(port)

    //proxy.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes)
    //proxy.setHarCaptureTypes(CaptureType.getHeaderCaptureTypes)
    log.info(s"proxy server listen on ${port}")
    proxy.newHar("start")
  }

  override def beforeElementAction(element: URIElement): Unit = {
    log.info("clear har")
    proxy.endHar()
    // create a new har
    val harFileName = getCrawler().getBasePathName() + ".har"
    proxy.newHar(harFileName)
  }

  override def afterElementAction(element: URIElement): Unit = {
    log.info("save har")
    val harFileName = getCrawler().getBasePathName() + ".har"
    val file = new File(harFileName)
    try {
      log.info(proxy.getHar)
      log.info(proxy.getHar.getLog)
      log.info(proxy.getHar.getLog.getEntries.size())
      log.info(s"har entry size = ${proxy.getHar.getLog.getEntries.size()}")
      if (proxy.getHar.getLog.getEntries.size() > 0) {
        proxy.getHar.writeTo(file)
      }
    } catch {
      case e: Exception =>{
        log.error("read har error")
        log.error(e.getCause)
        log.error(e.getMessage)
        e.getStackTrace.foreach(log.error)
      }
    }

  }

  override def stop(): Unit ={
    log.info("prpxy stop")
    proxy.stop()
  }
} 
Example 111
Source File: TestGA.scala    From AppCrawler   with Apache License 2.0 5 votes vote down vote up
package com.testerhome.appcrawler.ut

import com.brsanthu.googleanalytics.{GoogleAnalytics, PageViewHit}
import org.apache.log4j.{BasicConfigurator, Level, Logger}
import org.scalatest.FunSuite


class TestGA extends FunSuite{
  test("google analyse"){
    println("ga start")

    BasicConfigurator.configure()
    Logger.getRootLogger().setLevel(Level.WARN)
    val ga = new GoogleAnalytics("UA-74406102-1")
    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/demo${x}", "test"))
    })
    Thread.sleep(10000)

    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem1${x}", "test"))
    })

    Thread.sleep(10000)
    1 to 10 foreach(x=>{
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem2${x}", "test"))
    })
    //ga.post(new PageViewHit("http://appcrawler.io/test2", "test"))
    println("ga end")

  }

} 
Example 112
Source File: StreamHQL.scala    From spark-cep   with Apache License 2.0 5 votes vote down vote up
import java.util.Properties

import kafka.consumer.ConsumerConfig
import org.I0Itec.zkclient.ZkClient
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.streaming.StreamSQLContext
import org.apache.spark.sql.streaming.sources.MessageDelimiter
import org.apache.spark.streaming.dstream.ConstantInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.RedisManager

import scala.util.parsing.json.JSON

class TabDelimiter extends MessageDelimiter {
  override val delimiter = "\t"
}

object StreamDDL {
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val query = args(0)
    val sc = new SparkContext(new SparkConf())
    val ssc = new StreamingContext(sc, Seconds(1))
    val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc))
    streamSqlContext.command(query)
    new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print
    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop()
  }
}

object StreamHQL {

  object Redis {
    var initialized = false
    var manager: RedisManager = _
    def init(confMap: Map[String, String]) {
      if (initialized == false) {
        manager = new RedisManager(
          confMap("redis.shards"),
          confMap("redis.sentinels"),
          confMap("redis.database").toInt)
        manager.init
        initialized = true
      }
    }
  }

  def removeConsumerGroup(zkQuorum: String, groupId: String) {
    val properties = new Properties()
    properties.put("zookeeper.connect", zkQuorum)
    properties.put("group.id", groupId)
    val conf = new ConsumerConfig(properties)
    val zkClient = new ZkClient(conf.zkConnect)
    zkClient.deleteRecursive(s"/consumers/${conf.groupId}")
    zkClient.close()
  }

  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]]
    val qid = args(1)
    val query = args(2)
    val sc = new SparkContext(new SparkConf())
    val ssc = new StreamingContext(sc, Seconds(1))
    val hc = new HiveContext(sc)
    val streamSqlContext = new StreamSQLContext(ssc, hc)
    val redisExpireSec = confMap("redis.expire.sec").toInt
    ssc.checkpoint(s"checkpoint/$qid")
    hc.setConf("spark.streaming.query.id", qid)
    hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions"))

    removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid)
    val result = streamSqlContext.sql(query)
    val schema = result.schema

    result.foreachRDD((rdd, time) => {
      rdd.foreachPartition(partition => {
        Redis.init(confMap)
        val jedis = Redis.manager.getResource
        val pipe = jedis.pipelined
        partition.foreach(record => {
          val seq = record.toSeq(schema)
          val ts = time.milliseconds / 1000
          val hkey = seq.take(seq.size - 1).mkString(".")
          pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString)
          pipe.expire(qid + "." + ts, redisExpireSec)
        })
        pipe.sync
        Redis.manager.returnResource(jedis)
      })
    })

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
} 
Example 113
Source File: ZeroMQWordCount.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming.zeromq

import scala.language.implicitConversions
import scala.util.Random

import org.apache.log4j.{Level, Logger}
import org.zeromq.ZContext
import org.zeromq.ZMQ
import org.zeromq.ZMQException
import org.zeromq.ZMsg

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.zeromq.ZeroMQUtils


object ZeroMQWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>")
      // scalastyle:on println
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath).
    Logger.getRootLogger.setLevel(Level.WARN)

    val Seq(url, topic) = args.toSeq
    val sparkConf = new SparkConf().setAppName("ZeroMQWordCount")

    // Check Spark configuration for master URL, set it to local if not present.
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    // Create the context and set the batch size.
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val lines = ZeroMQUtils.createTextStream(
      ssc, url, true, Seq(topic.getBytes)
    )
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 114
Source File: TwitterLocations.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import org.apache.log4j.{Level, Logger}
import twitter4j.FilterQuery

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._


object TwitterLocations {
  def main(args: Array[String]) {
    if (args.length < 4 || args.length % 4 != 0) {
      System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " +
        "<access token> <access token secret> " +
        "[<latitude-south-west> <longitude-south-west>" +
        " <latitude-north-east> <longitude-north-east> ...]")
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) {
      Logger.getRootLogger.setLevel(Level.WARN)
    }

    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

    // Get bounding boxes of locations for which to retrieve Tweets from command line
    val locationArgs = args.takeRight(args.length - 4)
    val boundingBoxes = if (locationArgs.length == 0) {
      System.out.println("No location bounding boxes specified, using defaults for New York City")
      val nycSouthWest = Array(-74.0, 40.0)
      val nycNorthEast = Array(-73.0, 41.0)
      Array(nycSouthWest, nycNorthEast)
    } else {
      locationArgs.map(_.toDouble).sliding(2, 2).toArray
    }

    val sparkConf = new SparkConf().setAppName("TwitterLocations")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val locationsQuery = new FilterQuery().locations(boundingBoxes : _*)

    // Print Tweets from the specified coordinates
    // This includes Tweets geo-tagged in the bounding box defined by the coordinates
    // As well as Tweets tagged in places inside of the bounding box
    TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery))
      .map(tweet => {
        val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}")
        val place = Option(tweet.getPlace).map(_.getName)
        val location = latitude.getOrElse(place.getOrElse("(no location)"))
        val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        s"$location\t$text"
      })
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 115
Source File: TwitterAlgebirdHLL.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._

// scalastyle:off

object TwitterAlgebirdHLL {
  def main(args: Array[String]) {
    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) {
      Logger.getRootLogger.setLevel(Level.WARN)
    }

    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll.create(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1
          ) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 116
Source File: TwitterPopularTags.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import org.apache.log4j.{Level, Logger}

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf


object TwitterPopularTags {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " +
        "<access token> <access token secret> [<filters>]")
      System.exit(1)
    }

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) {
      Logger.getRootLogger.setLevel(Level.WARN)
    }

    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
    val filters = args.takeRight(args.length - 4)

    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

    val sparkConf = new SparkConf().setAppName("TwitterPopularTags")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val stream = TwitterUtils.createStream(ssc, None, filters)

    val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
                     .map{case (topic, count) => (count, topic)}
                     .transform(_.sortByKey(false))

    val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
                     .map{case (topic, count) => (count, topic)}
                     .transform(_.sortByKey(false))


    // Print popular hashtags
    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 117
Source File: TrainNewsClassWithDTDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession


object TrainNewsClassWithDTDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with DT Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the decision tree (DT) classifier
    val params = new ClassParams
    val dtClassifier = new DecisionTreeClassifier()
      .setMinInfoGain(params.minInfoGain)
      .setMaxDepth(params.maxDepth) // Spark currently supports a maximum tree depth of 30
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed predictions back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(dtClassifier.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.DTModelPath)

    data.unpersist()
    spark.stop()
  }
} 
Example 118
Source File: PredictNewsClassDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}


object PredictNewsClassDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")
    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams

    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("模型类型错误!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // Load the saved model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    //=== Model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= 评估结果 ==========")
    println(s"\n加权准确率:$precision")
    println(s"加权召回率:$recall")
    println(s"F1值:$f1")

    //    predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()

    spark.stop()
  }
} 
Example 119
Source File: TrainNewsClassWithLRDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession


object TrainNewsClassWithLRDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with LR Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the logistic regression (LR) model
    val params = new ClassParams
    val logisticRegression = new LogisticRegression()
      .setTol(params.converTol)
      .setMaxIter(params.maxIteration)
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed predictions back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(logisticRegression.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.LRModelPath)

    data.unpersist()
    spark.stop()
  }
} 
Example 120
Source File: StarsAnalysisDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}


object StarsAnalysisDemo {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"


    // Load the data, keep only the year and content fields, and clean up the content text
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }

        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop terms of length 1, and keep the part-of-speech tag for each term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map { case Row(year: String, terms: Seq[String]) =>
      (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Count the stars mentioned most often in 2016 news
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))

        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()

    stars2016.take(100).foreach(println)

    spark.stop()
  }
} 
Example 121
Source File: NLPPreprocessTest.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package nlp

import com.hankcs.hanlp.utility.Predefine
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.junit.Test

import scala.reflect.io.File


  @Test
  def testSegmenter(): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Segment Demo")
      .getOrCreate()

    val text = Seq(
      (0, "这段文本是用来做分词测试的!This text is for test!"),
      (1, "江州市长江大桥参加长江大桥通车仪式"),
      (2, "他邀请了不少于10个明星,有:范冰冰、赵薇、周杰伦等,还有20几位商业大佬")
    )
    val sentenceData = spark.createDataFrame(text).toDF("id", "sentence")

    // Set the HanLP configuration file path; by default it is located on the classpath
    val path = this.getClass.getClassLoader.getResource("").getPath
    Predefine.HANLP_PROPERTIES_PATH = path + File.separator + "hanlp.properties"

    val segmenter = new Segmenter()
      .isDelEn(true)
      .isDelNum(true)
      .isAddNature(true)
      .setSegType("StandardSegment")
      .setMinTermLen(2)
      .setMinTermNum(3)
      .setInputCol("sentence")
      .setOutputCol("segmented")

    segmenter.transform(sentenceData).show(false)

    spark.stop()
  }
} 
Example 122
Source File: printMatrix.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}
import breeze.linalg.{DenseMatrix => BDM, kron}


object printMatrix {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))(0)))

    val lines2 = sc.textFile("dataset/train.format", 8)
    val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0)))

    data2.take(10).foreach(record =>{
      println("label: " + record._1)
      val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt))
      val str = intm.toString(1000, 1000).replace('0', '.').replace('0', '*')
      println(str)
    })

  }
} 
Example 123
Source File: Example.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.{SparkConf, SparkContext}
import breeze.linalg.{DenseMatrix => BDM, _}

object Example {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/train.format", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784)))))

    val topology = new CNNTopology
    topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4)))
    val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16)
    val start = System.nanoTime()
    cnn.trainOneByOne(data)
    println("Training time: " + (System.nanoTime() - start) / 1e9)

    val right = data.map(record =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    }).sum()
    println(s"Predicting precision: $right " + right.toDouble/(data.count()))

//    val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8)
//      .map(line => line.split(",")).map(arr => arr.map(_.toDouble))
//      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))))

    val rightM = data.map(record =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    }).sum()
    println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count()))
  }

  
  def Vector2Tensor(record: Vector): Array[BDM[Double]] = {
    val mapSize = new Scale(28, 28)
    val m = new BDM[Double](mapSize.x, mapSize.y)
    var i: Int = 0
    while (i < mapSize.x) {
      var j: Int = 0
      while (j < mapSize.y) {
        m(i, j) = record(mapSize.x * i + j)
        j += 1
      }
      i += 1
    }
    Array(m)
  }


} 
Example 124
Source File: Driver.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import org.apache.log4j.{Logger, Level}
import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.{SparkContext, SparkConf}

object CNNDriver {

  def main(args: Array[String]) {

    val myLayers = new Array[Layer](8)
    myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28))
    myLayers(1) = new FunctionalLayer(new SigmoidFunction())
    myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24))
    myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12))
    myLayers(4) = new FunctionalLayer(new SigmoidFunction())
    myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8))
    myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4))
    myLayers(7) = new FunctionalLayer(new SigmoidFunction())
    val topology = FeedForwardTopology(myLayers)

    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/train.format", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => {
      val target = new Array[Double](12)
      target(arr(784).toInt) = 1
      val in = Vector2BDM(Vectors.dense(arr.slice(0, 784)))
      (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target))
    }).cache()

    val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12)

    feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data
//    .LBFGSOptimizer.setNumIterations(20)
      .SGDOptimizer
      .setMiniBatchFraction(0.002)
      .setConvergenceTol(0)
      .setNumIterations(1000)
      .setUpdater(new CNNUpdater(0.85))

    for(iter <- 1 to 1000){
      val start = System.nanoTime()
      val mlpModel = feedForwardTrainer.train(data)
      feedForwardTrainer.setWeights(mlpModel.weights())

      println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9)

      // predict
      val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count()
      val precision = right.toDouble / data.count()
      println(s"right: $right, count: ${data.count()}, precision: $precision")
    }
  }

  def Vector2BDM(record: Vector): BDM[Double] = {
    val mapSize = new MapSize(28, 28)
    val m = new BDM[Double](mapSize.x, mapSize.y)
    var i: Int = 0
    while (i < mapSize.x) {
      var j: Int = 0
      while (j < mapSize.y) {
        m(i, j) = record(mapSize.x * i + j)
        j += 1
      }
      i += 1
    }
    m
  }

} 
Example 125
Source File: MnistCSVDriver.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

object MnistCSVDriver {

  def main(args: Array[String]) {
    val topology = new CNNTopology
    topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28)))
    topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4)))
    val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16)

    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0))))

    val start = System.nanoTime()
    cnn.trainOneByOne(data)
    println("Training time: " + (System.nanoTime() - start) / 1e9)
  }

} 
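
The per-line transformation above treats column 0 as the digit label and binarises the remaining 784 pixel columns to {0.0, 1.0}. A standalone sketch of that mapping on a single made-up row (the pixel values are invented):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object MnistRowSketch {
  def main(args: Array[String]): Unit = {
    // One label column followed by 784 pixel columns.
    val arr = Array(7.0, 0.0, 128.0, 255.0) ++ Array.fill(781)(0.0)
    val lp = LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if (v > 0) 1.0 else 0.0)))
    println(lp.label)                                  // 7.0
    println(lp.features.toArray.take(3).mkString(",")) // 0.0,1.0,1.0
  }
}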
Example 126
Source File: LoggerOutputStream.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.utils

import java.io.{ByteArrayOutputStream, OutputStream}
import java.nio.charset.StandardCharsets

import org.apache.log4j.{Level, Logger}

class LoggerOutputStream(logger: Logger, level: Level) extends OutputStream {
  private val buffer = new ByteArrayOutputStream()

  override def write(b: Int) {
    buffer.write(b)
    if (b == '\n') {
      val line = buffer.toString(StandardCharsets.UTF_8.name())
      level match {
        case Level.TRACE => logger.trace(line)
        case Level.DEBUG => logger.debug(line)
        case Level.INFO  => logger.info(line)
        case Level.WARN  => logger.warn(line)
        case Level.ERROR => logger.error(line)
      }
      buffer.reset()
    }
  }
} 
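
LoggerOutputStream buffers bytes until it sees a newline, then forwards the completed line to log4j at the configured level (note that the match only covers TRACE through ERROR). A hypothetical usage sketch, not taken from the Hail sources, that routes a PrintStream through it:

import java.io.PrintStream
import is.hail.utils.LoggerOutputStream
import org.apache.log4j.{Level, Logger}

object LoggerOutputStreamUsage {
  def main(args: Array[String]): Unit = {
    // Every '\n'-terminated line printed here ends up in log4j at INFO instead of stdout.
    val out = new PrintStream(new LoggerOutputStream(Logger.getLogger("example"), Level.INFO))
    out.println("this line is forwarded to log4j")
    out.flush()
  }
}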
Example 127
Source File: APSPSpec.scala    From spark-all-pairs-shortest-path   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.{Outcome, FlatSpec}
import AllPairsShortestPath._
import breeze.linalg.{DenseMatrix => BDM}

class APSPSpec extends FlatSpec {

  val conf = new SparkConf().setAppName("AllPairsShortestPath").setMaster("local[4]").set("spark.driver.allowMultipleContexts", "true")
  val sc = new SparkContext(conf)

  override def withFixture(test: NoArgTest) : Outcome = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)
    test() // invoke the test function
  }

  def fourByFourBlockMatrx = {
    val entries = sc.parallelize(Array(
      (0, 1, 20), (0, 2, 4), (0, 3, 2),
      (1, 0, 2), (1, 2, 1), (1, 3, 3), (2, 0, 1),
      (2, 1, 6), (2, 3, 5), (3, 0, 4), (3, 1, 2), (3, 2, 2))).map { case (i, j, v) => MatrixEntry(i, j, v) }
    val coordMat = new CoordinateMatrix(entries)
    val matA = coordMat.toBlockMatrix(2, 2).cache()
    matA
  }

  def ApspPartitioner = {
    GridPartitioner(fourByFourBlockMatrx.numRowBlocks, fourByFourBlockMatrx.numColBlocks, fourByFourBlockMatrx.blocks.partitions.length)
  }

  def toBreeze(A: Matrix): BDM[Double] = {
    new BDM[Double](A.numRows, A.numCols, A.toArray)
  }

  "The sample 4x4 Block Matrix" should "be valid" in {
    fourByFourBlockMatrx.validate()
  }

  it should "match our APSP matrix" in {
    println(fourByFourBlockMatrx.toLocalMatrix())
    val result = new DistributedBlockFW
    val observed = toBreeze(result.compute(fourByFourBlockMatrx).toLocal())
    val expected = BDM(
      (0.0, 4.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 5.0, 0.0, 3.0),
      (3.0, 2.0, 2.0, 0.0)
    )
    assert(observed === expected)
  }
} 
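
The toBreeze helper works because both representations are column-major: Spark's Matrix.toArray returns values column by column, and Breeze's DenseMatrix(rows, cols, data) constructor reads the array the same way. A quick local sanity check (values are illustrative):

import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.mllib.linalg.Matrices

object ToBreezeSketch {
  def main(args: Array[String]): Unit = {
    val m = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0)) // column-major: rows (1,3) and (2,4)
    val b = new BDM[Double](m.numRows, m.numCols, m.toArray)
    println(m(0, 1) == b(0, 1)) // true, both are 3.0
  }
}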
Example 128
Source File: Part5_BusinessRecommendations.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.integration.yelp

import org.apache.log4j.{Level, Logger}
import org.opencypher.morpheus.api.{GraphSources, MorpheusSession}
import org.opencypher.morpheus.integration.yelp.YelpConstants._
import org.opencypher.okapi.api.value.CypherValue.CypherInteger
import org.opencypher.okapi.neo4j.io.MetaLabelSupport._
import org.opencypher.okapi.neo4j.io.Neo4jHelpers.{cypher => neo4jCypher, _}

object Part5_BusinessRecommendations extends App {
  Logger.getRootLogger.setLevel(Level.ERROR)

  log("Part 5 - Business Recommendation")

  lazy val inputPath = args.headOption.getOrElse(defaultYelpGraphFolder)

  implicit val morpheus: MorpheusSession = MorpheusSession.local()
  import morpheus._

  registerSource(fsNamespace, GraphSources.fs(inputPath).parquet)
  registerSource(neo4jNamespace, GraphSources.cypher.neo4j(neo4jConfig))

  val year = 2017

  log("Write to Neo4j, detect communities and find similar users within communities", 1)
  cypher(
    s"""
       |CATALOG CREATE GRAPH $neo4jNamespace.${coReviewAndBusinessGraphName(year)} {
       |  FROM $fsNamespace.${coReviewAndBusinessGraphName(year)}
       |  RETURN GRAPH
       |}
     """.stripMargin)

  // Use Neo4j Graph Algorithms to compute Louvain clusters and Jaccard similarity within clusters
  neo4jConfig.withSession { implicit session =>

    log("Find communities via Louvain", 1)
    val louvainStats = neo4jCypher(
      s"""
         |CALL algo.louvain('${coReviewAndBusinessGraphName(year).metaLabel}', 'CO_REVIEWS', {
         |  write:           true,
         |  weightProperty: 'reviewCount',
         |  writeProperty:  '${communityProp(year)}'
         |})
         |YIELD communityCount, nodes, loadMillis, computeMillis, writeMillis
         |RETURN communityCount, nodes, loadMillis + computeMillis + writeMillis AS total""".stripMargin).head
    log(s"Computing Louvain modularity on ${louvainStats("nodes")} nodes took ${louvainStats("total")} ms", 1)

    val communityNumber = louvainStats("communityCount").cast[CypherInteger].value.toInt

    log(s"Find similar users within $communityNumber communities", 1)
    // We use Jaccard similarity because it doesn't require equal length vectors
    (0 until communityNumber).foreach { communityNumber =>
      neo4jCypher(
        s"""
           |MATCH (u:User)-[r:REVIEWS]->(b:Business)
           |WHERE u.${communityProp(year)} = $communityNumber
           |WITH { item: id(u), categories: collect(id(b))} AS userData
           |WITH collect(userData) AS data
           |CALL algo.similarity.jaccard(data, {
           |  similarityCutoff:      0.5,
           |  write:                 true,
           |  writeRelationshipType: '${isSimilarRelType(year)}'})
           |YIELD similarityPairs
           |RETURN similarityPairs
       """.stripMargin
      )
    }
  }

  log("Load graphs back to Spark and compute recommendations", 1)

  // Reset schema cache to enable loading new properties
  catalog.source(neo4jNamespace).reset()

  val recommendations = cypher(
    s"""
       |FROM GRAPH $neo4jNamespace.${coReviewAndBusinessGraphName(year)}
       |MATCH (u:User)-[:${isSimilarRelType(year)}]-(o:User),
       |      (o:User)-[r:REVIEWS]->(b:Business)
       |WHERE NOT((u)<-[:REVIEWS]-(b:Business)) AND r.stars > 3
       |WITH id(u) AS user_id, u.name AS name, collect(DISTINCT b.name) AS recommendations
       |RETURN name AS user, recommendations
       |ORDER BY user_id DESC
       |LIMIT 10
       """.stripMargin
  )

  recommendations.show
} 
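
The comment above notes that Jaccard similarity is used because it does not require equal-length vectors. For reference, a plain-Scala sketch of the measure that algo.similarity.jaccard applies to each pair of reviewed-business sets (the example sets are made up):

object JaccardSketch {
  // Size of the intersection divided by the size of the union; defined as 1.0 when both sets are empty.
  def jaccard[A](a: Set[A], b: Set[A]): Double =
    if (a.isEmpty && b.isEmpty) 1.0
    else (a intersect b).size.toDouble / (a union b).size

  def main(args: Array[String]): Unit =
    println(jaccard(Set(1, 2, 3), Set(2, 3, 4))) // 2 / 4 = 0.5, right at the 0.5 similarityCutoff used above
}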
Example 129
Source File: Part4_BusinessTrends.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.integration.yelp

import org.apache.log4j.{Level, Logger}
import org.opencypher.morpheus.api.{GraphSources, MorpheusSession}
import org.opencypher.morpheus.integration.yelp.YelpConstants._
import org.opencypher.okapi.api.value.CypherValue.CypherFloat
import org.opencypher.okapi.neo4j.io.MetaLabelSupport._
import org.opencypher.okapi.neo4j.io.Neo4jHelpers.{cypher => neo4jCypher, _}

object Part4_BusinessTrends extends App {
  Logger.getRootLogger.setLevel(Level.ERROR)

  log("Part 4 - Business trends")

  lazy val inputPath = args.headOption.getOrElse(defaultYelpGraphFolder)

  implicit val morpheus: MorpheusSession = MorpheusSession.local()
  import morpheus._

  registerSource(fsNamespace, GraphSources.fs(inputPath).parquet)
  registerSource(neo4jNamespace, GraphSources.cypher.neo4j(neo4jConfig))

  log("Write to Neo4j and compute pageRank", 1)
  (2017 to 2018) foreach { year =>
    log(s"For year $year", 2)
    cypher(
      s"""
         |CATALOG CREATE GRAPH $neo4jNamespace.${coReviewedGraphName(year)} {
         |  FROM $fsNamespace.${coReviewedGraphName(year)}
         |  RETURN GRAPH
         |}
     """.stripMargin)

    // Compute PageRank using Neo4j Graph Algorithms
    neo4jConfig.withSession { implicit session =>
      val pageRankStats = neo4jCypher(
        s"""
           |CALL algo.pageRank('${coReviewedGraphName(year).metaLabel}', null, {
           |  iterations:     20,
           |  dampingFactor:  0.85,
           |  direction:      "BOTH",
           |  write:          true,
           |  writeProperty:  "pageRank$year",
           |  weightProperty: "reviewCount"
           |})
           |YIELD nodes, loadMillis, computeMillis, writeMillis
           |RETURN nodes, loadMillis + computeMillis + writeMillis AS total""".stripMargin).head
      log(s"Computing page rank on ${pageRankStats("nodes")} nodes took ${pageRankStats("total")} ms", 2)
    }
  }

  // Reset schema cache to enable loading new properties
  catalog.source(neo4jNamespace).reset()

  // Load graphs from Neo4j into Spark and compute trend rank for each business based on their page ranks.
  log("Load graphs back to Spark and compute trend rank", 1)
  cypher(
    s"""
       |CATALOG CREATE GRAPH $businessTrendsGraphName {
       |  FROM GRAPH $neo4jNamespace.${coReviewedGraphName(2017)}
       |  MATCH (b1:Business)
       |  FROM GRAPH $neo4jNamespace.${coReviewedGraphName(2018)}
       |  MATCH (b2:Business)
       |  WHERE b1.businessId = b2.businessId
       |  WITH b1 AS b, (b2.${pageRankProp(2018)} / ${normalizationFactor(2018)}) - (b1.${pageRankProp(2017)} / ${normalizationFactor(2017)}) AS trendRank
       |  CONSTRUCT
       |    CREATE (newB COPY OF b)
       |    SET newB.trendRank = trendRank
       |  RETURN GRAPH
       |}
     """.stripMargin)

  // Top 10 Increasing popularity
  cypher(
    s"""
       |FROM GRAPH $businessTrendsGraphName
       |MATCH (b:Business)
       |RETURN b.name AS name, b.address AS address, b.trendRank AS trendRank
       |ORDER BY trendRank DESC
       |LIMIT 10
     """.stripMargin).show

  def normalizationFactor(year: Int): Double = neo4jConfig.cypherWithNewSession(
    s"""
       |MATCH (b:Business)
       |RETURN sum(b.${pageRankProp(year)}) AS nf
     """.stripMargin).head("nf").cast[CypherFloat].value
} 
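
The trend rank above is the difference between the two yearly PageRank scores after each is normalised by that year's total (normalizationFactor). A small worked example with invented numbers:

object TrendRankSketch {
  def main(args: Array[String]): Unit = {
    val (pageRank2017, total2017) = (1.2, 300.0) // invented values
    val (pageRank2018, total2018) = (2.1, 450.0)
    val trendRank = pageRank2018 / total2018 - pageRank2017 / total2017
    println(trendRank) // ~6.7e-4; a positive value indicates rising popularity
  }
}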
Example 130
Source File: Part1_YelpImport.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.integration.yelp

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.opencypher.morpheus.api.io.GraphElement._
import org.opencypher.morpheus.api.io.MorpheusElementTable
import org.opencypher.morpheus.api.io.Relationship._
import org.opencypher.morpheus.api.{GraphSources, MorpheusSession}
import org.opencypher.morpheus.integration.yelp.YelpConstants._
import org.opencypher.morpheus.integration.yelp.YelpHelpers._
import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph}
import org.opencypher.okapi.api.io.conversion.{NodeMappingBuilder, RelationshipMappingBuilder}

object Part1_YelpImport extends App {
  Logger.getRootLogger.setLevel(Level.ERROR)

  log("Part 1 - Import")

  lazy val inputPath = args.headOption.getOrElse(defaultYelpJsonFolder)
  lazy val outputPath = args.lift(1).getOrElse(defaultYelpGraphFolder)

  implicit val morpheus: MorpheusSession = MorpheusSession.local()
  implicit val spark: SparkSession = morpheus.sparkSession

  storeGraph(inputPath, outputPath)

  def storeGraph(inputPath: String, outputPath: String): Unit = {
    // Load Yelp data into DataFrames
    log("Load yelp tables", 1)
    val yelpTables = loadYelpTables(inputPath)
    // Create a Property Graph from DataFrames
    log("Create property graph", 1)
    val propertyGraph = createPropertyGraph(yelpTables)
    log("Store in parquet", 1)
    storeAsParquet(yelpGraphName, propertyGraph)
  }

  def storeAsParquet(graphName: GraphName, graph: PropertyGraph): Unit = {
    // Init Property Graph Data Source (PGDS)
    val parquetPGDS = GraphSources.fs(outputPath).parquet
    // Store graph in PGDS
    if (parquetPGDS.hasGraph(graphName)) {
      log(s"Warning: A graph with GraphName $graphName already exists.")
    } else {
      parquetPGDS.store(yelpGraphName, graph)
    }
  }

  def createPropertyGraph(yelpTables: YelpTables): PropertyGraph = {
    // Define node tables
    // (:User)
    val userNodeTable = MorpheusElementTable.create(NodeMappingBuilder.on(sourceIdKey)
      .withImpliedLabel(userLabel)
      .withPropertyKey("name")
      .withPropertyKey("yelping_since")
      .withPropertyKey("elite")
      .build,
      yelpTables.userDf.prependIdColumn(sourceIdKey, userLabel))

    // (:Business)
    val businessNodeTable = MorpheusElementTable.create(NodeMappingBuilder.on(sourceIdKey)
      .withImpliedLabel(businessLabel)
      .withPropertyKey("businessId", "business_id")
      .withPropertyKey("name")
      .withPropertyKey("address")
      .withPropertyKey("city")
      .withPropertyKey("state")
      .build,
      yelpTables.businessDf.prependIdColumn(sourceIdKey, businessLabel))

    // Define relationship tables
    // (:User)-[:REVIEWS]->(:Business)
    val reviewRelTable = MorpheusElementTable.create(RelationshipMappingBuilder.on(sourceIdKey)
      .withSourceStartNodeKey(sourceStartNodeKey)
      .withSourceEndNodeKey(sourceEndNodeKey)
      .withRelType(reviewRelType)
      .withPropertyKey("stars")
      .withPropertyKey("date")
      .build,
      yelpTables.reviewDf
        .prependIdColumn(sourceIdKey, reviewRelType)
        .prependIdColumn(sourceStartNodeKey, userLabel)
        .prependIdColumn(sourceEndNodeKey, businessLabel))

    // Create property graph
    morpheus.graphs.create(businessNodeTable, userNodeTable, reviewRelTable)
  }
} 
Example 131
Source File: SparseNaiveBayes.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 132
Source File: DenseKMeans.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 133
Source File: StreamingExamples.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
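
A typical call site, sketched here rather than taken from any particular Spark example: invoke setStreamingLogLevels() once before building the StreamingContext so INFO chatter does not drown the streaming output. The socket source and port are assumptions for illustration.

import org.apache.spark.SparkConf
import org.apache.spark.examples.streaming.StreamingExamples
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingExamplesUsage {
  def main(args: Array[String]): Unit = {
    StreamingExamples.setStreamingLogLevels()
    val conf = new SparkConf().setMaster("local[2]").setAppName("streaming-example")
    val ssc = new StreamingContext(conf, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999) // assumed local test source
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}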
Example 134
Source File: YarnScheduler.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 135
Source File: ClientArguments.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level

import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 136
Source File: Application.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Application extends App {

  Logger.getLogger("org.apache.spark.SparkContext").setLevel(Level.WARN)

  val parsedArgs = ArgsParser.validateArgs(ArgsParser.parseArgs(args.toList))
  val conf = new SparkConf()
    .set("spark.ui.showConsoleProgress", "true")
    .setAppName("data-faker")
  val spark: SparkSession = SparkSession
    .builder()
    .config(conf)
    .enableHiveSupport()
    .getOrCreate()

  spark.sparkContext.setLogLevel("OFF")

  spark.sql(s"create database if not exists ${parsedArgs("database")}")

  val schema = YamlParser.parseSchemaFromFile(parsedArgs("file"))
  val dataGenerator = new DataGenerator(spark, parsedArgs("database"))

  dataGenerator.generateAndWriteDataFromSchema(schema)
} 
Example 137
Source File: LogUtils.scala    From Spark-MLlib-Twitter-Sentiment-Analysis   with Apache License 2.0 5 votes vote down vote up
package org.p7h.spark.sentiment.utils

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{Logging, SparkContext}


object LogUtils extends Logging {

  def setLogLevels(sparkContext: SparkContext) {

    sparkContext.setLogLevel(Level.WARN.toString)
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      logInfo(
        """Setting log level to [WARN] for streaming executions.
          |To override add a custom log4j.properties to the classpath.""".stripMargin)
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 138
Source File: LinearRegression.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
} 
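
The test loss above is the sum of squared errors over the test set, and the RMSE is the square root of its mean. The same arithmetic on a tiny, made-up set of (prediction, label) pairs:

object RmseSketch {
  def main(args: Array[String]): Unit = {
    val predictionAndLabel = Seq((1.0, 1.5), (2.0, 1.0), (3.0, 3.0))
    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.sum
    val rmse = math.sqrt(loss / predictionAndLabel.size)
    println(rmse) // sqrt((0.25 + 1.0 + 0.0) / 3) ≈ 0.645
  }
}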
Example 139
Source File: SparseNaiveBayes.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
} 
Example 140
Source File: DenseKMeans.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
} 
Example 141
Source File: StreamingExamples.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 142
Source File: YarnScheduler.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver

import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 143
Source File: ClientArguments.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level
import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    System.err.println(usage)
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = 512 // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 144
Source File: driver.scala    From SparkSMOTE   with MIT License 5 votes vote down vote up
import java.io._
import utils._
import SMOTE._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import breeze.linalg._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.{ArrayBuffer,Map}


object driver {
	
	def main(args: Array[String]) {
			
		val conf = new SparkConf()

		val options = args.map { arg =>
			arg.dropWhile(_ == '-').split('=') match {
				case Array(opt, v) => (opt -> v)
				case Array(opt) => (opt -> "")
				case _ => throw new IllegalArgumentException("Invalid argument: "+arg)
			}
		}.toMap

        val rootLogger = Logger.getRootLogger()
        rootLogger.setLevel(Level.ERROR)

		val sc = new SparkContext(conf)	

		// read in general inputs
		val inputDirectory = options.getOrElse("inputDirectory","")
		val outputDirectory = options.getOrElse("outputDirectory","")
		val numFeatures = options.getOrElse("numFeatures","0").toInt
		val oversamplingPctg = options.getOrElse("oversamplingPctg","1.0").toDouble
        val kNN = options.getOrElse("K","5").toInt
		val delimiter = options.getOrElse("delimiter",",")
		val numPartitions = options.getOrElse("numPartitions","20").toInt

		SMOTE.runSMOTE(sc, inputDirectory, outputDirectory, numFeatures, oversamplingPctg, kNN, delimiter, numPartitions)	

		println("The algorithm has finished running")
		sc.stop()
	}
} 
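
The driver parses "--key=value" style arguments by stripping leading dashes and splitting on '='. A standalone sketch of that idiom with invented arguments:

object OptionParsingSketch {
  def parse(args: Array[String]): Map[String, String] =
    args.map { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => opt -> v   // "--K=5"     -> ("K" -> "5")
        case Array(opt)    => opt -> ""  // "--verbose" -> ("verbose" -> "")
        case _             => throw new IllegalArgumentException("Invalid argument: " + arg)
      }
    }.toMap

  def main(args: Array[String]): Unit =
    println(parse(Array("--inputDirectory=/data/in", "--K=5", "--verbose")))
    // Map(inputDirectory -> /data/in, K -> 5, verbose -> )
}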
Example 145
Source File: DQMainClass.scala    From DataQuality   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package it.agilelab.bigdata.DataQuality.utils

import java.util.Locale

import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext


trait DQMainClass { this: DQSparkContext with Logging =>

  private def initLogger(): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.OFF)
    Logger.getLogger("io.netty").setLevel(Level.OFF)
    Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF)
    Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF)
  }

  private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = {
    if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration)
    else {
      if (settings.s3Bucket.isDefined) {
        sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get)
        sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      }
      FileSystem.get( sc.hadoopConfiguration)
    }
  }

  protected def body()(implicit fs: FileSystem,
                       sparkContext: SparkContext,
                       sqlContext: SQLContext,
                       sqlWriter: HistoryDBManager,
                       settings: DQSettings): Boolean

  def preMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s"               Starting execution of task $task")
    log.warn("************************************************************************")
  }

  def postMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s"               Finishing execution of task $task")
    log.warn("************************************************************************")
  }

  def main(args: Array[String]): Unit = {
    // set to avoid casting problems in metric result name generation
    Locale.setDefault(Locale.ENGLISH)
    initLogger()

    DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("","")) match {
      case Some(commandLineOptions) =>
        // Load our own config values from the default location, application.conf
        val settings = new DQSettings(commandLineOptions)
        val sparkContext = makeSparkContext(settings)
        val fs = makeFileSystem(settings, sparkContext)

        settings.logThis()(log)

        val sqlContext: SQLContext = if (settings.hiveDir.isDefined) {
          val hc =  new HiveContext(sparkContext)
          hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get)
          hc
        } else makeSqlContext(sparkContext)

        val historyDatabase = new HistoryDBManager(settings)

        // Starting application body
        preMessage(s"{${settings.appName}}")
        val startTime = System.currentTimeMillis()
        body()(fs, sparkContext, sqlContext, historyDatabase, settings)
        postMessage(s"{${settings.appName}}")

        log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)")
        log.info("Closing application...")

        historyDatabase.closeConnection()
        sparkContext.stop()

        log.info("Spark context were terminated. Exiting...")

      case None =>
        log.error("Wrong parameters provided")
        throw new Exception("Wrong parameters provided")

    }

  }

} 
Example 146
Source File: TestSparkContext.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
// scalastyle:off header.matches

trait TestSparkStreamingContext extends TestSparkContext {
  self: Suite =>

  implicit lazy val streaming: StreamingContext = StreamingContext.getActiveOrCreate(() =>
    new StreamingContext(sc, Seconds(1))
  )

  override def afterAll: Unit = {
    streaming.stop(stopSparkContext = false)
    super[TestSparkContext].afterAll
  }

} 
Example 147
Source File: SVMWithSGDDemo.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with LBFGS optimizing the loss function; supports multiclass classification (BFGS is a rank-two quasi-Newton method).
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
    // A LabeledPoint is a local vector, either dense or sparse, associated with a label.
      case LabeledPoint(label, features) =>
        val prediction = model.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)

  }

} 
Example 148
Source File: LogisticRegressionWithLBFGSDeom.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with LBFGS optimizing the loss function; supports multiclass classification (BFGS is a rank-two quasi-Newton method).
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
    // A LabeledPoint is a local vector, either dense or sparse, associated with a label.
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)

  }
} 
Example 149
Source File: SparseNaiveBayes.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    //numTraining = 81, numTest = 19.
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest
    // Test accuracy = 1.0.
    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 150
Source File: StreamingExamples.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 151
Source File: YarnScheduler.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver

import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 152
Source File: ClientArguments.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level
import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
    s"""
       |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
       |Usage: DriverClient kill <active-master> <driver-id>
       |
      |Options:
       |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
       |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
       |   -s, --supervise                Whether to restart the driver on failure
       |                                  (default: $DEFAULT_SUPERVISE)
       |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 153
Source File: Loggable.scala    From meetup-stream   with Apache License 2.0 5 votes vote down vote up
package core

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [ERROR] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
      Logger.getLogger("org").setLevel(Level.ERROR)
      Logger.getLogger("akka").setLevel(Level.ERROR)
      Logger.getLogger("streaming").setLevel(Level.WARN)
      Logger.getLogger("spark").setLevel(Level.WARN)
    }
  }
} 
Example 154
Source File: RemoteAppender.scala    From mist   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.mist.worker.logging

import io.hydrosphere.mist.core.logging.LogEvent
import org.apache.log4j.spi.LoggingEvent
import org.apache.log4j.{AppenderSkeleton, Level, SimpleLayout}

class RemoteAppender(sourceId: String, logsWriter: LogsWriter) extends AppenderSkeleton {

  override def append(event: LoggingEvent): Unit = {

    val timeStamp = event.timeStamp
    val message = event.getRenderedMessage
    val evt = event.getLevel match {
      case Level.INFO => LogEvent.mkInfo(sourceId, message, timeStamp)
      case Level.DEBUG => LogEvent.mkDebug(sourceId, message, timeStamp)
      case Level.ERROR =>
        LogEvent.mkError(
          sourceId, message,
          Option(event.getThrowableInformation).map(_.getThrowable),
          timeStamp
        )
      case Level.WARN => LogEvent.mkWarn(sourceId, message, timeStamp)
      case _ => LogEvent.mkInfo(sourceId, this.getLayout.format(event), timeStamp)
    }
    logsWriter.write(evt)
  }

  override def close(): Unit = ()

  override def requiresLayout(): Boolean = true
}


object RemoteAppender {

  def create(sourceId: String, logsWriter: LogsWriter): RemoteAppender = {
    val jobLogsAppender = new RemoteAppender(sourceId, logsWriter)
    jobLogsAppender.setLayout(new SimpleLayout)
    jobLogsAppender.setName(sourceId)
    jobLogsAppender
  }

} 
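
A hypothetical wiring sketch (the LogsWriter instance and the source id are assumptions): once the appender is attached to the root logger, every log4j event in the worker JVM is converted to a LogEvent and handed to the writer.

import io.hydrosphere.mist.worker.logging.{LogsWriter, RemoteAppender}
import org.apache.log4j.Logger

object RemoteAppenderUsage {
  // Forward all root-logger output for this JVM to the given LogsWriter.
  def install(sourceId: String, logsWriter: LogsWriter): Unit =
    Logger.getRootLogger.addAppender(RemoteAppender.create(sourceId, logsWriter))
}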
Example 155
Source File: SparseNaiveBayes.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 156
Source File: DenseKMeans.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 157
Source File: StreamingExamples.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging


object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
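A minimal usage sketch (not part of the original example): a hypothetical streaming driver that calls the helper above before building its StreamingContext. The object name, host, and port below are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.examples.streaming.StreamingExamples
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogLevelDemo {
  def main(args: Array[String]): Unit = {
    // Quiet the logs first, so Spark's default INFO output does not drown the example output.
    StreamingExamples.setStreamingLogLevels()
    val conf = new SparkConf().setAppName("StreamingLogLevelDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.socketTextStream("localhost", 9999).print()   // placeholder output operation
    ssc.start()
    ssc.awaitTermination()
  }
}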
Example 158
Source File: YarnScheduler.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 159
Source File: ClientArguments.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level

import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 160
Source File: CypherParser.scala    From ingraph   with Eclipse Public License 1.0 5 votes vote down vote up
package ingraph.compiler.cypher2gplan

import java.io.ByteArrayInputStream

import ingraph.compiler.exceptions.CompilerException
import org.apache.log4j.{Level, Logger}
import org.eclipse.emf.common.util.URI
import org.eclipse.emf.ecore.resource.Resource
import org.eclipse.xtext.diagnostics.Severity
import org.eclipse.xtext.resource.{XtextResource, XtextResourceSet}
import org.eclipse.xtext.util.CancelIndicator
import org.eclipse.xtext.validation.CheckMode
import org.slizaa.neo4j.opencypher.OpenCypherStandaloneSetup
import org.slizaa.neo4j.opencypher.openCypher.Cypher

import scala.collection.JavaConverters._


object CypherParser {
  def parseFile(fileName: String): Cypher = {
    Logger.getLogger("org.eclipse.xtext").setLevel(Level.ERROR)

    // https://typefox.io/how-and-why-use-xtext-without-the-ide
    val injector = new OpenCypherStandaloneSetup().createInjectorAndDoEMFRegistration()
    val resourceSet = injector.getInstance(classOf[XtextResourceSet])
    val filePath = "../queries/" + fileName + ".cypher"
    val resource = resourceSet.getResource(URI.createFileURI(filePath), true)
    validateAndThrowError(resource)

    resource.getContents.get(0).asInstanceOf[Cypher]
  }

  def parseString(queryString: String): Cypher = {
    Logger.getLogger("org.eclipse.xtext").setLevel(Level.ERROR)

    // https://wiki.eclipse.org/Xtext/FAQ
    val injector = new OpenCypherStandaloneSetup().createInjectorAndDoEMFRegistration()
    val resourceSet = injector.getInstance(classOf[XtextResourceSet])
    val resource = resourceSet.createResource(URI.createURI("http:/example.cypher"))
    val in = new ByteArrayInputStream(queryString.getBytes())
    resource.load(in, resourceSet.getLoadOptions())
    validateAndThrowError(resource)

    resource.getContents.get(0).asInstanceOf[Cypher]
  }

  def validateAndThrowError(resource: Resource) {
    var seenError = false
    var firstError: String = null
    val validator = resource.asInstanceOf[XtextResource].getResourceServiceProvider.getResourceValidator
    val issues = validator.validate(resource, CheckMode.ALL, CancelIndicator.NullImpl).asScala
    for (issue <- issues) {
      if (issue.getSeverity == Severity.ERROR && !seenError) {
        seenError = true
        firstError = issue.getMessage
      }
      println(issue.getMessage)
    }
    if (seenError) {
      throw new CompilerException(s"Error during cypher parse, the first error was: ${firstError}")
    }
  }
} 
Example 161
Source File: LocalSparkContext.scala    From sandpiper   with Apache License 2.0 5 votes vote down vote up
package sparkle.util

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.GraphXUtils
import org.apache.spark.{SparkConf, SparkContext}


  def withSpark[T](f: SparkContext => T): T = {
    val conf = new SparkConf()
    GraphXUtils.registerKryoClasses(conf)
    val sc = new SparkContext("local", "test", conf)
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    try {
      f(sc)
    } finally {
      sc.stop()
    }
  }
} 
Example 162
Source File: Driver.scala    From OnlineLDA_Spark   with Apache License 2.0 5 votes vote down vote up
package com.github.yuhao.yang

import java.util.Calendar
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ArrayBuffer

object Driver extends Serializable{

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)
    val inputDir = args(0)
    val filePaths = extractPaths(inputDir + "texts", true)
    val stopWordsPath = inputDir + "stop.txt"
    val vocabPath = inputDir + "wordsEn.txt"

    println("begin: " + Calendar.getInstance().getTime)
    println("path size: " + filePaths.size)
    assert(filePaths.size > 0)

    val conf = new SparkConf().setAppName("online LDA Spark")
    val sc = new SparkContext(conf)

    val vocab = Docs2Vec.extractVocab(sc, Seq(vocabPath), stopWordsPath)
    val vocabArray = vocab.map(_.swap)

    val K = args(1).toInt
//    val lda = OnlineLDA_Spark.runBatchMode(sc, filePaths, vocab, K, 50)
    val lda = OnlineLDA_Spark.runOnlineMode(sc, filePaths, vocab, K, args(2).toInt)

    println("_lambda:")
    for(row <- 0 until lda._lambda.rows){
      val v = lda._lambda(row, ::).t
      val topk = lda._lambda(row, ::).t.argtopk(10)
      val pairs = topk.map(k => (vocabArray(k), v(k)))
      val sorted = pairs.sortBy(_._2).reverse
       println(sorted.map(x => (x._1)).mkString(","), sorted.map(x => ("%2.2f".format(x._2))).mkString(","))
    }

    println("end: " + Calendar.getInstance().getTime())

  }

  def extractPaths(path: String, recursive: Boolean = true): Array[String] ={
    val docsets = ArrayBuffer[String]()
    val fileList = new java.io.File(path).listFiles()
    if(fileList == null) return docsets.toArray
    for(f <- fileList){
      if(f.isDirectory){
        if(recursive)
          docsets ++= extractPaths(f.getAbsolutePath, true)
      }
      else{
        docsets +=  f.getAbsolutePath
      }
    }
    docsets.toArray
  }

} 
Example 163
Source File: LinearRegression.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 164
Source File: SparseNaiveBayes.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils


object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 165
Source File: DenseKMeans.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors


object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
        s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println 
Example 166
Source File: StreamingExamples.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}


object StreamingExamples extends Logging {

  /** Set reasonable logging levels for streaming if the user has not configured log4j. */
  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
} 
Example 167
Source File: YarnScheduler.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver

import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
} 
Example 168
Source File: ClientArguments.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy

import java.net.{URI, URISyntaxException}

import scala.collection.mutable.ListBuffer

import org.apache.log4j.Level
import org.apache.spark.util.{IntParam, MemoryParam, Utils}


  private def printUsageAndExit(exitCode: Int) {
    // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
    //       separately similar to in the YARN client.
    val usage =
     s"""
      |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options]
      |Usage: DriverClient kill <active-master> <driver-id>
      |
      |Options:
      |   -c CORES, --cores CORES        Number of cores to request (default: $DEFAULT_CORES)
      |   -m MEMORY, --memory MEMORY     Megabytes of memory to request (default: $DEFAULT_MEMORY)
      |   -s, --supervise                Whether to restart the driver on failure
      |                                  (default: $DEFAULT_SUPERVISE)
      |   -v, --verbose                  Print more debugging output
     """.stripMargin
    // scalastyle:off println
    System.err.println(usage)
    // scalastyle:on println
    System.exit(exitCode)
  }
}

private[deploy] object ClientArguments {
  val DEFAULT_CORES = 1
  val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB
  val DEFAULT_SUPERVISE = false

  def isValidJarUrl(s: String): Boolean = {
    try {
      val uri = new URI(s)
      uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar")
    } catch {
      case _: URISyntaxException => false
    }
  }
} 
Example 169
Source File: SparkSessionTestWrapper.scala    From spark-stringmetric   with MIT License 5 votes vote down vote up
package com.github.mrpowers.spark.stringmetric

import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Logger, Level}

trait SparkSessionTestWrapper {

  lazy val spark: SparkSession = {
    Logger.getLogger("org").setLevel(Level.OFF)
    SparkSession
      .builder()
      .master("local")
      .appName("spark session")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()
  }

} 
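A usage sketch (not part of the original project): a hypothetical ScalaTest suite that mixes the trait in and reuses the lazily created session. ScalaTest's FunSuite is assumed to be on the test classpath, as in Example 177; the suite name and data are illustrative only.

import com.github.mrpowers.spark.stringmetric.SparkSessionTestWrapper
import org.scalatest.FunSuite

class WordCountSpec extends FunSuite with SparkSessionTestWrapper {

  test("groups duplicate words") {
    import spark.implicits._
    // Three rows, two distinct words.
    val words = Seq("ash", "birch", "ash").toDF("word")
    val grouped = words.groupBy("word").count()
    assert(grouped.count() === 2)
  }
}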
Example 170
Source File: SparkLocal.scala    From parquet-index   with Apache License 2.0 5 votes vote down vote up
package com.github.lightcopy.testutil

import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession


  private def localConf: SparkConf = {
    new SparkConf().
      setMaster("local[4]").
      setAppName("spark-local-test").
      set("spark.driver.memory", "1g").
      set("spark.executor.memory", "2g")
  }

  override def createSparkSession(): SparkSession = {
    SparkSession.builder().config(localConf).getOrCreate()
  }
} 
Example 171
Source File: SQLAggregationScala.scala    From infinispan-spark   with Apache License 2.0 5 votes vote down vote up
package org.infinispan.spark.examples.twitter

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, usage}
import org.infinispan.spark.rdd.InfinispanRDD


object SQLAggregationScala {

   def main(args: Array[String]) {
      if (args.length < 1) {
         usage("SQLAggregationScala")
      }

      Logger.getLogger("org").setLevel(Level.WARN)
      val infinispanHost = args(0)

      // Reduce the log level in the driver
      Logger.getLogger("org").setLevel(Level.WARN)

      // Create Spark Context
      val conf = getSparkConf("spark-infinispan-rdd-aggregation-scala")
      val sc = new SparkContext(conf)

      // Populate infinispan properties
      val config = Sample.getConnectorConf(infinispanHost)

      // Create RDD from infinispan data
      val infinispanRDD = new InfinispanRDD[Long, Tweet](sc, config)

      // Create a SQLContext, register a data frame and a temp table
      val valuesRDD = infinispanRDD.values
      val sparkSession = SparkSession.builder().config(conf).getOrCreate()
      val dataFrame = sparkSession.createDataFrame(valuesRDD, classOf[Tweet])
      dataFrame.createOrReplaceTempView("tweets")

      // Run the Query, collect and print results
      sparkSession.sql("SELECT country, count(*) as c from tweets WHERE country != 'N/A' GROUP BY country ORDER BY c desc")
        .collect().take(20).foreach(println)

   }

} 
Example 172
Source File: StreamConsumerScala.scala    From infinispan-spark   with Apache License 2.0 5 votes vote down vote up
package org.infinispan.spark.examples.twitter

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.infinispan.client.hotrod.RemoteCacheManager
import org.infinispan.client.hotrod.configuration.ConfigurationBuilder
import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, runAndExit, usageStream}
import org.infinispan.spark.examples.util.TwitterDStream
import org.infinispan.spark.stream._

import scala.collection.JavaConverters._
import scala.concurrent.duration._
import scala.language.postfixOps


object StreamConsumerScala {

   def main(args: Array[String]) {
      Logger.getLogger("org").setLevel(Level.WARN)

      if (args.length < 2) {
         usageStream("StreamConsumerScala")
      }

      val infinispanHost = args(0)
      val duration = args(1).toLong * 1000

      val conf = getSparkConf("spark-infinispan-stream-consumer-scala")
      val sparkContext = new SparkContext(conf)

      val streamingContext = new StreamingContext(sparkContext, Seconds(1))

      val config = Sample.getConnectorConf(infinispanHost)

      val remoteCacheManager = new RemoteCacheManager(new ConfigurationBuilder().withProperties(config.getHotRodClientProperties).build())
      val cache = remoteCacheManager.getCache[Long, Tweet]("default")

      val twitterDStream = TwitterDStream.create(streamingContext)

      val keyValueTweetStream = twitterDStream.map(s => (s.getId, s))

      keyValueTweetStream.writeToInfinispan(config)

      Repeat.every(5 seconds, {
         val keySet = cache.keySet()
         val maxKey = keySet.asScala.max
         println(s"${keySet.size} tweets inserted in the cache")
         println(s"Last tweet:${Option(cache.get(maxKey)).map(_.getText).getOrElse("<no tweets received so far>")}")
         println()
      })

      runAndExit(streamingContext, duration)
   }

   object Repeat {
      def every(d: Duration, code: => Unit) =
         Executors.newSingleThreadScheduledExecutor.scheduleWithFixedDelay(new Runnable {
            override def run(): Unit = code
         }, 10, d.toSeconds, TimeUnit.SECONDS)
   }

} 
Example 173
Source File: CallRecordGeneratorIngress.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import carly.data.CallRecord
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out   = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = RecordsPerSecond.value
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?

    val MaxTime           = 2.hours.toMillis
    val MaxUsers          = 100000
    val TS0               = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf     = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double                   = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf                            = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    }

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset
      .where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp"))
      .as[CallRecord]
    sampledData
  }
} 
Example 174
Source File: CallStatsAggregator.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._
class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in    = AvroInlet[CallRecord]("in")
  val out   = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)
  override def createLogic = new SparkStreamletLogic {
    val watermark     = Watermark.value
    val groupByWindow = GroupByWindow.value
//    val t0 = System.currentTimeMillis() // serialization error!

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query =
        inDataset
          .withColumn("ts", $"timestamp".cast(TimestampType))
          .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
          .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
          .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
          .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
} 
Example 175
Source File: CallAggregatorConsoleEgress.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import org.apache.log4j.{ Level, Logger }

import carly.data._

class CallAggregatorConsoleEgress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val in    = AvroInlet[AggregatedCallStats]("in")
  val shape = StreamletShape(in)

  override def createLogic = new SparkStreamletLogic {
    override def buildStreamingQueries =
      readStream(in).writeStream
        .format("console")
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
  }
} 
Example 176
Source File: CallStatsAggregator.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.callrecordaggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._
class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in    = AvroInlet[CallRecord]("in")
  val out   = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)
  override def createLogic = new SparkStreamletLogic {
    val watermark     = Watermark.value
    val groupByWindow = GroupByWindow.value

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }
    //end::docs-aggregationQuery-example[]

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query =
        inDataset
          .withColumn("ts", $"timestamp".cast(TimestampType))
          .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
          .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
          .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
          .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
  }
} 
Example 177
Source File: SparkFunSuite.scala    From spark-ranking-algorithms   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

// scalastyle:off
import org.scalatest.{Outcome, FunSuite}
import org.apache.log4j.{Level, Logger}


  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)

      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }

} 
Example 178
Source File: CSVProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers

import io.gzet.profilers.field.{CardinalityProfiler, EmptinessProfiler, MaskBasedProfiler, PredefinedMasks}
import io.gzet.profilers.raw.{AsciiProfiler, RowProfiler, StructuralProfiler}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.elasticsearch.spark.sql._

object CSVProfiler {

  Logger.getLogger("akka").setLevel(Level.WARN)
  Logger.getLogger("org").setLevel(Level.WARN)

  val HEADER = Array(
    "rowId",
    "firstName",
    "lastName",
    "email",
    "gender",
    "ipAddress",
    "shaPass"
  )

  def main(args: Array[String]) {

    val spark = SparkSession.builder().appName("Profiler").getOrCreate()
    import spark.implicits._

    val rawDf: Dataset[String] = spark.read.text(args.head).map(_.getAs[String](0))
    rawDf.cache()
    rawDf.count()

    val tabDf: Dataset[Array[String]] = Utils.split(rawDf, delimiter = ",")

    val sources = spark.sparkContext.broadcast(rawDf.inputFiles)
    val ingestTime = spark.sparkContext.broadcast(new java.util.Date().getTime)

    val headers = spark.sparkContext.broadcast(HEADER.zipWithIndex.map(_.swap).toMap)

    RowProfiler.apply().profile(rawDf).map({ report =>
      ("row.count", report.metricValue, Map[String, String]())
    }).union(AsciiProfiler.apply().profile(rawDf).map({ report =>
      ("row.ascii", report.metricValue, Map(Tags.ASCII_NAME -> report.ascii, Tags.ASCII_BINARY -> report.binary))
    })).union(StructuralProfiler.apply(delimiter = ",").profile(rawDf).map({ report =>
      ("field.count", report.metricValue, Map(Tags.EXTRA -> report.description, Tags.FIELD_COUNT -> report.fields.toString))
    })).union(EmptinessProfiler.apply().profile(tabDf).map({ report =>
      ("field.emptiness", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString))
    })).union(CardinalityProfiler.apply(topN = 5).profile(tabDf).map({ report =>
      ("field.cardinality", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_LOWGRAIN).profile(tabDf).map({ report =>
      ("field.ascii.low", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_HIGHGRAIN).profile(tabDf).map({ report =>
      ("field.ascii.high", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.POP_CHECKS).profile(tabDf).map({ report =>
      ("field.pop.check", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.CLASS_FREQS).profile(tabDf).map({ report =>
      ("field.class.freq", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).map({ case (metricName, metricValue, tags) =>
      val newTags = {
        if (tags.contains(Tags.FIELD_IDX)) {
          val fieldIdx = tags.get(Tags.FIELD_IDX).get.toInt
          val fieldName = headers.value.getOrElse(fieldIdx, "NA")
          tags ++ Map(Tags.FIELD_NAME -> fieldName)
        } else {
          tags
        }
      }

      ReportBuilder.create
        .withName(metricName)
        .withMetric(metricValue)
        .withSources(sources.value)
        .withTime(ingestTime.value)
        .withTags(newTags)
        .build

    }).toDF().saveToEs("profiler/mock")

  }

} 
Example 179
Source File: GzetLoader.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.community

import com.typesafe.config.ConfigFactory
import io.gzet.community.accumulo.{AccumuloLoader, AccumuloConfig}
import io.gzet.community.elasticsearch.{ESReader, ESConfig}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object GzetLoader {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]) = {

    val spark = SparkSession.builder()
      .appName("communities-loader")
      .getOrCreate()

    val sc = spark.sparkContext

    val blacklist = args.mkString(" ").split(",").map(_.trim).toSet
    val config = ConfigFactory.load()

    val esField = config.getString("io.gzet.elasticsearch.field")
    val esConf = ESConfig(
      config.getString("io.gzet.elasticsearch.nodes"),
      config.getInt("io.gzet.elasticsearch.port"),
      config.getString("io.gzet.elasticsearch.index")
    )

    val accumuloTable = config.getString("io.gzet.accumulo.table")
    val accumuloConf = AccumuloConfig(
      config.getString("io.gzet.accumulo.instance"),
      config.getString("io.gzet.accumulo.user"),
      config.getString("io.gzet.accumulo.password"),
      config.getString("io.gzet.accumulo.zookeeper")
    )

    val reader = new ESReader(esConf)
    val personsRdd = reader.loadPersons(sc, esField)
    personsRdd.cache()

    val writer = new AccumuloLoader(accumuloConf)
    writer.persist(sc, accumuloTable, personsRdd, blacklist)

  }

} 
Example 180
Source File: ESReaderIT.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.community

import io.gzet.community.elasticsearch.{ESReader, ESConfig}
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}

class ESReaderIT extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("Read from ES") { spark =>

    val sc = spark.sparkContext
    val esConf = ESConfig("localhost", 9200, "gzet/articles")
    val esField = "persons"

    val reader = new ESReader(esConf)
    val esQuery = "?q=persons:'David Bowie'"
    val tuples = reader.loadPersons(sc, esField, esQuery)
    tuples.cache
    assert(tuples.count() > 0L)
    tuples.take(100).foreach(println)
  }
} 
Example 181
Source File: AccumuloIT.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.community

import io.gzet.community.accumulo.{AccumuloAuthorization, AccumuloReader, AccumuloLoader, AccumuloConfig}
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}

class AccumuloIT extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("Write to Accumulo, Read from Accumulo") { spark =>

    val sc = spark.sparkContext
    val accumuloConf = AccumuloConfig("ACCUMULO_INSTANCE", "root", "secret", "localhost:2181")
    val accumuloTable = "persons"

    val writer = new AccumuloLoader(accumuloConf)
    val persisted = sc.parallelize(Seq(("Antoine Amend", "Matthew Hallett")))
    writer.persist(sc, accumuloTable, persisted)

    val reader = new AccumuloReader(accumuloConf)
    val retrieved = reader.read(sc, accumuloTable)
    retrieved.cache()

    val filtered = retrieved.filter(_.getSourceVertex == "Antoine Amend")
    filtered.cache()
    filtered.count should be(1L)
    filtered.map(_.getDestVertex).first() should be("Matthew Hallett")
    filtered.map(_.getCount).first() should be(1L)
    filtered.map(_.toString).take(1).foreach(println)

    writer.persist(sc, accumuloTable, persisted)
    val retrieved2 = reader.read(sc, accumuloTable)
    val filtered2 = retrieved2.filter(_.getSourceVertex == "Antoine Amend")

    filtered2.cache()
    filtered2.count should be(1L)
    filtered2.map(_.getDestVertex).first() should be("Matthew Hallett")
    filtered2.map(_.getCount).first() should be(2L)
    filtered2.map(_.toString).take(1).foreach(println)
  }


  localTest("Row security") { spark =>

    val accumuloConf = AccumuloConfig("ACCUMULO_INSTANCE", "root", "secret", "localhost:2181")
    val accumuloTable = "security"

    val sc = spark.sparkContext
    val writer = new AccumuloLoader(accumuloConf)
    val persisted = sc.parallelize(
      Seq(
        ("Antoine Amend", "Matthew Hallett"),
        ("Matthew Hallett", "Antoine Amend"),
        ("Antoine", "Matthew Hallett"))
    )

    writer.persist(sc, accumuloTable, persisted, Set("Antoine Amend"))

    println("WITH UNRESTRICTED ACCESS")
    val reader1 = new AccumuloReader(accumuloConf)
    val retrieved1 = reader1.read(sc, accumuloTable, Some(AccumuloAuthorization.BLACKLIST))
    retrieved1.cache()
    retrieved1.map(_.toString).foreach(println)
    assert(retrieved1.count() === 3)

    println("WITH RESTRICTED ACCESS")
    val reader2 = new AccumuloReader(accumuloConf)
    val retrieved2 = reader2.read(sc, accumuloTable)
    retrieved2.cache()
    retrieved2.map(_.toString).foreach(println)
    assert(retrieved2.count() === 1)

  }
} 
Example 182
Source File: GzetCommunitiesTest.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.community

import io.gzet.community.clustering.wcc.WCCDetection
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Graph, Edge}

import scala.io.Source

class GzetCommunitiesTest extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("WCC communities") { spark =>

    val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq
    val sc = spark.sparkContext
    val edges = sc.parallelize(lines).map({ line =>
      val a = line.split(",").map(_.toLong).sorted
      Edge(a.head, a.last, 1L)
    }).distinct()

    val graph = Graph.fromEdges(edges, 0L)

    graph.triplets.take(2).foreach(println)
    val communities = new WCCDetection(1).run(graph, sc)
    communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5))
  }
} 
Example 183
Source File: GodwinTest.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.timeseries.graph

import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Logger, Level}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.rdd.RDD

import scala.io.Source

class GodwinTest extends SparkFunSuite {

  Logger.getLogger("akka").setLevel(Level.OFF)
  Logger.getLogger("org").setLevel(Level.OFF)

  def buildEdges() = {
    Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => {
      val Array(source, target, weight) = s.split(",")
      Edge(source.toLong, target.toLong, weight.toDouble)
    }).toList
  }

  localTest("Test Random Walks") { sc =>
    val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1)
    val godwin = new Godwin(Seq(16))
    val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2)
    println(walks.map(_._1).mkString(" -> "))
    walks.last._1 should be(16)
  }

} 
Example 184
package com.chapter14.Serilazition

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession

object myCustomLogwithClosure extends Serializable {
 def main(args: Array[String]): Unit = {   
    val log = LogManager.getRootLogger
    
    //Everything is printed as INFO once the log level is set to INFO, until you set it to a new level, for example WARN.
    log.setLevel(Level.INFO)
    log.info("Let's get started!")
    
     // Setting the logger level to WARN: after that, nothing below WARN is printed
    log.setLevel(Level.WARN)
    
    // Creating Spark Session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    //Started the computation and printing the logging information
    log.warn("Started")
    val data = spark.sparkContext.parallelize(0 to 100000)
    data.foreach(i => log.info("My number"+ i))
    data.collect()
    log.warn("Finished")
 }
} 
Example 185
package com.chapter16.SparkTesting
import org.apache.spark.sql.SparkSession
import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger

class MultiplicaitonOfTwoNumber {
  def multiply(a: Int, b: Int): Int = {
    val product = a * b
    product
  }
}
object MakingTaskSerilazible {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val myRDD = spark.sparkContext.parallelize(0 to 100)
    myRDD.foreachPartition(s => {
      val notSerializable = new MultiplicaitonOfTwoNumber
      println(notSerializable.multiply(s.next(), s.next()))
    })
  }
} 
Example 186
Source File: myCustomLog.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License 5 votes vote down vote up
package com.chapter16.SparkTesting

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.spark.sql.SparkSession

object myCustomLogwithoutSerializable {
  def main(args: Array[String]): Unit = {   
    val log = LogManager.getRootLogger
    
    //Everything is printed as INFO once the log level is set to INFO, until you set it to a new level, for example WARN.
    log.setLevel(Level.INFO)
    log.info("Let's get started!")
    
     // Setting the logger level to WARN: after that, nothing below WARN is printed
    log.setLevel(Level.WARN)
    
    // Creating Spark Session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    //Started the computation and printing the logging information
    log.warn("Started")
    spark.sparkContext.parallelize(1 to 5).foreach(println)
    log.warn("Finished")
  }
} 
Example 187
package com.chapter16.SparkTesting

import org.apache.log4j.{ Level, LogManager }
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

class MyMapper(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
  def logMapper(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

//Companion object
object MyMapper {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

//Main object
object myCustomLogwithClosureSerializable {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Testing")
      .getOrCreate()
    log.warn("Started")
    val data = spark.sparkContext.parallelize(1 to 100000)
    val mapper = MyMapper(1)
    val other = mapper.logMapper(data)
    other.collect()
    log.warn("Finished")
  }
} 
Example 188
package com.chapter14.Serilazition

import org.apache.log4j.{ Level, LogManager, PropertyConfigurator }
import org.apache.spark._
import org.apache.spark.rdd.RDD

class MyMapper2(n: Int) {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
  def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

//Companion object
object MyMapper2 {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

//Main object
object KyroRegistrationDemo {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("My App")
      .setMaster("local[*]")
    conf.registerKryoClasses(Array(classOf[MyMapper2]))
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    log.warn("Started")
    val data = sc.parallelize(1 to 100000)
    val mapper = MyMapper2(10)
    val other = mapper.MyMapperDosomething(data)
    other.collect()
    log.warn("Finished")
  }
} 
Example 189
Source File: MyLog.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License 5 votes vote down vote up
package com.chapter14.Serilazition

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger

object MyLog1 extends Serializable {
 def main(args: Array[String]):Unit= {
   // Setting the logger level to WARN
   val log = LogManager.getRootLogger
   log.setLevel(Level.WARN)
   @transient lazy val log2 = org.apache.log4j.LogManager.getLogger("myLogger")

   // Creating Spark Context
   val conf = new SparkConf().setAppName("My App").setMaster("local[*]")
   val sc = new SparkContext(conf)

   //Started the computation and printing the logging information
   //log.warn("Started")
   //val i = 0
   val data = sc.parallelize(0 to 100000)
   data.foreach(i => log.info("My number"+ i))
   data.collect()
   log.warn("Finished")
 }
} 
Example 190
Source File: MyLogCompleteDemo.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License 5 votes vote down vote up
package com.chapter14.Serilazition

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.spark._
import org.apache.spark.rdd.RDD

class MyMapper(n: Int) extends Serializable{
 @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
 def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
   rdd.map{ i =>
     log.warn("mapping: " + i)
     (i + n).toString
   }
}

//Companion object
object MyMapper {
 def apply(n: Int): MyMapper = new MyMapper(n)
}

//Main object
object MyLog {
 def main(args: Array[String]) {
   val log = LogManager.getRootLogger
   log.setLevel(Level.WARN)
   val conf = new SparkConf()
                  .setAppName("My App")
                  .setMaster("local[*]")
   conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")               
   val sc = new SparkContext(conf)

   log.warn("Started")
   val data = sc.parallelize(1 to 100000)
   val mapper = MyMapper(1)
   val other = mapper.MyMapperDosomething(data)
   other.collect()
   log.warn("Finished")
 }
} 
Example 191
Source File: SparkPredictionTrainer.scala    From smart-meter   with MIT License 5 votes vote down vote up
package com.logimethods.nats.connector.spark.app

import java.util.Properties;
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming._

import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import java.util.function._

import java.time.{LocalDateTime, ZoneOffset}
import java.time.DayOfWeek._

import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel

object SparkPredictionTrainer extends App with SparkPredictionProcessor {
  log.setLevel(Level.WARN)

  val (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl) = setup(args)

  val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
  println("STREAMING_DURATION = " + streamingDuration)

  new Thread(new Runnable {
              def run() {
                 while( true ){
                   try {
                     val data = SparkPredictionProcessor.getData(sc, THRESHOLD)
                     val model = trainer.fit(data)
                     model.write.overwrite.save(PREDICTION_MODEL_PATH)
                     println("New model of size " + data.count() + " trained: " + model.uid)
                     Thread.sleep(streamingDuration)
                   } catch {
                     case e: Throwable => log.error(e)
                   }
                 }
              }
             }).start()
} 
Example 192
Source File: SparkProcessor.scala    From smart-meter   with MIT License 5 votes vote down vote up
package com.logimethods.nats.connector.spark.app

import java.util.Properties;
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming._

import io.nats.client.Nats._
import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._

import java.time.{LocalDateTime, ZoneOffset}

trait SparkProcessor {
  def setup(args: Array[String]) = {
    val inputSubject = args(0)
//    val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputSubject = args(1)
//    val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")
    println("Will process messages from '" + inputSubject + "' to '" + outputSubject + "'")

    val logLevel = scala.util.Properties.envOrElse("LOG_LEVEL", "INFO")
    println("LOG_LEVEL = " + logLevel)

    val targets = scala.util.Properties.envOrElse("TARGETS", "ALL")
    println("TARGETS = " + targets)

    val cassandraUrl = System.getenv("CASSANDRA_URL")
    println("CASSANDRA_URL = " + cassandraUrl)

    val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
    println("SPARK_MASTER_URL = " + sparkMasterUrl)

    val sparkCoresMax = System.getenv("SPARK_CORES_MAX")
    println("SPARK_CORES_MAX = " + sparkCoresMax)

    val conf = new SparkConf()
                  .setAppName(args(2))
                  .setMaster(sparkMasterUrl)
                  .set("spark.cores.max", sparkCoresMax)
                  .set("spark.cassandra.connection.host", cassandraUrl);
    val sc = new SparkContext(conf);

//    val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
//    val ssc = new StreamingContext(sc, new Duration(streamingDuration));
///    ssc.checkpoint("/spark/storage")

    val properties = new Properties();
    val natsUrl = System.getenv("NATS_URI")
    println("NATS_URI = " + natsUrl)
    properties.put("servers", natsUrl)
    properties.put(PROP_URL, natsUrl)

    val clusterId = System.getenv("NATS_CLUSTER_ID")

    val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")

    (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl)
  }

  def dataDecoder: Array[Byte] => Tuple2[Long,Float] = bytes => {
        val buffer = ByteBuffer.wrap(bytes);
        val epoch = buffer.getLong()
        val value = buffer.getFloat()
        (epoch, value)
      }
}


trait SparkStreamingProcessor extends SparkProcessor {
  def setupStreaming(args: Array[String]) = {
    val (properties, target, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl) = setup(args)

    val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
    println("STREAMING_DURATION = " + streamingDuration)

    val ssc = new StreamingContext(sc, new Duration(streamingDuration));
//    ssc.checkpoint("/spark/storage")

    (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl, streamingDuration)
  }
} 
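The `dataDecoder` above assumes each NATS payload is a fixed 12-byte frame: an 8-byte epoch followed by a 4-byte float. A minimal sketch of the matching encoder is shown below; the `PayloadEncoderSketch` name is hypothetical and not part of the smart-meter project.

import java.nio.ByteBuffer

object PayloadEncoderSketch {
  // Packs (epoch, value) into the 12-byte layout that dataDecoder reads back.
  def encode(epoch: Long, value: Float): Array[Byte] = {
    val buffer = ByteBuffer.allocate(java.lang.Long.BYTES + java.lang.Float.BYTES)
    buffer.putLong(epoch)
    buffer.putFloat(value)
    buffer.array()
  }
}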
Example 193
Source File: SparkTemperatureProcessor.scala    From smart-meter   with MIT License 5 votes vote down vote up
package com.logimethods.nats.connector.spark.app

import java.util.Properties;
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming._
import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.SomeColumns

import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._

import java.time.{LocalDateTime, ZoneOffset}

object SparkTemperatureProcessor extends App with SparkStreamingProcessor {
  val log = LogManager.getRootLogger
  log.setLevel(Level.WARN)

  val (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl, streamingDuration) =
    setupStreaming(args)

  // Temperatures //

  val temperatures =
    if (inputNatsStreaming) {
      NatsToSparkConnector
        .receiveFromNatsStreaming(classOf[Tuple2[Long,Float]], StorageLevel.MEMORY_ONLY, clusterId)
        .withNatsURL(natsUrl)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    } else {
      NatsToSparkConnector
        .receiveFromNats(classOf[Tuple2[Long,Float]], StorageLevel.MEMORY_ONLY)
        .withProperties(properties)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    }

  // Ideally, should be the AVG
  val singleTemperature = temperatures.reduceByKey(Math.max(_,_))
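  // A hedged sketch of the per-key average hinted at above (not wired in here;
  // `avgTemperature` is a hypothetical name):
  //   val avgTemperature = temperatures
  //     .mapValues(t => (t, 1))
  //     .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  //     .mapValues { case (sum, count) => sum / count }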

  if (logLevel.contains("TEMPERATURE")) {
    singleTemperature.print()
  }

  singleTemperature.saveToCassandra("smartmeter", "temperature")

  val temperatureReport = singleTemperature.map({case (epoch, temperature) => (s"""{"epoch": $epoch, "temperature": $temperature}""") })
  SparkToNatsConnectorPool.newPool()
                      .withProperties(properties)
                      .withSubjects(outputSubject) // "smartmeter.extract.temperature"
                      .publishToNats(temperatureReport)

  // Start //
  ssc.start();

  ssc.awaitTermination()
} 
Example 194
Source File: SparkBatch.scala    From smart-meter   with MIT License 5 votes vote down vote up
package com.logimethods.nats.connector.spark.app

import java.util.Properties;
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.log4j.Logger

import org.apache.spark.sql.SparkSession

//import com.datastax.spark.connector._
//import com.datastax.spark.connector.cql.CassandraConnector

// @see http://stackoverflow.com/questions/39423131/how-to-use-cassandra-context-in-spark-2-0
// @see https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html
// @see https://dzone.com/articles/cassandra-with-spark-20-building-rest-api
object SparkBatch extends App {
  val logLevel = System.getenv("APP_BATCH_LOG_LEVEL")
  println("APP_BATCH_LOG_LEVEL = " + logLevel)
  if ("DEBUG" != logLevel) {
  	Logger.getLogger("org").setLevel(Level.OFF)
  }
  
  val cassandraUrl = System.getenv("CASSANDRA_URL")
  println("CASSANDRA_URL = " + cassandraUrl)
  
  val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
  println("SPARK_MASTER_URL = " + sparkMasterUrl)
  
  val spark = SparkSession
    .builder()
    .master(sparkMasterUrl)
    .appName("Smartmeter Batch")
    .config("spark.cassandra.connection.host", cassandraUrl)
    //   .config("spark.sql.warehouse.dir", warehouseLocation)
    //.enableHiveSupport()
    .getOrCreate()
  
  spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "raw_data"))
    .load
    .createOrReplaceTempView("raw_data")
  
  val rawVoltageData = spark.sql("select * from raw_data")
  rawVoltageData.show(10)
  
  
  // @see http://stackoverflow.com/questions/40324153/what-is-the-best-way-to-insert-update-rows-in-cassandra-table-via-java-spark
  // Save the aggregated data back to Cassandra.
  // The aggregation below is an assumed reconstruction: `avgByTransformer` is used
  // but never defined in this snippet, and the `transformer` and `voltage` column
  // names are guesses based on the target table name.
  val avgByTransformer =
    spark.sql("select transformer, avg(voltage) as voltage from raw_data group by transformer")

  import org.apache.spark.sql.SaveMode
  avgByTransformer.write
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "avg_voltage_by_transformer"))
    .mode(SaveMode.Overwrite)
    .save()
} 
Example 195
Source File: Log4jAppender.scala    From rollbar-scala   with MIT License 5 votes vote down vote up
package com.storecove.rollbar.appenders

import org.apache.log4j.helpers.LogLog
import org.apache.log4j.spi.{LoggingEvent, ThrowableInformation}
import org.apache.log4j.{AppenderSkeleton, Level}


class Log4jAppender extends AppenderSkeleton with AbstractAppender {

    override def append(event: LoggingEvent): Unit = {
        if (enabled) {
            try {
                logBuffer.enqueueFinite(this.layout.format(event).trim, limit)

                if (event.getLevel.isGreaterOrEqual(notifyLevel)) {
                    val hasThrowable = event.getThrowableInformation != null || event.getMessage.isInstanceOf[Throwable]
                    if (!onlyThrowable || hasThrowable) {
                        rollbarNotifier.notify(event.getLevel.toString, event.getMessage.toString, getThrowable(event), getMDCContext)
                    }
                }
            } catch {
                case e: Exception =>
                    val stackTrace = e.getStackTrace.map(trace => trace.toString).mkString("\n")
                    LogLog.error("error=" + e.getClass.getName + " with message=" + e.getMessage + "\n" + stackTrace)
            }
        }
    }

    override def requiresLayout(): Boolean = true

    override def close(): Unit = {}


    override def activateOptions(): Unit = {
        if (this.apiKey == null || this.apiKey.isEmpty) {
            println("No apiKey set for the appender named [" + getName + "].")
        } else if (this.environment == null || this.environment.isEmpty) {
            println("No environment set for the appender named [" + getName + "].")
        } else {
            println(s"PARAMETERS SET\n\n$apiKey / $environment\n")
            super.activateOptions()
        }
    }

    protected def getThrowable(event: LoggingEvent): Option[Throwable] = {
        event.getThrowableInformation match {
            case throwableInfo: ThrowableInformation => Some(throwableInfo.getThrowable)
            case _ => event.getMessage match {
                case throwable: Throwable => Some(throwable)
                case _ => None
            }
        }
    }

    override protected def notifyLevel: Level = Level.toLevel(notifyLevelString)

    def setNotifyLevel(notifyLevel: String): Unit = notifyLevelString = notifyLevel
} 
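A minimal sketch of wiring this appender programmatically; the `setApiKey` and `setEnvironment` setters are assumed to live on `AbstractAppender` (they are not shown in this snippet), so they are left commented out, and the pattern layout is only an example.

import org.apache.log4j.{Logger, PatternLayout}

object RollbarAppenderWiringSketch {
  def main(args: Array[String]): Unit = {
    val appender = new Log4jAppender()
    appender.setName("rollbar")
    appender.setLayout(new PatternLayout("%d %-5p %c - %m%n")) // requiresLayout() is true
    appender.setNotifyLevel("ERROR")
    // appender.setApiKey("<rollbar-api-key>")   // assumed setter on AbstractAppender
    // appender.setEnvironment("production")     // assumed setter on AbstractAppender
    appender.activateOptions()
    Logger.getRootLogger.addAppender(appender)
  }
}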
Example 196
Source File: SparkTestContext.scala    From scalable-deeplearning   with Apache License 2.0 5 votes vote down vote up
package scaladl.util

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.scalatest.{BeforeAndAfterAll, Suite}

trait SparkTestContext extends BeforeAndAfterAll { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .config("spark.sql.warehouse.dir", "warehouse-temp")
      .getOrCreate()
    sc = spark.sparkContext
    Logger.getLogger("org").setLevel(Level.WARN)
  }

  override def afterAll() {
    try {
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }
} 
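A hedged usage sketch: a hypothetical ScalaTest suite mixing in SparkTestContext to get a quiet local SparkSession per suite (assumes ScalaTest's `FunSuite` is on the classpath).

import org.scalatest.FunSuite
import scaladl.util.SparkTestContext

class SparkTestContextUsageSpec extends FunSuite with SparkTestContext {
  test("a local SparkContext is available with org loggers at WARN") {
    val rdd = sc.parallelize(1 to 10)
    assert(rdd.count() == 10)
  }
}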
Example 197
Source File: SparkTransformerBenchmark.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package com.truecar.mleap.spark.benchmark

import java.io.{FileInputStream, File}

import com.esotericsoftware.kryo.io.Input
import com.truecar.mleap.runtime.LocalLeapFrame
import com.truecar.mleap.spark.benchmark.util.SparkSerializer
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.ml.Transformer
import org.scalameter.Bench
import scala.collection.JavaConverters._
import org.scalameter.api._
import org.scalameter.picklers.Implicits._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import com.truecar.mleap.spark.MleapSparkSupport._
import spray.json._
import com.truecar.mleap.serialization.mleap.v1.MleapJsonSupport._


object SparkTransformerBenchmark extends Bench.ForkedTime {
  lazy override val executor = {
    SeparateJvmsExecutor(
      Executor.Warmer.Zero,
      Aggregator.min[Double],
      new Measurer.Default)
  }

  val classLoader = getClass.getClassLoader
  val regressionFile = new File("/tmp/spark.transformer.kryo")
  val frameFile = new File("/tmp/frame.json")

  val inputStream = new FileInputStream(regressionFile)
  val input = new Input(inputStream)

  val regression: Transformer = SparkSerializer().read(input)
  val lines = scala.io.Source.fromFile(frameFile).mkString
  val frame = lines.parseJson.convertTo[LocalLeapFrame]

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  val sparkConf = new SparkConf()
    .setAppName("Spark Transformer Benchmark")
    .setMaster("local[1]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)

  val rdd = frame.dataset.data.map(a => Row(a.toSeq: _*)).toList.asJava
  val schema = frame.schema.toSpark
  val sparkFrame = sqlContext.createDataFrame(rdd, schema)

  val ranges = for {
    size <- Gen.range("size")(1000, 10000, 1000)
  } yield 0 until size

  measure method "transform" in {
    using(ranges) in {
      size =>
        size.foreach {
          _ => regression.transform(sparkFrame).head
        }
    }
  }

//  sc.stop()
} 
Example 198
Source File: package.scala    From kafka-scala-api   with Apache License 2.0 5 votes vote down vote up
package com

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}

package object example {

  def setupLogging(): Unit = {
    import org.apache.log4j.{Level, Logger}
    val rootLogger = Logger.getRootLogger
    rootLogger.setLevel(Level.ERROR)
  }

  def kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "127.0.0.1:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "mygroup1",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )

  def launchWithCheckpointing(logic: StreamingContext => Unit, appName:String, checkpointPath:String): Unit = {
    val streamingContext = new StreamingContext("local[*]", appName, Seconds(2))
    setupLogging()
    logic.apply(streamingContext)

    streamingContext.checkpoint(checkpointPath)
    streamingContext.start()
    streamingContext.awaitTermination()
  }

  def launchWithItself(logic: StreamingContext => Unit, appName:String): Unit = {
    val streamingContext = new StreamingContext("local[*]", appName, Seconds(2))
    setupLogging()
    logic.apply(streamingContext)

    streamingContext.start()
    streamingContext.awaitTermination()
  }
} 
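A hedged sketch tying `kafkaParams` and `launchWithCheckpointing` together; the object name, topic, app name, and checkpoint path are placeholders, and the spark-streaming-kafka-0-10 connector is assumed to be on the classpath.

package com.example

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object CheckpointedLauncherSketch extends App {
  def logic(ssc: StreamingContext): Unit = {
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("mytopic"), kafkaParams))
    stream.map(_.value()).print() // print each message value per micro-batch
  }

  launchWithCheckpointing(logic, "checkpointed-example", "/tmp/checkpoint")
}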
Example 199
Source File: KafkaFlowExampleTest.scala    From kafka-scala-api   with Apache License 2.0 5 votes vote down vote up
package com.example.flow

import org.apache.log4j.{Level, Logger}
import org.specs2.mutable._

import scala.Predef.{conforms => _}

class KafkaFlowExampleTest extends Specification {
  Logger.getRootLogger.setLevel(Level.ERROR)

  sequential

  "the transformStream method" should {
    implicit val fun = KafkaFlowExample.transformStream _

    "with 10 identical records" should {
      val records = Seq.fill(10)("""{"item_id":"abc123","amount":1.23,"time":1431504603105}""")
      "return a single record with the correct total" in new SparkStreamingSpec(records) {
        collector.length mustEqual 1
        val output = collector.head
        output.total mustEqual BigDecimal(12.3)
      }
    }

    "with invalid records" should {
      val records = Seq("this is not json", """{"this":"isn't in the right format"}""")
      "output no records" in new SparkStreamingSpec(records) {
        collector.length mustEqual 0
      }
    }
  }
} 
Example 200
Source File: package.scala    From kafka-scala-api   with Apache License 2.0 5 votes vote down vote up
package com

import org.apache.spark.streaming.{Seconds, StreamingContext}

package object example {

    def setupLogging(): Unit = {
      import org.apache.log4j.{Level, Logger}
      val rootLogger = Logger.getRootLogger
      rootLogger.setLevel(Level.ERROR)
    }

    def launch(logic: StreamingContext => Unit, appName:String, checkpointPath:String): Unit = {
      val streamingContext = new StreamingContext("local[*]", appName, Seconds(2))
      setupLogging()
      logic.apply(streamingContext)

      streamingContext.checkpoint(checkpointPath)
      streamingContext.start()
      streamingContext.awaitTermination()
    }
}