org.apache.log4j.Level Scala Examples
The following examples show how to use org.apache.log4j.Level.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 2
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } } // scalastyle:on println
Example 3
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --algorithm LR --regType L2 --regParam 1.0 \ | data/mllib/sample_binary_classification_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"BinaryClassification with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val model = params.algorithm match { case LR => val algorithm = new LogisticRegressionWithLBFGS() algorithm.optimizer .setNumIterations(params.numIterations) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() case SVM => val algorithm = new SVMWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() } val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val metrics = new BinaryClassificationMetrics(predictionAndLabel) println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") sc.stop() } } // scalastyle:on println
Example 4
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 5
Source File: StreamingExamples.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.log4j.{Level, Logger} import org.apache.spark.internal.Logging def setStreamingLogLevels() { val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements if (!log4jInitialized) { // We first log something to initialize Spark's default logging, then we override the // logging level. logInfo("Setting log level to [WARN] for streaming example." + " To override add a custom log4j.properties to the classpath.") Logger.getRootLogger.setLevel(Level.WARN) } } }
Example 6
Source File: YarnScheduler.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.yarn.util.RackResolver import org.apache.log4j.{Level, Logger} import org.apache.spark._ import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.Utils private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { // RackResolver logs an INFO message whenever it resolves a rack, which is way too often. if (Logger.getLogger(classOf[RackResolver]).getLevel == null) { Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN) } // By default, rack is unknown override def getRackForHost(hostPort: String): Option[String] = { val host = Utils.parseHostPort(hostPort)._1 Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation) } }
Example 7
Source File: ClientArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.annotation.tailrec import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 8
Source File: DLClassifierLeNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _} import com.intel.analytics.bigdl.dlframes.DLClassifier import com.intel.analytics.bigdl.models.lenet.LeNet5 import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.ClassNLLCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext object DLClassifierLeNet { LoggerFilter.redirectSparkInfoLogs() def main(args: Array[String]): Unit = { val inputs = Array[String]("Feature data", "Label data") trainParser.parse(args, new TrainParams()).foreach(param => { val conf = Engine.createSparkConf() .setAppName("MLPipeline Example") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) val sqLContext = SQLContext.getOrCreate(sc) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1) val trainingRDD : RDD[Data[Float]] = trainSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => { val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) }) val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*) val model = LeNet5(classNum = 10) val criterion = ClassNLLCriterion[Float]() val featureSize = Array(28, 28) val estimator = new DLClassifier[Float](model, criterion, featureSize) .setFeaturesCol(inputs(0)) .setLabelCol(inputs(1)) .setBatchSize(param.batchSize) .setMaxEpoch(param.maxEpoch) val transformer = estimator.fit(trainingDF) val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1) val validationRDD: RDD[Data[Float]] = validationSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch => val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) } val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*) val transformed = transformer.transform(validationDF) transformed.show() sc.stop() }) } } private case class Data[T](featureData : Array[T], labelData : Array[T])
Example 9
Source File: ImagePredictor.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification import java.nio.file.Paths import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dlframes.DLClassifierModel import com.intel.analytics.bigdl.example.imageclassification.MlUtils._ import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object ImagePredictor { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO) def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).map(param => { val conf = Engine.createSparkConf() conf.setAppName("Predict with trained model") val sc = new SparkContext(conf) Engine.init val sqlContext = new SQLContext(sc) val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val model = loadModel(param) val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize)) .setBatchSize(param.batchSize) .setFeaturesCol("features") .setPredictionCol("predict") val valRDD = if (param.isHdfs) { // load image set from hdfs imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true) } else { // load image set from local val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false) sc.parallelize(imagesLoad(paths, 256), partitionNum) } val transf = RowToByteRecords() -> BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize) -> BGRImgNormalizer(testMean, testStd) -> BGRImgToImageVector() val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf) valTrans.transform(valDF) .select("imageName", "predict") .collect() .take(param.showNum) .foreach(println) sc.stop() }) } }
Example 10
Source File: ImageNetInference.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8 import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.utils._ import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object ImageNetInference { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO) val logger: Logger = Logger.getLogger(getClass) import Utils._ def main(args: Array[String]): Unit = { testParser.parse(args, TestParams()).foreach(param => { val conf = Engine.createSparkConf() .setAppName("Test model on ImageNet2012 with Int8") .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize).toDistributed().data(train = false) val model = Module.loadModule[Float](param.model).quantize() model.evaluate() val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float], new Top5Accuracy[Float])) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() }) } }
Example 11
Source File: GenerateInt8Scales.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8 import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet import com.intel.analytics.bigdl.nn.{Graph, Module} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object GenerateInt8Scales { val logger: Logger = Logger.getLogger(getClass) Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def genereateInt8Scales(model: Graph[Float], modelName: String, evaluationSet: RDD[MiniBatch[Float]]): Unit = { model.evaluate() model.setInputDimMask(0, true) model.setOutputDimMask(0, true) model.setWeightDimMask(1, true) logger.info(s"Generate the scales for $modelName ...") val samples = evaluationSet .repartition(1) // repartition (shuffle) will have better accuracy .take(1) // only split one batch to sample .map(_.getInput().toTensor[Float]) samples.foreach { sample => model.forward(sample) model.calcScales(sample) } // we should clean the state, such as output model.clearState() logger.info(s"Generate the scales for $modelName done.") } def saveQuantizedModel(model: Graph[Float], modelName: String): Unit = { val suffix = ".bigdl" val prefix = modelName.stripSuffix(suffix) val name = prefix.concat(".quantized").concat(suffix) logger.info(s"Save the quantized model $name ...") // it will force overWrite the existed model file model.saveModule(name, overWrite = true) logger.info(s"Save the quantized model $name done.") } def main(args: Array[String]): Unit = { genInt8ScalesParser.parse(args, GenInt8ScalesParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Quantize the model") .set("spark.akka.frameSize", 64.toString) .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() val imageFrame = DataSet.SeqFileFolder.filesToImageFrame(param.folder, sc, 1000, partitionNum = Option(partitionNum)) // the transformer is the same as as that in validation during training val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize).toDistributed().data(train = false) // Currently, we only support the graph model, so we add a `toGraph` // if the model is already graph, you can need not to it. val model = Module.loadModule[Float](param.model).toGraph() genereateInt8Scales(model, param.model, evaluationSet) saveQuantizedModel(model, param.model) } } }
Example 12
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl.dataset.{DataSet, SampleToBatch} import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample} import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { testParser.parse(args, new TestParams()).foreach { param => System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", param.coreNumber.toString) Engine.init val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val evaluationSet = DataSet.array(load(validationData, validationLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToSample() -> SampleToBatch( batchSize = param.batchSize, None, None, None, partitionNum = Some(1)) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet.toLocal(), Array(new Top1Accuracy[Float].asInstanceOf[ValidationMethod[Float]])) result.foreach(r => println(s"${r._2} is ${r._1}")) } } }
Example 13
Source File: Predict.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample} import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.utils.Engine import com.intel.analytics.bigdl.dataset.Sample import com.intel.analytics.bigdl.optim.LocalPredictor import org.apache.log4j.{Level, Logger} import scala.collection.mutable.ArrayBuffer object Predict { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).foreach { param => System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", (param.coreNumber.toString)) Engine.init val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val rawData = load(validationData, validationLabel) val iter = rawData.iterator val sampleIter = GreyImgToSample()( GreyImgNormalizer(trainMean, trainStd)( BytesToGreyImg(28, 28)(iter))) var samplesBuffer = ArrayBuffer[Sample[Float]]() while (sampleIter.hasNext) { val elem = sampleIter.next().clone() samplesBuffer += elem } val samples = samplesBuffer.toArray val model = Module.load[Float](param.model) val localPredictor = LocalPredictor(model) val result = localPredictor.predict(samples) val result_class = localPredictor.predictClass(samples) result_class.foreach(r => println(s"${r}")) } } }
Example 14
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import com.intel.analytics.bigdl.models.lenet.LeNet5 import org.apache.log4j.{Level, Logger} object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", param.coreNumber.toString) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { LeNet5(classNum = 10) } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = ClassNLLCriterion[Float]()) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() }) } }
Example 15
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.utils.Engine import com.intel.analytics.bigdl.models.resnet.Utils._ import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod, ValidationResult} import com.intel.analytics.bigdl.dataset.image.{BGRImgNormalizer, BGRImgToSample, BytesToBGRImg} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) def main(args: Array[String]): Unit = { testParser.parse(args, TestParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Test ResNet on Cifar10") .set("spark.akka.frameSize", 64.toString) .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rddData = sc.parallelize(loadTest(param.folder), partitionNum) val transformer = BytesToBGRImg() -> BGRImgNormalizer(Cifar10DataSet.trainMean, Cifar10DataSet.trainStd) -> BGRImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) println(model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(param.batchSize)) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 16
Source File: TestImageNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.CropCenter import com.intel.analytics.bigdl.models.resnet.ResNet.DatasetType import com.intel.analytics.bigdl.nn.{Module, StaticGraph} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.transform.vision.image.{ImageFeature, MTImageFeatureToBatch, MatToTensor, PixelBytesToMat} import com.intel.analytics.bigdl.transform.vision.image.augmentation.{ChannelScaledNormalizer, RandomCropper, RandomResize} import com.intel.analytics.bigdl.utils._ import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object TestImageNet { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO) val logger = Logger.getLogger(getClass) import Utils._ def main(args: Array[String]): Unit = { testParser.parse(args, new TestParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Test model on ImageNet2012") .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val model = Module.loadModule[Float](param.model) val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize).toDistributed().data(train = false) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float], new Top5Accuracy[Float])) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() }) } }
Example 17
Source File: TrainCIFAR10.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Module} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.models.resnet.ResNet.{DatasetType, ShortcutType} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ object TrainCIFAR10 { LoggerFilter.redirectSparkInfoLogs() import Utils._ def cifar10Decay(epoch: Int): Double = if (epoch >= 122) 2.0 else if (epoch >= 81) 1.0 else 0.0 def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train ResNet on Cifar10") // Will throw exception without this config when has only one executor .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val batchSize = param.batchSize val (imageSize, lrSchedule, maxEpoch, dataSet) = (32, DatasetType.CIFAR10, param.nepochs, Cifar10DataSet) val trainDataSet = dataSet.trainDataSet(param.folder, sc, imageSize, batchSize) val validateSet = dataSet.valDataSet(param.folder, sc, imageSize, batchSize) val shortcut: ShortcutType = param.shortcutType match { case "A" => ShortcutType.A case "B" => ShortcutType.B case _ => ShortcutType.C } val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { val curModel = if (param.graphModel) { ResNet.graph(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } else { ResNet(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } if (param.optnet) { ResNet.shareGradInput(curModel) } ResNet.modelInit(curModel) curModel } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0, weightDecay = param.weightDecay, momentum = param.momentum, dampening = param.dampening, nesterov = param.nesterov, learningRateSchedule = SGD.EpochDecay(cifar10Decay)) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new CrossEntropyCriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float])) .setEndWhen(Trigger.maxEpoch(maxEpoch)) .optimize() sc.stop() }) } }
Example 18
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.inception import com.intel.analytics.bigdl.dataset.{ByteRecord, DataSet} import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.{Top1Accuracy, Top5Accuracy, Validator} import com.intel.analytics.bigdl.utils.Engine import org.apache.hadoop.io.Text import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Options._ val imageSize = 224 def main(args: Array[String]) { testParser.parse(args, new TestParams()).foreach { param => val batchSize = param.batchSize.getOrElse(128) val conf = Engine.createSparkConf().setAppName("Test Inception on ImageNet") val sc = new SparkContext(conf) Engine.init // We set partition number to be node*core, actually you can also assign other partitionNum val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rawData = sc.sequenceFile(param.folder, classOf[Text], classOf[Text], partitionNum) .map(image => { ByteRecord(image._2.copyBytes(), DataSet.SeqFileFolder.readLabel(image._1).toFloat) }).coalesce(partitionNum, true) val rddData = DataSet.SeqFileFolder.filesToRdd(param.folder, sc, 1000) val transformer = BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize, CropCenter) -> HFlip(0.5) -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225) -> BGRImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float], new Top5Accuracy[Float]), param.batchSize) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 19
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet import java.nio.file.Paths import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample} import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.Top1Accuracy import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { testParser.parse(args, new TestParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Test Lenet on MNIST") .set("spark.akka.frameSize", 64.toString) .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rddData = sc.parallelize(load(validationData, validationLabel), partitionNum) val transformer = BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(param.batchSize)) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 20
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, CrossEntropyCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils._ import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf() .setAppName("Train Lenet on MNIST") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) { LeNet5.graph(classNum = 10) } else { Engine.getEngineType() match { case MklBlas => LeNet5(10) case MklDnn => LeNet5.dnnGraph(param.batchSize / Engine.nodeNumber(), 10) } } } val criterion = Engine.getEngineType() match { case MklBlas => ClassNLLCriterion() case MklDnn => CrossEntropyCriterion() } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = criterion) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 21
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.autoencoder import java.nio.file.Paths import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch, Transformer} import com.intel.analytics.bigdl.nn.{MSECriterion, Module} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.utils.{Engine, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import scala.reflect.ClassTag object toAutoencoderBatch { def apply(): toAutoencoderBatch[Float] = new toAutoencoderBatch[Float]() } class toAutoencoderBatch[T: ClassTag](implicit ev: TensorNumeric[T] )extends Transformer[MiniBatch[T], MiniBatch[T]] { override def apply(prev: Iterator[MiniBatch[T]]): Iterator[MiniBatch[T]] = { prev.map(batch => { MiniBatch(batch.getInput().toTensor[T], batch.getInput().toTensor[T]) }) } } object Train { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train Autoencoder on MNIST") val sc = new SparkContext(conf) Engine.init val trainData = Paths.get(param.folder, "/train-images-idx3-ubyte") val trainLabel = Paths.get(param.folder, "/train-labels-idx1-ubyte") val trainDataSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(param.batchSize) -> toAutoencoderBatch() val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) Autoencoder.graph(classNum = 32) else Autoencoder(classNum = 32) } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new Adagrad[Float](learningRate = 0.01, learningRateDecay = 0.0, weightDecay = 0.0005) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new MSECriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 22
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.{Top1Accuracy, Validator} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]) { testParser.parse(args, new TestParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Test Vgg on Cifar10") .set("spark.akka.frameSize", 64.toString) val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rddData = sc.parallelize(Utils.loadTest(param.folder), partitionNum) val transformer = BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(param.batchSize)) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 23
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg import java.text.SimpleDateFormat import java.util.Date import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table} import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train Vgg on Cifar10") // Will throw exception without this config when has only one executor .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val trainDataSet = DataSet.array(Utils.loadTrain(param.folder), sc) -> BytesToBGRImg() -> BGRImgNormalizer(trainMean, trainStd) -> BGRImgToBatch(param.batchSize) val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) VggForCifar10.graph(classNum = 10) else VggForCifar10(classNum = 10) } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0, weightDecay = param.weightDecay, momentum = 0.9, dampening = 0.0, nesterov = false, learningRateSchedule = SGD.EpochStep(25, 0.5)) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new ClassNLLCriterion[Float]() ) val validateSet = DataSet.array(Utils.loadTest(param.folder), sc) -> BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToBatch(param.batchSize) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if (param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } if (param.summaryPath.isDefined) { val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") val timeStamp = sdf.format(new Date()) val trainSummry = new TrainSummary(param.summaryPath.get, s"vgg-on-cifar10-train-$timeStamp") optimizer.setTrainSummary(trainSummry) val validationSummary = new ValidationSummary(param.summaryPath.get, s"vgg-on-cifar10-val-$timeStamp") optimizer.setValidationSummary(validationSummary) } optimizer .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 24
Source File: ParallelOptimizerSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, MSECriterion} import com.intel.analytics.bigdl.optim.DistriOptimizerSpecModel.mse import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, T} import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} @com.intel.analytics.bigdl.tags.Serial class ParallelOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) private var sc: SparkContext = _ before { val conf = Engine.createSparkConf() .setMaster("local[1]").setAppName("ParallelOptimizerSpec") sc = new SparkContext(conf) Engine.init Engine.setCoreNumber(1) } after { if (sc != null) { sc.stop() } } "Train with parallel" should "work properly" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model = Linear[Float](10, 2) model.getParameters()._1.fill(1.0f) val optimMethod = new SGD[Float]() val dataSet = DataSet.array(Array(miniBatch), sc) val optimizer = new DistriOptimizer[Float](model, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) optimizer.optimize() } "Train with parallel" should "have same results as DistriOptimizer" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model1 = Linear[Float](10, 2) model1.getParameters()._1.fill(1.0f) val model2 = Linear[Float](10, 2) model2.getParameters()._1.fill(1.0f) val dataSet = DataSet.array(Array(miniBatch), sc) val parallelOptimizer = new DistriOptimizer[Float](model1, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) parallelOptimizer.optimize val distriOptimizer = new DistriOptimizer[Float](model2, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) distriOptimizer.optimize model1.getParameters()._1 should be (model2.getParameters()._1) } }
Example 25
Source File: SparkFunSuite.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package org.apache.spark // scalastyle:off import java.io.File import scala.annotation.tailrec import org.apache.log4j.{Appender, Level, Logger} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.util.{AccumulatorContext, Utils} protected def withLogAppender( appender: Appender, loggerName: Option[String] = None, level: Option[Level] = None)( f: => Unit): Unit = { val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger) val restoreLevel = logger.getLevel logger.addAppender(appender) if (level.isDefined) { logger.setLevel(level.get) } try f finally { logger.removeAppender(appender) if (level.isDefined) { logger.setLevel(restoreLevel) } } } }
Example 26
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.bigchange.mllib import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( var input: String = null, k: Int = 2, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() defaultParams.input = args(0) run(defaultParams) } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) // Return the K-means cost (sum of squared distances of points to their nearest center) for this val cost = model.computeCost(examples) // 获取质点(k个) val centerPoint = model.clusterCenters val one = centerPoint(0) val two = centerPoint(1) println(s"centerPoint=$one,$two.") println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 27
Source File: SparkSqlUtils.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.utils import java.io.{BufferedInputStream, BufferedReader, FileInputStream, InputStreamReader} import java.nio.file.Path import com.google.common.io.Resources import org.apache.log4j.{Level, Logger} import org.apache.mesos.Protos.Resource import org.apache.spark.sql.SparkSession import scala.io.Source def readSqlByPath(sqlPath: String) = { val buf = new StringBuilder val path = this.getPathByName(sqlPath) val file = Source.fromFile(path) for (line <- file.getLines) { buf ++= line + "\n" } file.close buf.toString() } }
Example 28
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter import com.amazonaws.services.kinesis.model.Record import com.google.gson.Gson import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} object Trials extends App { import org.apache.log4j.{Level, Logger} Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) //session setup System.setProperty("hadoop.home.dir", "C:\\winutils") val sparkSession = SparkSession.builder() .master("local[*]") .appName("test") .getOrCreate() val sc = sparkSession.sparkContext val ssc = new StreamingContext(sc, Seconds(10)) val sqlContext = sparkSession.sqlContext //creates an array of strings from raw byte array def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",") //converts records to map of key value pair and then json def recordHandler = (record: Record) => { val gson = new Gson val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage) gson.toJson(map) } case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String) val stream_cab = KinesisInputDStream.builder .streamingContext(ssc) .streamName("cab_rides") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) val stream_weather = KinesisInputDStream.builder .streamingContext(ssc) .streamName("weather") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) //creating dataframe, can be stored as temp view val cabSchema = Encoders.product[CabPrice].schema stream_cab.foreachRDD(rdd => { import sqlContext.implicits._ //val xx: Dataset[String] = rdd.toDS() val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS()) df.show() }) ssc.start() ssc.awaitTermination() }
Example 29
Source File: SocialGraphJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.social import com.github.graphx.pregel.social.SocialGraph import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object SocialGraphJob { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "GraphX") val graph = new SocialGraph(sc) println("Top 10 most-connected users:") graph.getMostConnectedUsers(10) foreach println println("Computing degrees of separation for user Arch") graph.degreeOfSeparationSingleUser(5306) foreach println println("Computing degrees of separation for user Arch and Fred") graph.degreeOfSeparationTwoUser(5306, 14) foreach println println("Connected component") graph.connectedComponentGroupedByUsers .sortBy ( {case (_, lowestVertexId) => lowestVertexId}, ascending = false).take(10) foreach println sc.stop() } }
Example 30
Source File: ShortestPathProblemJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.ssp import com.github.graphx.pregel.ssp.ShortestPathProblem import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.graphx.VertexId object ShortestPathProblemJob extends App { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "ShortestPathProblemDemo") val ssp = new ShortestPathProblem(sc) val sourceIdForTest: VertexId = 3 val sourceIdForRandom: VertexId = 75 val testGraph = ssp.testGraph val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest) println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" + s"Distances on the test graph $resultOnTestGraph\n") val randomGraph = ssp.randomGraph val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom) println(s"Distances on the random graph $resultOnRandomGraph\n") }
Example 31
Source File: SocialPageRankJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pagerank import com.github.graphx.pregel.social.SocialGraph import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.graphx.VertexRDD object SocialPageRankJob { def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] = socialGraph.graph.staticPageRank(numIter = 20).vertices def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = { socialGraph.verts.join(ranks).map { case (_, (username, rank)) => (username, rank) }.sortBy({ case (_, rank) => rank }, ascending = false).take(10) } def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "PageRank") val socialGraph: SocialGraph = new SocialGraph(sc) val TOLERANCE: Double = 0.0001 import scala.compat.Platform.{EOL => D} val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D) val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D) println(s"Top 10 users in network counted with TOLERANCE until convergence $TOLERANCE - $D $topUsersDynamically") println(s"Top 10 users in the network counted iteratively - $D $topUsersIterative") sc.stop() } }
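Note: the excerpt above calls a ranks helper that is not shown in the snippet. A minimal sketch of what such a helper presumably looks like, assuming it simply wraps GraphX's dynamic pageRank with the given convergence tolerance (an assumption, not the project's verified code):

  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tolerance).vertices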
Example 32
Source File: AkkaUtils.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn.util import akka.actor.{ActorSystem, ExtendedActorSystem} import com.typesafe.config.ConfigFactory import org.apache.log4j.{Level, Logger} import org.tianlangstudio.data.hamal.core.{Constants, HamalConf} import org.tianlangstudio.data.hamal.core.HamalConf def maxFrameSizeBytes(conf: HamalConf): Int = { val frameSizeInMB = conf.getInt("datax.akka.frameSize", 128) if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) { throw new IllegalArgumentException( s"spark.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB") } frameSizeInMB * 1024 * 1024 } def protocol(actorSystem: ActorSystem): String = { val akkaConf = actorSystem.settings.config val sslProp = "akka.remote.netty.tcp.enable-ssl" protocol(akkaConf.hasPath(sslProp) && akkaConf.getBoolean(sslProp)) } def protocol(ssl: Boolean = false): String = { if (ssl) { "akka.ssl.tcp" } else { "akka.tcp" } } def address( protocol: String, systemName: String, host: String, port: Int, actorName: String): String = { address(protocol, systemName, s"$host:$port", actorName ) } def address( protocol: String, systemName: String, hostPort: String, actorName: String): String = { s"$protocol://$systemName@$hostPort/user/$actorName" } }
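Note: AKKA_MAX_FRAME_SIZE_IN_MB is referenced above but its definition is elided from the excerpt. A plausible definition, assuming the same ceiling Spark's own AkkaUtils used (the frame size travels as a signed Int of bytes, so the maximum is Int.MaxValue bytes, roughly 2047 MB):

  val AKKA_MAX_FRAME_SIZE_IN_MB: Int = Int.MaxValue / 1024 / 1024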
Example 33
Source File: ModelSerialization.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.features.FeatureEngineering import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} object ModelSerialization { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) samples.printSchema() samples.show(5, false) //model training println("Neural Network Ctr Prediction Model:") val innModel = new InnerProductNNCtrModel() innModel.train(samples) val transformedData = innModel.transform(samples) transformedData.show(1,false) //model serialization by mleap val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer() mleapModelSerializer.serializeModel(innModel._pipelineModel, "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData) //model serialization by JPMML val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer() jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData) } }
Example 34
Source File: ModelSelection.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.evaluation.Evaluator import com.ggstar.features.FeatureEngineering import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.apache.log4j.{Level, Logger} object ModelSelection { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) rawSamples.printSchema() rawSamples.show(10) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) //split samples into training samples and validation samples val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3)) val evaluator = new Evaluator } }
Example 35
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08 // scalastyle:off println import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.rdd.RDD object GenerateVerticesExample { def main(args: Array[String]): Unit = { if (args.length != 2) { throw new IllegalArgumentException("Invalid arguments") } // Set the log level to WARN Logger.getLogger("org").setLevel(Level.WARN) // Create the SparkContext val conf = new SparkConf().setAppName("GenerateVerticesExample") val sc = new SparkContext(conf) // Read the settings from the arguments val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt) implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers) run(sc) sc.stop() } def run(sc: SparkContext) (implicit recOpts: RecommendLogOptions) : Unit = { // Generate RDDs for the product list and the user list val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList) val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList) // Show the first 20 products println("===================================") println("get top 20 products:") products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}")) // Show the first 20 users println("===================================") println("get top 20 users:") users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}")) } } // scalastyle:on println
Example 36
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.reduce((x, y) => x + y)}""") } } // scalastyle:on println
Example 37
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object StatsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("StatsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array.range(1, 11)) val stats = nums.stats() println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""count: ${stats.count}""") println(s"""mean: ${stats.mean}""") println(s"""stdev: ${stats.stdev}""") } } // scalastyle:on println
Example 38
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""") } } // scalastyle:on println
Example 39
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object OrderExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("OrderExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1)) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""top3: ${nums.top(3).mkString(", ")}""") println(s"""takeOrdered3: ${nums.takeOrdered(3).mkString(", ")}""") } } // scalastyle:on println
Example 40
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object AggregateExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("AggregateExample") val sc = new SparkContext(conf) run(sc) sc.stop() } private[basic_action] def run(sc: SparkContext) { val nums = sc.parallelize(Array.range(1, 11), 3) val acc = nums.aggregate(zeroValue = (0.0, 0))( seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1), combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2) ) val avg = acc._1 / acc._2 println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""avg: $avg""") } } // scalastyle:on println
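For readers new to aggregate: seqOp folds each element into a per-partition (sum, count) pair and combOp merges the per-partition pairs, so acc._1 / acc._2 is the mean. The same pattern can be tried on a plain Scala collection; the snippet below is only an illustration and is not part of the original example:

  val local = 1 to 10
  val (sum, count) = local.aggregate((0.0, 0))(
    (acc, n) => (acc._1 + n, acc._2 + 1),   // seqOp: fold one element in
    (a, b) => (a._1 + b._1, a._2 + b._2))   // combOp: merge partial results
  println(sum / count)                      // 5.5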
Example 41
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CollectAsMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CollectAsMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1) ), 3 ) val fruitsAsMap = fruits.collectAsMap() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitsAsMap: $fruitsAsMap""") } } // scalastyle:on println
Example 42
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} object PersistExample { def main(args: Array[String]) { if (args.length != 1) { throw new IllegalArgumentException("Invalid arguments") } Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("PersistExample") val sc = new SparkContext(conf) run(sc, args(0)) sc.stop() } def run(sc: SparkContext, inputFile: String) { val lines = sc.textFile(inputFile) lines.count() lines.collect() val persistedLines = sc.textFile(inputFile).persist() persistedLines.collect() persistedLines.count() persistedLines.unpersist() persistedLines.collect() } }
Example 43
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
Example 44
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object PartitionExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("Partition") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1) println(s"""nums:\n ${nums.collect().mkString(", ")}""") println() println("original:") nums.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar3 = nums.repartition(3) println("repartition to 3:") numsPar3.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar2 = numsPar3.coalesce(2) println("coalesce to 2:") numsPar2.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } // scalastyle:on println
Example 45
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object WordCountExample { def main(args: Array[String]) { if (args.length != 1) { throw new IllegalArgumentException("Invalid arguments") } Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("WordCountExample") val sc = new SparkContext(conf) run(sc, args(0)) sc.stop() } def run(sc: SparkContext, inputFile: String) { val stopWordCount = sc.accumulator(0L) val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on")) val lines = sc.textFile(inputFile) val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty) val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w => val result = !stopWords.value.contains(w._1) if (!result) stopWordCount += 1L result } val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false) println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""") println(s"""stopWordCounts: ${stopWordCount.value}""") } } // scalastyle:on println
Example 46
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object AggregateByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("AggregateByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))( seqOp = (partAcc, n) => partAcc += n, combOp = (acc1, acc2) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
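Note: both this example and the CombineByKeyExample below rely on an Acc accumulator class whose definition is elided from the excerpts. A hypothetical sketch that matches the way it is used (Acc(sum, count), += to add one value, ++= to merge two accumulators); the real definition lives elsewhere in the package:

  case class Acc(var sum: Double, var count: Int) {
    def +=(n: Int): Acc = { sum += n; count += 1; this }
    def ++=(other: Acc): Acc = { sum += other.sum; count += other.count; this }
  }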
Example 47
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapValuesExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapValuesExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1))) val plusOnes = fruits.mapValues(v => v + 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""") } } // scalastyle:on println
Example 48
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object SortByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("SortByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val sortedByKeyAsc = fruits.sortByKey(ascending = false) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""sortedByKeyAsc: ${sortedByKeyAsc.collect().mkString(", ")}""") val nums = sc.parallelize( Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000))) implicit val sortByStrLen = new Ordering[String] { def compare(x: String, y: String): Int = x.length - y.length } val sortedByKeyLength = nums.sortByKey() println() println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""") } } // scalastyle:on println
Example 49
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CoGroupExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CoGroupExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val grouped = persons.map(_.swap).cogroup(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""grouped:\n${grouped.collect().mkString("\n")}""") } } // scalastyle:on println
Example 50
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object JoinExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("JoinExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val leftJoined = persons.map(_.swap).join(cities) val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities) val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities) val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""") println() println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""") println() println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""") println() println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""") } } // scalastyle:on println
Example 51
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object GroupByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("GroupByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val grouped = fruits.groupByKey() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""grouped: ${grouped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 52
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.reduceByKey((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 53
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CombineByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CombineByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.combineByKey( createCombiner = (v: Int) => Acc(v.toDouble, 1), mergeValue = (partAcc: Acc, n: Int) => partAcc += n, mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 54
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.foldByKey(0)((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 55
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapPartitionsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapPartitionsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val jsonLines = sc.parallelize(Array( """{"name": "Apple", "num": 1}""", """{"name": "Orange", "num": 4}""", """{"name": "Apple", "num": 2}""", """{"name": "Peach", "num": 1}""" )) val parsed = jsonLines.mapPartitions { lines => val mapper = new ObjectMapper() mapper.registerModule(DefaultScalaModule) lines.map { line => val f = mapper.readValue(line, classOf[Map[String, String]]) (f("name"), f("num")) } } println(s"""json:\n${jsonLines.collect().mkString("\n")}""") println() println(s"""parsed:\n${parsed.collect().mkString("\n")}""") } } // scalastyle:on println
Example 56
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FlatMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FlatMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow")) val words = lines.flatMap(line => line.split(" ")) println(s"""lines: ${lines.collect().mkString(", ")}""") println(s"""words: ${words.collect().mkString(", ")}""") } } // scalastyle:on println
Example 57
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object SetOperationsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("SetOperationsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange")) val union = fruits1.union(fruits2) val subtract = fruits1.subtract(fruits2) val intersection = fruits1.intersection(fruits2) val cartesian = fruits1.cartesian(fruits2) println(s"""fruits1: ${fruits1.collect().mkString(", ")}""") println(s"""fruits2: ${fruits2.collect().mkString(", ")}""") println(s"""union: ${union.collect().mkString(", ")}""") println(s"""subtract: ${subtract.collect().mkString(", ")}""") println(s"""intersection: ${intersection.collect().mkString(", ")}""") println(s"""cartesian: ${cartesian.collect().mkString(", ")}""") } } // scalastyle:on println
Example 58
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val lengths = fruits.map(fruit => fruit.length) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""lengths: ${lengths.collect().mkString(", ")}""") } } // scalastyle:on println
Example 59
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ZipExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ZipExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits1 = sc.parallelize( Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val fruits2 = sc.parallelize( Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ")) val zipped = fruits1.zip(fruits2) println(s"""fruits1: ${fruits1.collect().mkString(", ")}""") println(s"""fruits2: ${fruits2.collect().mkString(", ")}""") println(s"""zipped: ${zipped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 60
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object DistinctExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("DistinctExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val uniques = fruits.distinct() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""uniques: ${uniques.collect().mkString(", ")}""") } } // scalastyle:on println
Example 61
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object SampleExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("SampleExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val samples = fruits.sample(withReplacement = false, 0.5, 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""samples: ${samples.collect().mkString(", ")}""") } } // scalastyle:on println
Example 62
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FilterExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FilterExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val startWithPs = fruits.filter(fruit => fruit.startsWith("P")) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 63
Source File: SparkFunSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark // scalastyle:off import org.apache.log4j.{Level, Logger} import org.scalatest.{FunSuite, Outcome} import org.apache.spark.Logging final protected override def withFixture(test: NoArgTest): Outcome = { val testName = test.text val suiteName = this.getClass.getName val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") try { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") test() } finally { logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") } } }
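Note: the excerpt starts at the withFixture override; the enclosing declaration is elided. Judging from the imports, it is presumably something along these lines (an assumption, not the project's verified code):

  private[spark] abstract class SparkFunSuite extends FunSuite with Logging {
    // ... the withFixture override shown above goes here ...
  }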
Example 64
Source File: Logging.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata.objects import org.apache.log4j.{Level, LogManager, Logger} trait Logging { // Method to get the logger name for this object protected def logName: String = { // Ignore trailing $'s in the class names for Scala objects this.getClass.getName.stripSuffix("$") } private val log: Logger = LogManager.getLogger(logName) // Set logger level protected def setLogLevel(level: Level): Unit = log.setLevel(level) // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) } protected def logDebug(msg: => String) { if (log.isDebugEnabled) log.debug(msg) } protected def logTrace(msg: => String) { if (log.isTraceEnabled) log.trace(msg) } protected def logWarning(msg: => String) { log.warn(msg) } protected def logError(msg: => String) { log.error(msg) } // Log methods that take Throwables (Exceptions/Errors) too protected def logInfo(msg: => String, throwable: Throwable) { if (log.isInfoEnabled) log.info(msg, throwable) } protected def logDebug(msg: => String, throwable: Throwable) { if (log.isDebugEnabled) log.debug(msg, throwable) } protected def logTrace(msg: => String, throwable: Throwable) { if (log.isTraceEnabled) log.trace(msg, throwable) } protected def logWarning(msg: => String, throwable: Throwable) { log.warn(msg, throwable) } protected def logError(msg: => String, throwable: Throwable) { log.error(msg, throwable) } protected def isTraceEnabled: Boolean = { log.isTraceEnabled } }
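As a usage illustration (not part of the original source), a class mixing in this trait gets its own log4j logger, named after the class, and can raise or lower that logger's Level at runtime; CopyJob below is a made-up example class:

  import org.apache.log4j.Level

  class CopyJob extends Logging {
    def run(): Unit = {
      setLogLevel(Level.DEBUG)   // adjust this object's logger only
      logInfo("starting copy")
      logDebug("per-file details enabled")
    }
  }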
Example 65
Source File: MCLModelSuite.scala From MCL_spark with MIT License | 5 votes |
package org.apache.spark.mllib.clustering import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.util.Utils class MCLModelSuite extends MCLFunSuite{ // Disable Spark messages when running program Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) test("model save/load", UnitTest){ val users: RDD[(VertexId, String)] = sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) val relationships: RDD[Edge[Double]] = sc.parallelize( Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), Edge(0, 2, 1.0), Edge(2, 0, 1.0), Edge(0, 3, 1.0), Edge(3, 0, 1.0), Edge(1, 2, 1.0), Edge(2, 1, 1.0), Edge(1, 3, 1.0), Edge(3, 1, 1.0), Edge(2, 3, 1.0), Edge(3, 2, 1.0), Edge(4, 5, 1.0), Edge(5, 4, 1.0), Edge(4, 6, 1.0), Edge(6, 4, 1.0), Edge(4, 7, 1.0), Edge(7, 4, 1.0), Edge(5, 6, 1.0), Edge(6, 5, 1.0), Edge(5, 7, 1.0), Edge(7, 5, 1.0), Edge(6, 7, 1.0), Edge(7, 6, 1.0), Edge(3, 8, 1.0), Edge(8, 3, 1.0), Edge(9, 8, 1.0), Edge(8, 9, 1.0), Edge(9, 10, 1.0), Edge(10, 9, 1.0), Edge(4, 10, 1.0), Edge(10, 4, 1.0) )) val graph = Graph(users, relationships) val model: MCLModel = MCL.train(graph) // Check number of clusters model.nbClusters shouldEqual 3 // Check save and load methods val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString Array(true, false).foreach { case selector => // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = MCLModel.load(sc, path) assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id")) } finally { Utils.deleteRecursively(tempDir) } } } test("nodes assignments", UnitTest) { val nodeId = 1.0.toLong val cluster = 2.0.toLong val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster)) newAssignment.id shouldEqual nodeId newAssignment.cluster shouldEqual cluster } }
Example 66
Source File: KmeansTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mllib import org.apache.log4j.{Level, Logger} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import org.scalaml.{Logging, Resource} import org.scalaml.Predef._ import org.scalaml.stats.TSeries._ import org.scalaml.trading.YahooFinancials import org.scalaml.workflow.data.DataSource import org.scalatest.FunSuite import org.scalatest.concurrent.ScalaFutures import scala.concurrent.Future final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource { import scala.concurrent.ExecutionContext.Implicits.global protected[this] val name = "Spark MLlib K-Means" private val K = 8 private val NRUNS = 4 private val MAXITERS = 60 private val PATH = "spark/CSCO.csv" private val CACHE = false test(s"$name evaluation") { show(s"Evaluation") Logger.getRootLogger.setLevel(Level.ERROR) // The Spark configuration has to be customize to your environment val sparkConf = new SparkConf().setMaster("local") .setAppName("Kmeans") .set("spark.executor.memory", "4096m") implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file val kmeanClustering: Option[Kmeans] = extract.map(input => { val volatilityVol = zipToSeries(input._1, input._2).take(500) val config = new KmeansConfig(K, MAXITERS, NRUNS) val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY) Kmeans(config, rddConfig, volatilityVol) }) // Wraps into a future to enforce time out in case of a straggler val ft = Future[Boolean] { predict(kmeanClustering) } whenReady(ft) { result => assert(result) } sc.stop } private def predict(kmeanClustering: Option[Kmeans]): Boolean = { kmeanClustering.map(kmeansCluster => { val obs = Array[Double](0.1, 0.9) val clusterId1 = kmeansCluster |> obs show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1") val obs2 = Array[Double](0.56, 0.11) val clusterId2 = kmeansCluster |> obs2 val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2" show(s"$name result: $result") }) true } private def extract: Option[(DblVec, DblVec)] = { import scala.util._ val extractors = List[Array[String] => Double]( YahooFinancials.volatility, YahooFinancials.volume ) DataSource(getPath(PATH).get, true).map(_.|>) match { case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption case Failure(e) => failureHandler(e) None } } } // --------------------------------- EOF -------------------------------------------------
Example 67
Source File: Application.scala From retail_analytics with Apache License 2.0 | 5 votes |
package controllers import scalaz._ import Scalaz._ import scalaz.EitherT._ import scalaz.Validation //import scalaz.Validation.FlatMap._ import scalaz.NonEmptyList._ import play.api.mvc._ import java.io.File import scala.io.Source import org.apache.log4j.Logger import org.apache.log4j.Level import models._ import models.stack._ import play.api.libs.json._ object Application extends Controller { def index() = Action { implicit request => Ok(views.html.index("Megam Analytics.")) } def upload = Action(parse.multipartFormData) { implicit request => request.body.file("picture").map { picture => import java.io.File val filename = picture.filename val contentType = picture.contentType picture.ref.moveTo(new File("/tmp/"+filename)) models.HDFSFileService.saveFile("/tmp/"+filename) match { case Success(succ) => { val fu = List(("success" -> succ)) Redirect("/").flashing(fu: _*) } case Failure(err) => { val fu = List(("error" -> "File doesn't get uploaded")) Redirect("/").flashing(fu: _*) } } }.getOrElse { val fu = List(("error" -> "File doesn't get uploaded..")) Redirect("/").flashing(fu: _*) } } def analysis() = Action { implicit request => val tuple_res = models.Retail.buyingbehaviour(MConfig.recommand_ID.toInt, MConfig.retailfile) println("BACK==========================>>>") println(tuple_res._1) //val finalJson = { // for { // product <- productList // } yield Json.parse(product).as[JsObject] // } Ok(views.html.finalProducts(tuple_res._1, tuple_res._2)) } }
Example 68
Source File: FileOutputIT.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta import java.sql.Timestamp import java.util.UUID import com.github.nscala_time.time.Imports._ import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.scalatest._ import scala.reflect.io.File class FileOutputIT extends FlatSpec with ShouldMatchers with BeforeAndAfterAll { self: FlatSpec => @transient var sc: SparkContext = _ override def beforeAll { Logger.getRootLogger.setLevel(Level.ERROR) sc = FileOutputIT.getNewLocalSparkContext(1, "test") } override def afterAll { sc.stop() System.clearProperty("spark.driver.port") } trait CommonValues { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val time = new Timestamp(DateTime.now.getMillis) val data = sc.parallelize(Seq(Person("Kevin", 18, time), Person("Kira", 21, time), Person("Ariadne", 26, time))).toDF val tmpPath: String = s"/tmp/sparta-test/${UUID.randomUUID().toString}" } trait WithEventData extends CommonValues { val properties = Map("path" -> tmpPath, "createDifferentFiles" -> "false") val output = new FileOutput("file-test", properties) } "FileOutputIT" should "save a dataframe" in new WithEventData { output.save(data, SaveModeEnum.Append, Map(Output.TimeDimensionKey -> "minute", Output.TableNameKey -> "person")) val source = new java.io.File(tmpPath).listFiles() val read = sqlContext.read.json(tmpPath).toDF read.count shouldBe(3) File("/tmp/sparta-test").deleteRecursively } } object FileOutputIT { def getNewLocalSparkContext(numExecutors: Int = 1, title: String): SparkContext = { val conf = new SparkConf().setMaster(s"local[$numExecutors]").setAppName(title) SparkContext.getOrCreate(conf) } } case class Person(name: String, age: Int, minute: Timestamp) extends Serializable
Example 69
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object GamerSparkSQLExample { def main(args:Array[String]): Unit = { if (args.length == 0) { println("{kudumaster} {runLocal}") return } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val runLocal = args(1).equals("l") println("Loading Spark Context") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } println("Loading Spark Context: Finished") println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") println("Query 1: SELECT count(*) FROM gamer") val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT * FROM gamer limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") val startTimeQ3 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") val startTimeQ4 = System.currentTimeMillis() sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) Vectors.dense(array) }) val dataCount = parsedData.count() if (dataCount > 0) { val clusters = KMeans.train(parsedData, 3, 5) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) } //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } }
Example 70
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
Example 71
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.linalg._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming.MemoryStream import org.scalatest.FunSuite import org.apache.log4j.{Level, Logger} case class TestRow(features: Vector) class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase { override def beforeAll(): Unit = { super.beforeAll() Logger.getLogger("org").setLevel(Level.OFF) } test("streaming model with one center should converge to true center") { import spark.implicits._ val k = 1 val dim = 5 val clusterSpread = 0.1 val seed = 63 // TODO: this test is very flaky. The centers do not converge for some // (most?) random seeds val (batches, trueCenters) = StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed) val inputStream = MemoryStream[TestRow] val ds = inputStream.toDS() val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01) val query = skm.evilTrain(ds.toDF()) val streamingModels = batches.map { batch => inputStream.addData(batch) query.processAllAvailable() skm.getModel } // TODO: use spark's testing suite streamingModels.last.centers.zip(trueCenters).foreach { case (center, trueCenter) => val centers = center.toArray.mkString(",") val trueCenters = trueCenter.toArray.mkString(",") println(s"${centers} | ${trueCenters}") assert(center.toArray.zip(trueCenter.toArray).forall( x => math.abs(x._1 - x._2) < 0.1)) } query.stop() } def compareBatchAndStreaming( batchModel: KMeansModel, streamingModel: StreamingKMeansModel, validationData: DataFrame): Unit = { assert(batchModel.clusterCenters === streamingModel.centers) // TODO: implement prediction comparison } } object StreamingKMeansSuite { def generateBatches( numPoints: Int, numBatches: Int, k: Int, d: Int, r: Double, seed: Int, initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = { val rand = scala.util.Random rand.setSeed(seed) val centers = initCenters match { case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))) case _ => initCenters } val data = (0 until numBatches).map { i => (0 until numPoints).map { idx => val center = centers(idx % k) val vec = Vectors.dense( Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r)) TestRow(vec) } } (data, centers) } }
Example 72
Source File: SuspiciousConnects.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext import org.slf4j.LoggerFactory import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig import org.opennetworkinsight.dns.DNSSuspiciousConnects import org.opennetworkinsight.netflow.FlowSuspiciousConnects import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis def main(args: Array[String]) { val parser = SuspiciousConnectsArgumentParser.parser parser.parse(args, SuspiciousConnectsConfig()) match { case Some(config) => val logger = LoggerFactory.getLogger(this.getClass) Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) val analysis = config.analysis val sparkConfig = new SparkConf().setAppName("ONI ML: " + analysis + " lda") val sparkContext = new SparkContext(sparkConfig) val sqlContext = new SQLContext(sparkContext) implicit val outputDelimiter = OutputDelimiter analysis match { case "flow" => FlowSuspiciousConnects.run(config, sparkContext, sqlContext, logger) case "dns" => DNSSuspiciousConnects.run(config, sparkContext, sqlContext, logger) case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger) case _ => println("ERROR: unsupported (or misspelled) analysis: " + analysis) } sparkContext.stop() case None => println("Error parsing arguments") } System.exit(0) } }
Example 73
Source File: SparseNaiveBayes.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 74
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 75
Source File: StreamingExamples.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
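A streaming driver would typically call this helper once, before the StreamingContext produces its first log line. A hypothetical usage sketch follows; the host, port, and batch interval are placeholders, and it assumes the StreamingExamples object above is on the classpath.

import org.apache.spark.SparkConf
import org.apache.spark.examples.streaming.StreamingExamples
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogLevelUsage {
  def main(args: Array[String]): Unit = {
    // Quiet the default console output before any Spark logging happens.
    StreamingExamples.setStreamingLogLevels()

    val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingLogLevelUsage")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Count the words in each batch read from a local socket; any simple DStream would do.
    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .count()
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}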
Example 76
Source File: YarnScheduler.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.yarn.util.RackResolver import org.apache.log4j.{Level, Logger} import org.apache.spark._ import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.Utils private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { // RackResolver logs an INFO message whenever it resolves a rack, which is way too often. if (Logger.getLogger(classOf[RackResolver]).getLevel == null) { Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN) } // By default, rack is unknown override def getRackForHost(hostPort: String): Option[String] = { val host = Utils.parseHostPort(hostPort)._1 Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation) } }
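The guard above only lowers RackResolver's verbosity when no level has been configured for it, so a user-supplied log4j.properties still wins. The same defensive pattern works for any noisy third-party logger; the sketch below is ours, and the logger name in the usage comment is made up for illustration.

import org.apache.log4j.{Level, Logger}

object QuietThirdPartyLogger {
  def quietIfUnset(loggerName: String, level: Level = Level.WARN): Unit = {
    val logger = Logger.getLogger(loggerName)
    // getLevel is null when the level is inherited, i.e. nobody configured it explicitly.
    if (logger.getLevel == null) {
      logger.setLevel(level)
    }
  }
}

// e.g. QuietThirdPartyLogger.quietIfUnset("com.example.NoisyClient")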
Example 77
Source File: ClientArguments.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.annotation.tailrec import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 78
Source File: BeforeAndAfterWithContext.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import eleflow.uberdata.core.IUberdataContext import eleflow.uberdata.core.util.ClusterSettings import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkEnv} import org.scalatest.{BeforeAndAfterEach, Suite} object TestSparkConf { def conf = { val sconf = new SparkConf() sconf.set("spark.app.name", "teste") sconf } val separator = "," } trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite => val defaultFilePath = "src/test/resources/" import TestSparkConf._ ClusterSettings.master = Some("local[*]") conf.set("spark.driver.allowMultipleContexts", "true") @transient val context = IUberdataContext.getUC(conf) override def beforeEach() = { setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) } def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { loggers.map { loggerName => val logger = Logger.getLogger(loggerName) val prevLevel = logger.getLevel logger.setLevel(level) loggerName -> prevLevel }.toMap } override def afterEach() = { val get = SparkEnv.get val rpcEnv = if (get != null) { Some(get.rpcEnv) } else None context.clearContext() //rpcEnv.foreach( // _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown()) System.clearProperty("spark.master.port") } }
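The setLogLevels helper above returns the previous level of every logger it touches, which makes it possible to restore the configuration afterwards (the MLLibSuite test further down on this page does exactly that in afterAll). A hedged sketch of wrapping that save/restore around a block of code; the object and method names are ours, not part of the original trait.

import org.apache.log4j.{Level, Logger}

object LogLevelScope {
  // Silence the named loggers while `body` runs, then restore whatever was set before.
  def withLogLevel[T](level: Level, loggerNames: Seq[String])(body: => T): T = {
    val saved = loggerNames.map { name =>
      val logger = Logger.getLogger(name)
      val previous = logger.getLevel // may be null when the level is inherited
      logger.setLevel(level)
      name -> previous
    }
    try body
    finally saved.foreach { case (name, previous) => Logger.getLogger(name).setLevel(previous) }
  }
}

// e.g. LogLevelScope.withLogLevel(Level.ERROR, Seq("org", "akka")) { runSomeNoisyJob() }  // placeholder body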
Example 79
Source File: ClientArguments.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam} def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin System.err.println(usage) System.exit(exitCode) } } object ClientArguments { private[spark] val DEFAULT_CORES = 1 private[spark] val DEFAULT_MEMORY = 512 // MB private[spark] val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 80
Source File: MLLibSuite.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib

import org.scalatest.{BeforeAndAfterAll, FunSuite}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

class MLLibSuite extends FunSuite with BeforeAndAfterAll {

  private var sparkSession: SparkSession = _

  var savedLevels: Map[String, Level] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()
    // Travis limits the size of the log file produced by a build. Because we do run a small
    // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
    // log level to ERROR, just for this suite, to avoid displeasing travis.
    savedLevels = Seq("akka", "org", "com.databricks").map { name =>
      val logger = Logger.getLogger(name)
      val curLevel = logger.getLevel
      logger.setLevel(Level.ERROR)
      name -> curLevel
    }.toMap
  }

  override def afterAll(): Unit = {
    savedLevels.foreach { case (name, level) =>
      Logger.getLogger(name).setLevel(level)
    }
    try {
      if (sparkSession != null) {
        sparkSession.stop()
      }
      // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown
      System.clearProperty("spark.driver.port")
      sparkSession = null
    } finally {
      super.afterAll()
    }
  }

  test("test MlLib benchmarks with mllib-small.yaml.") {
    val results = MLLib.run(yamlConfig = MLLib.smallConfig)
    val failures = results.na.drop(Seq("failure"))
    if (failures.count() > 0) {
      failures.select("name", "failure.*").collect().foreach {
        case Row(name: String, error: String, message: String) =>
          println(
            s"""There was a failure in the benchmark for $name:
               | $error ${message.replace("\n", "\n ")}
             """.stripMargin)
      }
      fail("Unable to run all benchmarks successfully, see console output for more info.")
    }
  }

  test("test before benchmark methods for pipeline benchmarks.") {
    val benchmarks = MLLib.getBenchmarks(MLLib.getConf(yamlConfig = MLLib.smallConfig))
    benchmarks.foreach { b =>
      b.beforeBenchmark()
    }
  }
}
Example 81
Source File: SomeSQLOnTitanic.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.machinelearning.titanic

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SomeSQLOnTitanic {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val testFile = args(0)
    val trainFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    import sparkSession.implicits._

    // Load Data
    val trainDs = sparkSession.read.option("header", "true")
      .option("charset", "UTF8")
      .option("delimiter", ",")
      .csv(trainFile)

    trainDs.createOrReplaceTempView("train")

    println("Sex -> Survived")
    sparkSession.sql("select Sex, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Sex").collect().foreach(println)

    println("Cabin -> Survived")
    sparkSession.sql("select substring(Cabin,1,1), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

    println("Age -> Survived")
    sparkSession.sql("select round(cast(Age as Int) / 10) as age_block, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

    println("PClass -> Survived")
    sparkSession.sql("select pclass, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by pclass order by 1").collect().foreach(println)

    println("Embarked -> Survived")
    sparkSession.sql("select Embarked, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Embarked order by 1").collect().foreach(println)

    println("Fare -> Survived")
    sparkSession.sql("select round((Fare / 10)), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println)

    println("Survived -> Survived")
    sparkSession.sql("select sum(Survived), count(*) from train order by 1").collect().foreach(println)

    sparkSession.stop()
  }
}
Example 82
Source File: ManyToManyNormalJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNormalJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[((Long, Long), Int)] actions.foreach(a => { val aValue = a.getAs[Long]("action") actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { resultList.+=(((aValue, bValue), 1)) } }) }) resultList.toSeq }).reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } }
Example 83
Source File: ManyToManyNestedJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNestedJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[(Long, NestedCount)] actions.foreach(a => { val aValue = a.getAs[Long]("action") val aNestedCount = new NestedCount actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { aNestedCount.+=(bValue, 1) } }) resultList.+=((aValue, aNestedCount)) }) resultList.toSeq }).reduceByKey((a, b) => a + b) //.reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } } //1,2 //1,3 //1,4 //1 (2, 3, 4) class NestedCount() extends Serializable{ val map = new mutable.HashMap[Long, Long]() def += (key:Long, count:Long): Unit = { val currentValue = map.getOrElse(key, 0l) map.put(key, currentValue + count) } def + (other:NestedCount): NestedCount = { val result = new NestedCount other.map.foreach(r => { result.+=(r._1, r._2) }) this.map.foreach(r => { result.+=(r._1, r._2) }) result } override def toString(): String = { val stringBuilder = new StringBuilder map.foreach(r => { stringBuilder.append("(" + r._1 + "," + r._2 + ")") }) stringBuilder.toString() } }
Example 84
Source File: SaltedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.salted import java.util.Random import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SaltedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDfLeft = sparkSession.read.json(jsonPath) val saltedLeft = jsonDfLeft.rdd.flatMap(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") Seq((group + "_" + 0, value),(group + "_" + 1, value)) }) val jsonDfRight = sparkSession.read.json(jsonPath) val saltedRight = jsonDfRight.rdd.mapPartitions(it => { val random = new Random() it.map(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") (group + "_" + random.nextInt(2), value) }) }) jsonDfLeft.join(jsonDfRight).collect().foreach(r => { println("Normal.result:" + r) }) println("----") saltedLeft.join(saltedRight).collect().foreach(r => { println("Salted.result:" + r) }) } }
Example 85
Source File: SmallWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.small import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SmallWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val timeDifRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") //(key , value) (group, (time, value)) }).groupByKey().flatMap{case (group, records) => var lastValue = 0l val localList = records.toSeq println("localList.size:" + localList.size) localList.sortBy(_._1).map{case (time, value) => val dif = value - lastValue lastValue = value (group, time, value, dif) } } timeDifRdd.take(10).foreach(r => { println(r) }) sparkSession.stop() } }
Example 86
Source File: SuperBigWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.superbig import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SuperBigWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val jsonPath = args(0) val pageSize = args(1).toInt val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = spark.read.json(jsonPath) import spark.implicits._ val diffDs = jsonDf.flatMap(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") val timePage = time / pageSize if (time % pageSize == 0) { //Am I on the edge of the page Seq((timePage, (time, value)), (timePage + 1, (time, value))) } else { Seq((timePage, (time, value))) } }).groupByKey(r => r._1).flatMapGroups((k, it) => { var lastValue = 0l it.toSeq. sortBy{case (page, (time, value)) => time}. map{case (page, (time, value)) => val dif = value - lastValue lastValue = value (time, value, dif) } }) diffDs.collect().foreach(r => println(" - " + r)) spark.stop() } }
Example 87
Source File: SessionWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import scala.collection.mutable object SessionWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val sessionJson = args(0) val timeGap = args(1).toInt val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val sessionDs = sparkSession.read.json(sessionJson).as[JsonLeadLag] sessionDs.createOrReplaceTempView("session_table") sparkSession.sql("select * from session_table").collect().foreach(println) val sessionDefinitinonDf = sessionDs.rdd.map(r => { (r.group, r) }).groupByKey().flatMap{ case (group, jsonObjIt) => var lastStart:Long = -1 var lastEnd:Long = -1 var sessionCount = 1 var eventsInASession = 0 val sessionList = new mutable.MutableList[SessionDefinition] jsonObjIt.toSeq.sortBy(r => r.ts).foreach(record => { val ts = record.ts eventsInASession += 1 if (lastStart == -1) { lastStart = ts } else if (ts > lastEnd + timeGap) { sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession) lastStart = ts eventsInASession = 0 } lastEnd = ts }) sessionList } sessionDefinitinonDf.collect().foreach(println) } } case class SessionDefinition(group:String, sessionStart:Long, sessionEnd:Long, sessionLength:Long, sessionEvents:Int)
Example 88
Source File: LeadLagExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object LeadLagExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "LEAD(value) OVER (PARTITION BY group ORDER BY ts) as v_after, " + "LAG(value) OVER (PARTITION BY group ORDER BY ts) as v_before " + "FROM leadlag") leadLagDf.collect().foreach(println) leadLagDf.createOrReplaceTempView("leadlag_stage2") leadLagDf.printSchema() sparkSession.sql("select " + "group, ts, v_now, v_after, v_before, " + "case " + " when v_now < v_after and v_now < v_before then 'valley'" + " when v_now > v_after and v_now > v_before then 'peak'" + " else 'n/a' " + "end " + "from leadlag_stage2").collect().foreach(println) } } case class JsonLeadLag(group:String, ts:Long, value:Long)
Example 89
Source File: TumblingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object TumblingWindows { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, " + "round(ts / 3), " + "avg(value), " + "max(value), " + "min(value) " + "FROM leadlag " + "group by 1,2") leadLagDf.collect().foreach(println) } }
Example 90
Source File: InfectionPointWindow.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object InfectionPointWindow {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val inflectionPointJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    println("---")

    import sparkSession.implicits._

    val inflectionPointDs = sparkSession.read.json(inflectionPointJson).as[JsonInfectionPoint]

    inflectionPointDs.createOrReplaceTempView("inflection_point")

    sparkSession.sql("select * from inflection_point").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_min, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_max " +
      "FROM inflection_point " +
      "where event_type = 'inflection'")

    leadLagDf.collect().foreach(println)
  }
}

case class JsonInfectionPoint(group:String, ts:Long, value:Long, event_type:String)
Example 91
Source File: SplidingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SplidingWindows {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val leadLagJson = args(0)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    println("---")

    import sparkSession.implicits._

    val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag]

    leadLag.createOrReplaceTempView("leadlag")

    sparkSession.sql("select * from leadlag").collect().foreach(println)

    val leadLagDf = sparkSession.sql("SELECT " +
      "group, ts, " +
      "value as v_now, " +
      "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " +
      "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_min, " +
      "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_max " +
      "FROM leadlag")

    leadLagDf.collect().foreach(println)
  }
}
Example 92
Source File: JsonNestedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} import scala.collection.mutable object JsonNestedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val isLocal = args(0).equalsIgnoreCase("l") val jsonPath = args(1) val outputTableName = args(2) val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") val jsonDf = sparkSession.read.json(jsonPath) val localJsonDf = jsonDf.collect() println("--Df") jsonDf.foreach(row => { println("row:" + row) }) println("--local") localJsonDf.foreach(row => { println("row:" + row) }) jsonDf.createOrReplaceTempView("json_table") println("--Tree Schema") jsonDf.schema.printTreeString() println("--") jsonDf.write.saveAsTable(outputTableName) sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println) println("--") sparkSession.stop() } def populatedFlattedHashMap(row:Row, schema:StructType, fields:Array[StructField], flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]], parentFieldName:String): Unit = { fields.foreach(field => { println("field:" + field.dataType) if (field.dataType.isInstanceOf[ArrayType]) { val elementType = field.dataType.asInstanceOf[ArrayType].elementType if (elementType.isInstanceOf[StructType]) { val childSchema = elementType.asInstanceOf[StructType] val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq) populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".") } } else { val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any]) fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name))) } }) } }
Example 93
Source File: NestedTableExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} import org.apache.spark.sql.{Row, SparkSession} object NestedTableExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 94
Source File: PopulateHiveTable.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} object PopulateHiveTable { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) populated1Df.repartition(2).write.saveAsTable("nested_populated") println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 95
Source File: CountingInAStreamExpUpdateStateByKey.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpUpdateStateByKey { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(word => (word, 1)) .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => { var value = state.getOrElse(0) values.foreach(i => { value += i }) Some(value) }) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
Example 96
Source File: CountingInAStreamExpBatchCounting.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpBatchCounting { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(line => line.toLowerCase.split(" ")) val wordCounts = words.map(word => (word, 1)) .reduceByKey((a,b) => a + b) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
Example 97
Source File: CountingInAStreamMapWithState.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} object CountingInAStreamMapWithState { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")). map(word => WordCountEvent(word, 1)) // Generate running word count val wordCounts = messageDs.groupByKey(tuple => tuple.word). mapGroupsWithState[WordCountInMemory, WordCountReturn](GroupStateTimeout.ProcessingTimeTimeout) { case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) => var newCount = if (state.exists) state.get.countOfWord else 0 events.foreach(tuple => { newCount += tuple.countOfWord }) state.update(WordCountInMemory(newCount)) WordCountReturn(word, newCount) } // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("update") .format("console") .start() query.awaitTermination() } } case class WordCountEvent(word:String, countOfWord:Int) extends Serializable { } case class WordCountInMemory(countOfWord: Int) extends Serializable { } case class WordCountReturn(word:String, countOfWord:Int) extends Serializable { }
Example 98
Source File: CountingInAStreamExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.functions._ object CountingInAStreamExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")) // Generate running word count val wordCounts = messageDs.groupBy("value").count() // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("complete") .format("console") .start() query.awaitTermination() } }
Example 99
Source File: CountingInAStreamDatasetExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.functions._ object CountingInAStreamDatasetExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String].map(line => { MessageBuilder.build(line) }).as[Message] val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price")) val ticketOutput = tickerCount.writeStream .format("Console") .trigger(Trigger.ProcessingTime("5 seconds")) .option("checkpointLocation", checkpointFolder) .outputMode("complete") .format("console") .start() ticketOutput.awaitTermination() } }
Example 100
Source File: CountingInAStreamExpWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp import org.apache.spark.sql.functions._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} object CountingInAStreamExpWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[5]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[5]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .option("includeTimestamp", true) .load() val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => { MessageBuilder.build(line._1, line._2) }).filter(r => r != null).as[Message] val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp")) .withWatermark("eventTime", "30 seconds") .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker") .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window") val ticketOutput = tickerCount.writeStream .format("Console") .option("checkpointLocation", checkpointFolder) .outputMode("update") //.outputMode("complete") .format("console") .option("truncate", false) .option("numRows", 40) .start() ticketOutput.awaitTermination() } }
Example 101
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object ZombieExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)( (vertexId, zombieState, message) => { if (message > 0 && !zombieState.isZombie) { new ZombieStats(true, message) } else { zombieState } }, triplet => { if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) { Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l)) } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) { Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l)) } else { Iterator.empty } }, (a, b) => Math.min(a, b)) println("ZombieBite") zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",ZobmieStat:" + r._2) }) sparkSession.stop() } } case class ZombieStats (isZombie:Boolean, lengthOfLife:Long)
Example 102
Source File: TrianglesExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object TrianglesExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) println("TriangleCount") graph.triangleCount().vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",triangleCount:" + r._2) }) graph.pageRank(1.1, 1.1) sparkSession.stop() } }
Example 103
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint

import breeze.linalg.{
  Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV,
  SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum
}

import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    // 1. Set up the Spark context.
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2. Load the training data.
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))

    // 3. Set the training parameters and build the model.
    // opts: training options (batch size, number of epochs, validation split).
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4. Evaluate the model.
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }

}
Example 104
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
Example 105
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
Example 106
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 107
Source File: MannWhitneyUTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import org.apache.commons.math3.stat.inference.MannWhitneyUTest import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} object MannWhitneyUTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { testMannWhitneyU testMannWhitneyUTest } private def testMannWhitneyU(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyU(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyU(rdd1, rdd2) assert(result == result2) } private def testMannWhitneyUTest(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyUTest(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyUTest(rdd1, rdd2) println(result) println(result2) assert(result == result2) } }
Example 108
Source File: TTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} object TTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneSampleTTest twoIndependentSampleTTest pairedTwoSampleTTest } def OneSampleTTest(): Unit ={ val observed = Array(100d, 200d, 300d, 400d) val mu = 2.5d assert(TestUtils.tTest(mu, observed, 0.05) == new OneSampleTTest().tTest(mu, sc.parallelize(observed), 0.05)) assert(TestUtils.tTest(mu, observed) == new OneSampleTTest().tTest(mu, sc.parallelize(observed))) } def twoIndependentSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 205d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.tTest(sample1, sample2, 0.05) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.tTest(sample1, sample2) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2)) } def pairedTwoSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 202d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.pairedTTest(sample1, sample2, 0.05) == new PairTwoSampleTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.pairedTTest(sample1, sample2) == new PairTwoSampleTTest().tTest(rdd1, rdd2)) } }
Example 109
Source File: ANOVASuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import java.util import main.ANOVA.OneWayANOVA import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.stat.OneSampleTTest import org.apache.spark.{SparkContext, SparkConf} object ANOVASuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneWayANOVA } def OneWayANOVA(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 200d, 300d, 400d) val sample3 = Array(102d, 200d, 300d, 400d) val data = new util.ArrayList[Array[Double]]() data.add(sample1) data.add(sample2) data.add(sample3) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val rdd3 = sc.parallelize(sample3) val rddData = Seq(rdd1, rdd2, rdd3) assert(TestUtils.oneWayAnovaFValue(data) == new OneWayANOVA().anovaFValue(rddData)) assert(TestUtils.oneWayAnovaPValue(data) == new OneWayANOVA().anovaPValue(rddData)) } }
Example 110
Source File: ProxyPlugin.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.plugin

import java.io.File

import com.brsanthu.googleanalytics.GoogleAnalytics
import com.testerhome.appcrawler.URIElement
import com.testerhome.appcrawler.Plugin
import net.lightbody.bmp.BrowserMobProxyServer
import net.lightbody.bmp.proxy.CaptureType
import org.apache.log4j.{BasicConfigurator, Level, Logger}

import scala.util.Try

class ProxyPlugin extends Plugin {
  private var proxy: BrowserMobProxyServer = _
  val port = 7777

  // TODO: support an upstream proxy
  override def start(): Unit = {
    BasicConfigurator.configure()
    Logger.getRootLogger.setLevel(Level.INFO)
    Logger.getLogger("ProxyServer").setLevel(Level.WARN)

    proxy = new BrowserMobProxyServer()
    proxy.setHarCaptureTypes(CaptureType.getNonBinaryContentCaptureTypes)
    proxy.setTrustAllServers(true)
    proxy.start(port)
    //proxy.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes)
    //proxy.setHarCaptureTypes(CaptureType.getHeaderCaptureTypes)
    log.info(s"proxy server listen on ${port}")
    proxy.newHar("start")
  }

  override def beforeElementAction(element: URIElement): Unit = {
    log.info("clear har")
    proxy.endHar()
    // create a new har
    val harFileName = getCrawler().getBasePathName() + ".har"
    proxy.newHar(harFileName)
  }

  override def afterElementAction(element: URIElement): Unit = {
    log.info("save har")
    val harFileName = getCrawler().getBasePathName() + ".har"
    val file = new File(harFileName)
    try {
      log.info(proxy.getHar)
      log.info(proxy.getHar.getLog)
      log.info(proxy.getHar.getLog.getEntries.size())
      log.info(s"har entry size = ${proxy.getHar.getLog.getEntries.size()}")
      if (proxy.getHar.getLog.getEntries.size() > 0) {
        proxy.getHar.writeTo(file)
      }
    } catch {
      case e: Exception =>
        log.error("read har error")
        log.error(e.getCause)
        log.error(e.getMessage)
        e.getStackTrace.foreach(log.error)
    }
  }

  override def stop(): Unit = {
    log.info("proxy stop")
    proxy.stop()
  }
}
Example 111
Source File: TestGA.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.ut

import com.brsanthu.googleanalytics.{GoogleAnalytics, PageViewHit}
import org.apache.log4j.{BasicConfigurator, Level, Logger}
import org.scalatest.FunSuite

class TestGA extends FunSuite {
  test("google analyse") {
    println("ga start")
    BasicConfigurator.configure()
    Logger.getRootLogger().setLevel(Level.WARN)
    val ga = new GoogleAnalytics("UA-74406102-1")
    1 to 10 foreach (x => {
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/demo${x}", "test"))
    })
    Thread.sleep(10000)
    1 to 10 foreach (x => {
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem1${x}", "test"))
    })
    Thread.sleep(10000)
    1 to 10 foreach (x => {
      ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem2${x}", "test"))
    })
    //ga.post(new PageViewHit("http://appcrawler.io/test2", "test"))
    println("ga end")
  }
}
Example 112
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 113
Source File: ZeroMQWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming.zeromq import scala.language.implicitConversions import scala.util.Random import org.apache.log4j.{Level, Logger} import org.zeromq.ZContext import org.zeromq.ZMQ import org.zeromq.ZMQException import org.zeromq.ZMsg import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.zeromq.ZeroMQUtils object ZeroMQWordCount { def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>") // scalastyle:on println System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath). Logger.getRootLogger.setLevel(Level.WARN) val Seq(url, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("ZeroMQWordCount") // Check Spark configuration for master URL, set it to local if not present. if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } // Create the context and set the batch size. val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = ZeroMQUtils.createTextStream( ssc, url, true, Seq(topic.getBytes) ) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 114
Source File: TwitterLocations.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import twitter4j.FilterQuery import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ object TwitterLocations { def main(args: Array[String]) { if (args.length < 4 || args.length % 4 != 0) { System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " + "<access token> <access token secret> " + "[<latitude-south-west> <longitude-south-west>" + " <latitude-north-east> <longitude-north-east> ...]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) // Get bounding boxes of locations for which to retrieve Tweets from command line val locationArgs = args.takeRight(args.length - 4) val boundingBoxes = if (locationArgs.length == 0) { System.out.println("No location bounding boxes specified, using defaults for New York City") val nycSouthWest = Array(-74.0, 40.0) val nycNorthEast = Array(-73.0, 41.0) Array(nycSouthWest, nycNorthEast) } else { locationArgs.map(_.toDouble).sliding(2, 2).toArray } val sparkConf = new SparkConf().setAppName("TwitterLocations") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val locationsQuery = new FilterQuery().locations(boundingBoxes : _*) // Print Tweets from the specified coordinates // This includes Tweets geo-tagged in the bounding box defined by the coordinates // As well as Tweets tagged in places inside of the bounding box TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery)) .map(tweet => { val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}") val place = Option(tweet.getPlace).map(_.getName) val location = latitude.getOrElse(place.getOrElse("(no location)")) val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') s"$location\t$text" }) .print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 115
Source File: TwitterAlgebirdHLL.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._

object TwitterAlgebirdHLL {
  def main(args: Array[String]) {
    // scalastyle:off
    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll.create(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 116
Source File: TwitterPopularTags.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf object TwitterPopularTags { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " + "<access token> <access token secret> [<filters>]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) val filters = args.takeRight(args.length - 4) // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) val sparkConf = new SparkConf().setAppName("TwitterPopularTags") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val stream = TwitterUtils.createStream(ssc, None, filters) val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) // Print popular hashtags topCounts60.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) topCounts10.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 117
Source File: TrainNewsClassWithDTDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining import config.paramconf.ClassParams import functions.Preprocessor import org.apache.log4j.{Level, Logger} import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature._ import org.apache.spark.sql.SparkSession object TrainNewsClassWithDTDemo { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.WARN) val spark = SparkSession .builder .master("local[2]") .appName("train news with DT Demo") .getOrCreate() val args = Array("ckooc-ml/data/classnews/train") val filePath = args(0) import spark.implicits._ val data = spark.sparkContext.textFile(filePath).flatMap { line => val tokens: Array[String] = line.split("\u00ef") if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None }.toDF("label", "title", "time", "content") data.persist() val preprocessor = new Preprocessor val pipeline = preprocessor.preprocess(data) // DT模型训练 val params = new ClassParams val dtClassifier = new DecisionTreeClassifier() .setMinInfoGain(params.minInfoGain) .setMaxDepth(params.maxDepth) //目前Spark只支持最大30层深度 .setLabelCol("indexedLabel") .setFeaturesCol("features") val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel] //索引标签化 val labelConverter = new IndexToString() .setLabels(indexModel.labels) .setInputCol(dtClassifier.getPredictionCol) .setOutputCol("predictedLabel") val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter) pipeline.setStages(stages) val model = pipeline.fit(data) model.write.overwrite().save(params.DTModelPath) data.unpersist() spark.stop() } }
Example 118
Source File: PredictNewsClassDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}

object PredictNewsClassDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")

    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams
    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("Invalid model type!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // load the model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    //=== model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map {
      case Row(prediction: Double, label: Double) => (prediction, label)
    }

    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= Evaluation results ==========")
    println(s"\nWeighted precision: $precision")
    println(s"Weighted recall: $recall")
    println(s"F1 score: $f1")

    // predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()
    spark.stop()
  }
}
Example 119
Source File: TrainNewsClassWithLRDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining import config.paramconf.ClassParams import functions.Preprocessor import org.apache.log4j.{Level, Logger} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature._ import org.apache.spark.sql.SparkSession object TrainNewsClassWithLRDemo extends Serializable { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.WARN) val spark = SparkSession .builder .master("local[2]") .appName("train news with LR Demo") .getOrCreate() val args = Array("ckooc-ml/data/classnews/train") val filePath = args(0) import spark.implicits._ val data = spark.sparkContext.textFile(filePath).flatMap { line => val tokens: Array[String] = line.split("\u00ef") if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None }.toDF("label", "title", "time", "content") data.persist() val preprocessor = new Preprocessor val pipeline = preprocessor.preprocess(data) //LR模型训练 val params = new ClassParams val logisticRegression = new LogisticRegression() .setTol(params.converTol) .setMaxIter(params.maxIteration) .setRegParam(params.regParam) .setElasticNetParam(params.elasticNetParam) .setLabelCol("indexedLabel") .setFeaturesCol("features") val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel] //索引标签化 val labelConverter = new IndexToString() .setLabels(indexModel.labels) .setInputCol(logisticRegression.getPredictionCol) .setOutputCol("predictedLabel") val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter) pipeline.setStages(stages) val model = pipeline.fit(data) model.write.overwrite().save(params.LRModelPath) data.unpersist() spark.stop() } }
Example 120
Source File: StarsAnalysisDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.analysis import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter} import functions.segment.Segmenter import org.apache.log4j.{Level, Logger} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SparkSession} object StarsAnalysisDemo { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val spark = SparkSession .builder .master("local[2]") .appName("Stars Analysis Demo") .getOrCreate() val filePath = "E:/data/chinaNews/entertainment.txt" // 加载数据,并保留年份和内容字段,并对内容字段进行过滤 import spark.implicits._ val data = spark.sparkContext.textFile(filePath).flatMap { line => val tokens: Array[String] = line.split("\u00ef") if (tokens.length > 3) { var year: String = tokens(2).split("-")(0) if (tokens(2).contains("年")) year = tokens(2).split("年")(0) var content = tokens(3) if (content.length > 22 && content.substring(0, 20).contains("日电")) { content = content.substring(content.indexOf("日电") + 2, content.length).trim } if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length) if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) { content = content.substring(0, content.lastIndexOf("记者")).trim } Some(year, content) } else None }.toDF("year", "content") // 分词,去除长度为1的词,每个词保留词性 val segmenter = new Segmenter() .isAddNature(true) .isDelEn(true) .isDelNum(true) .setMinTermLen(2) .setMinTermNum(5) .setSegType("StandardSegment") .setInputCol("content") .setOutputCol("segmented") val segDF: DataFrame = segmenter.transform(data) segDF.cache() val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map { case Row(year: String, terms: Seq[String]) => (Integer.parseInt(year), terms) } val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect() val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt"))) result.foreach(line => writer.write(line + "\n")) writer.close() // 统计2016出现在新闻中最多的明星 val stars2016 = segRDD.filter(_._1 == 2016) .flatMap { case (year: Int, termStr: Seq[String]) => val person = termStr .map(term => (term.split("/")(0), term.split("/")(1))) .filter(_._2.equalsIgnoreCase("nr")) .map(term => (term._1, 1L)) person } .reduceByKey(_ + _) .sortBy(_._2, ascending = false) segDF.unpersist() stars2016.take(100).foreach(println) spark.stop() } }
Example 121
Source File: NLPPreprocessTest.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package nlp

import com.hankcs.hanlp.utility.Predefine
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.junit.Test

import scala.reflect.io.File

class NLPPreprocessTest {

  @Test
  def testSegmenter(): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Segment Demo")
      .getOrCreate()

    val text = Seq(
      (0, "这段文本是用来做分词测试的!This text is for test!"),
      (1, "江州市长江大桥参加长江大桥通车仪式"),
      (2, "他邀请了不少于10个明星,有:范冰冰、赵薇、周杰伦等,还有20几位商业大佬")
    )
    val sentenceData = spark.createDataFrame(text).toDF("id", "sentence")

    // set the HanLP configuration file path (by default it is looked up on the classpath)
    val path = this.getClass.getClassLoader.getResource("").getPath
    Predefine.HANLP_PROPERTIES_PATH = path + File.separator + "hanlp.properties"

    val segmenter = new Segmenter()
      .isDelEn(true)
      .isDelNum(true)
      .isAddNature(true)
      .setSegType("StandardSegment")
      .setMinTermLen(2)
      .setMinTermNum(3)
      .setInputCol("sentence")
      .setOutputCol("segmented")

    segmenter.transform(sentenceData).show(false)
    spark.stop()
  }
}
Example 122
Source File: printMatrix.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}
import breeze.linalg.{DenseMatrix => BDM, kron}

object printMatrix {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(0), Example.Vector2Tensor(
        Vectors.dense(arr.slice(1, 785).map(v => if (v > 200) 1.0 else 0)))(0)))

    val lines2 = sc.textFile("dataset/train.format", 8)
    val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0)))

    data2.take(10).foreach(record => {
      println("label: " + record._1)
      val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt))
      // render zero pixels as '.' and one-valued pixels as '*'
      val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*')
      println(str)
    })
  }
}
Example 123
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }
Example 124
Source File: Driver.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.log4j.{Logger, Level} import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.{SparkContext, SparkConf} object CNNDriver { def main(args: Array[String]) { val myLayers = new Array[Layer](8) myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28)) myLayers(1) = new FunctionalLayer(new SigmoidFunction()) myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24)) myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12)) myLayers(4) = new FunctionalLayer(new SigmoidFunction()) myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8)) myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4)) myLayers(7) = new FunctionalLayer(new SigmoidFunction()) val topology = FeedForwardTopology(myLayers) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => { val target = new Array[Double](12) target(arr(784).toInt) = 1 val in = Vector2BDM(Vectors.dense(arr.slice(0, 784))) (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target)) }).cache() val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12) feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data // .LBFGSOptimizer.setNumIterations(20) .SGDOptimizer .setMiniBatchFraction(0.002) .setConvergenceTol(0) .setNumIterations(1000) .setUpdater(new CNNUpdater(0.85)) for(iter <- 1 to 1000){ val start = System.nanoTime() val mlpModel = feedForwardTrainer.train(data) feedForwardTrainer.setWeights(mlpModel.weights()) println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9) // predict val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count() val precision = right.toDouble / data.count() println(s"right: $right, count: ${data.count()}, precision: $precision") } } def Vector2BDM(record: Vector): BDM[Double] = { val mapSize = new MapSize(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } m } }
Example 125
Source File: MnistCSVDriver.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.{SparkConf, SparkContext} object MnistCSVDriver { def main(args: Array[String]) { val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28))) topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0)))) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) } }
Example 126
Source File: LoggerOutputStream.scala From hail with MIT License | 5 votes |
package is.hail.utils

import java.io.{ByteArrayOutputStream, OutputStream}
import java.nio.charset.StandardCharsets

import org.apache.log4j.{Level, Logger}

class LoggerOutputStream(logger: Logger, level: Level) extends OutputStream {
  private val buffer = new ByteArrayOutputStream()

  override def write(b: Int) {
    buffer.write(b)
    if (b == '\n') {
      val line = buffer.toString(StandardCharsets.UTF_8.name())
      level match {
        case Level.TRACE => logger.trace(line)
        case Level.DEBUG => logger.debug(line)
        case Level.INFO => logger.info(line)
        case Level.WARN => logger.warn(line)
        case Level.ERROR => logger.error(line)
      }
      buffer.reset()
    }
  }
}
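A sketch of how the LoggerOutputStream above could be wired to route a PrintStream through log4j; the logger name and message are illustrative assumptions, not part of the original project:

import java.io.PrintStream

import is.hail.utils.LoggerOutputStream
import org.apache.log4j.{Level, Logger}

// Hypothetical usage of the LoggerOutputStream class defined above.
object LoggerOutputStreamUsage {
  def main(args: Array[String]): Unit = {
    val log = Logger.getLogger("is.hail.utils") // logger name chosen for illustration
    val out = new PrintStream(new LoggerOutputStream(log, Level.INFO), true)
    out.println("each completed line is forwarded to log4j at INFO level")
  }
}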
Example 127
Source File: APSPSpec.scala From spark-all-pairs-shortest-path with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry} import org.scalatest.{Outcome, FlatSpec} import AllPairsShortestPath._ import breeze.linalg.{DenseMatrix => BDM} class APSPSpec extends FlatSpec { val conf = new SparkConf().setAppName("AllPairsShortestPath").setMaster("local[4]").set("spark.driver.allowMultipleContexts", "true") val sc = new SparkContext(conf) override def withFixture(test: NoArgTest) : Outcome = { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) try { test() // invoke the test function } } def fourByFourBlockMatrx = { val entries = sc.parallelize(Array( (0, 1, 20), (0, 2, 4), (0, 3, 2), (1, 0, 2), (1, 2, 1), (1, 3, 3), (2, 0, 1), (2, 1, 6), (2, 3, 5), (3, 0, 4), (3, 1, 2), (3, 2, 2))).map { case (i, j, v) => MatrixEntry(i, j, v) } val coordMat = new CoordinateMatrix(entries) val matA = coordMat.toBlockMatrix(2, 2).cache() matA } def ApspPartitioner = { GridPartitioner(fourByFourBlockMatrx.numRowBlocks, fourByFourBlockMatrx.numColBlocks, fourByFourBlockMatrx.blocks.partitions.length) } def toBreeze(A: Matrix): BDM[Double] = { new BDM[Double](A.numRows, A.numCols, A.toArray) } "The sample 4x4 Block Matrix" should "be valid" in { fourByFourBlockMatrx.validate() } it should "match our APSP matrix" in { println(fourByFourBlockMatrx.toLocalMatrix()) val result = new DistributedBlockFW val observed = toBreeze(result.compute(fourByFourBlockMatrx).toLocal()) val expected = BDM( (0.0, 4.0, 4.0, 2.0), (2.0, 0.0, 1.0, 3.0), (1.0, 5.0, 0.0, 3.0), (3.0, 2.0, 2.0, 0.0) ) assert(observed === expected) } }
Example 128
Source File: Part5_BusinessRecommendations.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.log4j.{Level, Logger} import org.opencypher.morpheus.api.{GraphSources, MorpheusSession} import org.opencypher.morpheus.integration.yelp.YelpConstants._ import org.opencypher.okapi.api.value.CypherValue.CypherInteger import org.opencypher.okapi.neo4j.io.MetaLabelSupport._ import org.opencypher.okapi.neo4j.io.Neo4jHelpers.{cypher => neo4jCypher, _} object Part5_BusinessRecommendations extends App { Logger.getRootLogger.setLevel(Level.ERROR) log("Part 5 - Business Recommendation") lazy val inputPath = args.headOption.getOrElse(defaultYelpGraphFolder) implicit val morpheus: MorpheusSession = MorpheusSession.local() import morpheus._ registerSource(fsNamespace, GraphSources.fs(inputPath).parquet) registerSource(neo4jNamespace, GraphSources.cypher.neo4j(neo4jConfig)) val year = 2017 log("Write to Neo4j, detect communities and find similar users within communities", 1) cypher( s""" |CATALOG CREATE GRAPH $neo4jNamespace.${coReviewAndBusinessGraphName(year)} { | FROM $fsNamespace.${coReviewAndBusinessGraphName(year)} | RETURN GRAPH |} """.stripMargin) // Use Neo4j Graph Algorithms to compute Louvain clusters and Jaccard similarity within clusters neo4jConfig.withSession { implicit session => log("Find communities via Louvain", 1) val louvainStats = neo4jCypher( s""" |CALL algo.louvain('${coReviewAndBusinessGraphName(year).metaLabel}', 'CO_REVIEWS', { | write: true, | weightProperty: 'reviewCount', | writeProperty: '${communityProp(year)}' |}) |YIELD communityCount, nodes, loadMillis, computeMillis, writeMillis |RETURN communityCount, nodes, loadMillis + computeMillis + writeMillis AS total""".stripMargin).head log(s"Computing Louvain modularity on ${louvainStats("nodes")} nodes took ${louvainStats("total")} ms", 1) val communityNumber = louvainStats("communityCount").cast[CypherInteger].value.toInt log(s"Find similar users within $communityNumber communities", 1) // We use Jaccard similarity because it doesn't require equal length vectors (0 until communityNumber).foreach { communityNumber => neo4jCypher( s""" |MATCH (u:User)-[r:REVIEWS]->(b:Business) |WHERE u.${communityProp(year)} = $communityNumber |WITH { item: id(u), categories: collect(id(b))} AS userData |WITH collect(userData) AS data |CALL algo.similarity.jaccard(data, { | similarityCutoff: 0.5, | write: true, | writeRelationshipType: '${isSimilarRelType(year)}'}) |YIELD similarityPairs |RETURN similarityPairs """.stripMargin ) } } log("Load graphs back to Spark and compute recommendations", 1) // Reset schema cache to enable loading new properties catalog.source(neo4jNamespace).reset() val recommendations = cypher( s""" |FROM GRAPH $neo4jNamespace.${coReviewAndBusinessGraphName(year)} |MATCH (u:User)-[:${isSimilarRelType(year)}]-(o:User), | (o:User)-[r:REVIEWS]->(b:Business) |WHERE NOT((u)<-[:REVIEWS]-(b:Business)) AND r.stars > 3 |WITH id(u) AS user_id, u.name AS name, collect(DISTINCT b.name) AS recommendations |RETURN name AS user, recommendations |ORDER BY user_id DESC |LIMIT 10 """.stripMargin ) recommendations.show }
Example 129
Source File: Part4_BusinessTrends.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.log4j.{Level, Logger} import org.opencypher.morpheus.api.{GraphSources, MorpheusSession} import org.opencypher.morpheus.integration.yelp.YelpConstants._ import org.opencypher.okapi.api.value.CypherValue.CypherFloat import org.opencypher.okapi.neo4j.io.MetaLabelSupport._ import org.opencypher.okapi.neo4j.io.Neo4jHelpers.{cypher => neo4jCypher, _} object Part4_BusinessTrends extends App { Logger.getRootLogger.setLevel(Level.ERROR) log("Part 4 - Business trends") lazy val inputPath = args.headOption.getOrElse(defaultYelpGraphFolder) implicit val morpheus: MorpheusSession = MorpheusSession.local() import morpheus._ registerSource(fsNamespace, GraphSources.fs(inputPath).parquet) registerSource(neo4jNamespace, GraphSources.cypher.neo4j(neo4jConfig)) log("Write to Neo4j and compute pageRank", 1) (2017 to 2018) foreach { year => log(s"For year $year", 2) cypher( s""" |CATALOG CREATE GRAPH $neo4jNamespace.${coReviewedGraphName(year)} { | FROM $fsNamespace.${coReviewedGraphName(year)} | RETURN GRAPH |} """.stripMargin) // Compute PageRank using Neo4j Graph Algorithms neo4jConfig.withSession { implicit session => val pageRankStats = neo4jCypher( s""" |CALL algo.pageRank('${coReviewedGraphName(year).metaLabel}', null, { | iterations: 20, | dampingFactor: 0.85, | direction: "BOTH", | write: true, | writeProperty: "pageRank$year", | weightProperty: "reviewCount" |}) |YIELD nodes, loadMillis, computeMillis, writeMillis |RETURN nodes, loadMillis + computeMillis + writeMillis AS total""".stripMargin).head log(s"Computing page rank on ${pageRankStats("nodes")} nodes took ${pageRankStats("total")} ms", 2) } } // Reset schema cache to enable loading new properties catalog.source(neo4jNamespace).reset() // Load graphs from Neo4j into Spark and compute trend rank for each business based on their page ranks. log("Load graphs back to Spark and compute trend rank", 1) cypher( s""" |CATALOG CREATE GRAPH $businessTrendsGraphName { | FROM GRAPH $neo4jNamespace.${coReviewedGraphName(2017)} | MATCH (b1:Business) | FROM GRAPH $neo4jNamespace.${coReviewedGraphName(2018)} | MATCH (b2:Business) | WHERE b1.businessId = b2.businessId | WITH b1 AS b, (b2.${pageRankProp(2018)} / ${normalizationFactor(2018)}) - (b1.${pageRankProp(2017)} / ${normalizationFactor(2017)}) AS trendRank | CONSTRUCT | CREATE (newB COPY OF b) | SET newB.trendRank = trendRank | RETURN GRAPH |} """.stripMargin) // Top 10 Increasing popularity cypher( s""" |FROM GRAPH $businessTrendsGraphName |MATCH (b:Business) |RETURN b.name AS name, b.address AS address, b.trendRank AS trendRank |ORDER BY trendRank DESC |LIMIT 10 """.stripMargin).show def normalizationFactor(year: Int): Double = neo4jConfig.cypherWithNewSession( s""" |MATCH (b:Business) |RETURN sum(b.${pageRankProp(year)}) AS nf """.stripMargin).head("nf").cast[CypherFloat].value }
Example 130
Source File: Part1_YelpImport.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.opencypher.morpheus.api.io.GraphElement._ import org.opencypher.morpheus.api.io.MorpheusElementTable import org.opencypher.morpheus.api.io.Relationship._ import org.opencypher.morpheus.api.{GraphSources, MorpheusSession} import org.opencypher.morpheus.integration.yelp.YelpConstants._ import org.opencypher.morpheus.integration.yelp.YelpHelpers._ import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph} import org.opencypher.okapi.api.io.conversion.{NodeMappingBuilder, RelationshipMappingBuilder} object Part1_YelpImport extends App { Logger.getRootLogger.setLevel(Level.ERROR) log("Part 1 - Import") lazy val inputPath = args.headOption.getOrElse(defaultYelpJsonFolder) lazy val outputPath = args.lift(1).getOrElse(defaultYelpGraphFolder) implicit val morpheus: MorpheusSession = MorpheusSession.local() implicit val spark: SparkSession = morpheus.sparkSession storeGraph(inputPath, outputPath) def storeGraph(inputPath: String, outputPath: String): Unit = { // Load Yelp data into DataFrames log("Load yelp tables", 1) val yelpTables = loadYelpTables(inputPath) // Create a Property Graph from DataFrames log("Create property graph", 1) val propertyGraph = createPropertyGraph(yelpTables) log("Store in parquet", 1) storeAsParquet(yelpGraphName, propertyGraph) } def storeAsParquet(graphName: GraphName, graph: PropertyGraph): Unit = { // Init Property Graph Data Source (PGDS) val parquetPGDS = GraphSources.fs(outputPath).parquet // Store graph in PGDS if (parquetPGDS.hasGraph(graphName)) { log(s"Warning: A graph with GraphName $graphName already exists.") } else { parquetPGDS.store(yelpGraphName, graph) } } def createPropertyGraph(yelpTables: YelpTables): PropertyGraph = { // Define node tables // (:User) val userNodeTable = MorpheusElementTable.create(NodeMappingBuilder.on(sourceIdKey) .withImpliedLabel(userLabel) .withPropertyKey("name") .withPropertyKey("yelping_since") .withPropertyKey("elite") .build, yelpTables.userDf.prependIdColumn(sourceIdKey, userLabel)) // (:Business) val businessNodeTable = MorpheusElementTable.create(NodeMappingBuilder.on(sourceIdKey) .withImpliedLabel(businessLabel) .withPropertyKey("businessId", "business_id") .withPropertyKey("name") .withPropertyKey("address") .withPropertyKey("city") .withPropertyKey("state") .build, yelpTables.businessDf.prependIdColumn(sourceIdKey, businessLabel)) // Define relationship tables // (:User)-[:REVIEWS]->(:Business) val reviewRelTable = MorpheusElementTable.create(RelationshipMappingBuilder.on(sourceIdKey) .withSourceStartNodeKey(sourceStartNodeKey) .withSourceEndNodeKey(sourceEndNodeKey) .withRelType(reviewRelType) .withPropertyKey("stars") .withPropertyKey("date") .build, yelpTables.reviewDf .prependIdColumn(sourceIdKey, reviewRelType) .prependIdColumn(sourceStartNodeKey, userLabel) .prependIdColumn(sourceEndNodeKey, businessLabel)) // Create property graph morpheus.graphs.create(businessNodeTable, userNodeTable, reviewRelTable) } }
Example 131
Source File: SparseNaiveBayes.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 132
Source File: DenseKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 133
Source File: StreamingExamples.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
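A minimal sketch of a streaming driver calling the helper above before building its context; the app name, master, host, port, and batch interval are all assumptions made for illustration:

import org.apache.spark.SparkConf
import org.apache.spark.examples.streaming.StreamingExamples
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical driver showing where setStreamingLogLevels() is typically invoked.
object StreamingLogLevelDemo {
  def main(args: Array[String]): Unit = {
    StreamingExamples.setStreamingLogLevels() // quiet the default console output first
    val sparkConf = new SparkConf().setAppName("StreamingLogLevelDemo").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val lines = ssc.socketTextStream("localhost", 9999) // host and port are assumptions
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}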
Example 134
Source File: YarnScheduler.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
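The guard above, which lowers a third-party class's logger only when no level was configured explicitly, generalizes to any chatty dependency. A hedged sketch; the helper name is an assumption:

import org.apache.log4j.{Level, Logger}

// Hypothetical helper: set a class's logger to WARN unless log4j.properties already assigned a level.
object QuietIfUnconfigured {
  def apply(clazz: Class[_], level: Level = Level.WARN): Unit = {
    val logger = Logger.getLogger(clazz)
    if (logger.getLevel == null) {
      logger.setLevel(level)
    }
  }
}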
Example 135
Source File: ClientArguments.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.annotation.tailrec import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 136
Source File: Application.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Application extends App {

  Logger.getLogger("org.apache.spark.SparkContext").setLevel(Level.WARN)

  val parsedArgs = ArgsParser.validateArgs(ArgsParser.parseArgs(args.toList))

  val conf = new SparkConf()
    .set("spark.ui.showConsoleProgress", "true")
    .setAppName("data-faker")

  val spark: SparkSession = SparkSession
    .builder()
    .config(conf)
    .enableHiveSupport()
    .getOrCreate()

  spark.sparkContext.setLogLevel("OFF")
  spark.sql(s"create database if not exists ${parsedArgs("database")}")

  val schema = YamlParser.parseSchemaFromFile(parsedArgs("file"))
  val dataGenerator = new DataGenerator(spark, parsedArgs("database"))
  dataGenerator.generateAndWriteDataFromSchema(schema)
}
Example 137
Source File: LogUtils.scala From Spark-MLlib-Twitter-Sentiment-Analysis with Apache License 2.0 | 5 votes |
package org.p7h.spark.sentiment.utils

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{Logging, SparkContext}

object LogUtils extends Logging {

  def setLogLevels(sparkContext: SparkContext) {
    sparkContext.setLogLevel(Level.WARN.toString)
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      logInfo(
        """Setting log level to [WARN] for streaming executions.
          |To override add a custom log4j.properties to the classpath.""".stripMargin)
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 138
Source File: LinearRegression.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater} spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } }
Example 139
Source File: SparseNaiveBayes.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } }
Example 140
Source File: DenseKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } }
Example 141
Source File: StreamingExamples.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
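A minimal sketch of how such a helper is typically called from a streaming example's main method; the application name, object name and batch interval below are illustrative placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyStreamingApp {
  def main(args: Array[String]): Unit = {
    // Quiet the default logging before the streaming job produces output.
    StreamingExamples.setStreamingLogLevels()
    val conf = new SparkConf().setAppName("MyStreamingApp")
    val ssc = new StreamingContext(conf, Seconds(2))
    // ... build the DStream pipeline here ...
    ssc.start()
    ssc.awaitTermination()
  }
}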
Example 142
Source File: YarnScheduler.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
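The guard above, overriding a level only when none has been configured explicitly, generalizes to any chatty logger. A small sketch of that idiom; the helper name and the logger name passed at the end are illustrative:

import org.apache.log4j.{Level, Logger}

// Lower a logger's verbosity only when log4j.properties does not already set a level for it.
def quietUnlessConfigured(loggerName: String, level: Level = Level.WARN): Unit = {
  val logger = Logger.getLogger(loggerName)
  if (logger.getLevel == null) {
    logger.setLevel(level)
  }
}

quietUnlessConfigured("org.apache.hadoop.yarn.util.RackResolver")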
Example 143
Source File: ClientArguments.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin System.err.println(usage) System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = 512 // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 144
Source File: driver.scala From SparkSMOTE with MIT License | 5 votes |
import java.io._ import utils._ import SMOTE._ import org.apache.log4j.Logger import org.apache.log4j.Level import breeze.linalg._ import org.apache.spark.SparkConf import org.apache.spark.SparkContext import scala.collection.mutable.{ArrayBuffer,Map} object driver { def main(args: Array[String]) { val conf = new SparkConf() val options = args.map { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) case Array(opt) => (opt -> "") case _ => throw new IllegalArgumentException("Invalid argument: "+arg) } }.toMap val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val sc = new SparkContext(conf) // read in general inputs val inputDirectory = options.getOrElse("inputDirectory","") val outputDirectory = options.getOrElse("outputDirectory","") val numFeatures = options.getOrElse("numFeatures","0").toInt val oversamplingPctg = options.getOrElse("oversamplingPctg","1.0").toDouble val kNN = options.getOrElse("K","5").toInt val delimiter = options.getOrElse("delimiter",",") val numPartitions = options.getOrElse("numPartitions","20").toInt SMOTE.runSMOTE(sc, inputDirectory, outputDirectory, numFeatures, oversamplingPctg, kNN, delimiter, numPartitions) println("The algorithm has finished running") sc.stop() } }
Example 145
Source File: DQMainClass.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils import java.util.Locale import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveContext trait DQMainClass { this: DQSparkContext with Logging => private def initLogger(): Unit = { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("io.netty").setLevel(Level.OFF) Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF) Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF) } private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = { if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration) else { if (settings.s3Bucket.isDefined) { sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get) sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") } FileSystem.get( sc.hadoopConfiguration) } } protected def body()(implicit fs: FileSystem, sparkContext: SparkContext, sqlContext: SQLContext, sqlWriter: HistoryDBManager, settings: DQSettings): Boolean def preMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Starting execution of task $task") log.warn("************************************************************************") } def postMessage(task: String): Unit = { log.warn("************************************************************************") log.warn(s" Finishing execution of task $task") log.warn("************************************************************************") } def main(args: Array[String]): Unit = { // set to avoid casting problems in metric result name generation Locale.setDefault(Locale.ENGLISH) initLogger() DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("","")) match { case Some(commandLineOptions) => // Load our own config values from the default location, application.conf val settings = new DQSettings(commandLineOptions) val sparkContext = makeSparkContext(settings) val fs = makeFileSystem(settings, sparkContext) settings.logThis()(log) val sqlContext: SQLContext = if (settings.hiveDir.isDefined) { val hc = new HiveContext(sparkContext) hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get) hc } else makeSqlContext(sparkContext) val historyDatabase = new HistoryDBManager(settings) // Starting application body preMessage(s"{${settings.appName}}") val startTime = System.currentTimeMillis() body()(fs, sparkContext, sqlContext, historyDatabase, settings) postMessage(s"{${settings.appName}}") log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)") log.info("Closing application...") historyDatabase.closeConnection() sparkContext.stop() log.info("Spark context were terminated. Exiting...") case None => log.error("Wrong parameters provided") throw new Exception("Wrong parameters provided") } } }
Example 146
Source File: TestSparkContext.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
// scalastyle:off header.matches
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.scalatest.Suite

trait TestSparkStreamingContext extends TestSparkContext {
  self: Suite =>

  implicit lazy val streaming: StreamingContext = StreamingContext.getActiveOrCreate(() =>
    new StreamingContext(sc, Seconds(1))
  )

  override def afterAll: Unit = {
    streaming.stop(stopSparkContext = false)
    super[TestSparkContext].afterAll
  }
}
Example 147
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with the L-BFGS optimizer for the loss function; supports
    // multiclass classification (BFGS is a rank-2 quasi-Newton method).
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a local vector (dense or sparse) with a label.
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
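The excerpt refers to training and test RDDs prepared in the elided part of the file. A hedged sketch of how such a split is commonly produced; the input path, split ratios and seed are illustrative, and a SparkContext named sc is assumed as in the surrounding example:

import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
training.cache()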
Example 148
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with the L-BFGS optimizer for the loss function; supports
    // multiclass classification (BFGS is a rank-2 quasi-Newton method).
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a local vector (dense or sparse) with a label.
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 149
Source File: SparseNaiveBayes.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because they will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()

    // e.g. numTraining = 81, numTest = 19.
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    // e.g. Test accuracy = 1.0.
    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 150
Source File: StreamingExamples.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 151
Source File: YarnScheduler.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 152
Source File: ClientArguments.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 153
Source File: Loggable.scala From meetup-stream with Apache License 2.0 | 5 votes |
package core

import org.apache.spark.Logging
import org.apache.log4j.{Level, Logger}

// The enclosing declaration is elided in the excerpt; a trait mixing in Logging is assumed
// here, since the body calls logInfo.
trait Loggable extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [ERROR] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
      Logger.getLogger("org").setLevel(Level.ERROR)
      Logger.getLogger("akka").setLevel(Level.ERROR)
      Logger.getLogger("streaming").setLevel(Level.WARN)
      Logger.getLogger("spark").setLevel(Level.WARN)
    }
  }
}
Example 154
Source File: RemoteAppender.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker.logging

import io.hydrosphere.mist.core.logging.LogEvent
import org.apache.log4j.spi.LoggingEvent
import org.apache.log4j.{AppenderSkeleton, Level, SimpleLayout}

class RemoteAppender(sourceId: String, logsWriter: LogsWriter) extends AppenderSkeleton {

  override def append(event: LoggingEvent): Unit = {
    val timeStamp = event.timeStamp
    val message = event.getRenderedMessage
    val evt = event.getLevel match {
      case Level.INFO => LogEvent.mkInfo(sourceId, message, timeStamp)
      case Level.DEBUG => LogEvent.mkDebug(sourceId, message, timeStamp)
      case Level.ERROR => LogEvent.mkError(
        sourceId, message,
        Option(event.getThrowableInformation).map(_.getThrowable),
        timeStamp
      )
      case Level.WARN => LogEvent.mkWarn(sourceId, message, timeStamp)
      case _ => LogEvent.mkInfo(sourceId, this.getLayout.format(event), timeStamp)
    }
    logsWriter.write(evt)
  }

  override def close(): Unit = ()

  override def requiresLayout(): Boolean = true
}

object RemoteAppender {

  def create(sourceId: String, logsWriter: LogsWriter): RemoteAppender = {
    val jobLogsAppender = new RemoteAppender(sourceId, logsWriter)
    jobLogsAppender.setLayout(new SimpleLayout)
    jobLogsAppender.setName(sourceId)
    jobLogsAppender
  }
}
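A minimal sketch of wiring the appender into log4j around a unit of work, assuming a LogsWriter instance from the same project is available; the helper name withRemoteLogging and the source id value are illustrative:

import org.apache.log4j.Logger

def withRemoteLogging[T](sourceId: String, writer: LogsWriter)(body: => T): T = {
  val appender = RemoteAppender.create(sourceId, writer)
  // Forward every root-logger event produced by `body` to the writer.
  Logger.getRootLogger.addAppender(appender)
  try body
  finally Logger.getRootLogger.removeAppender(appender)
}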
Example 155
Source File: SparseNaiveBayes.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 156
Source File: DenseKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 157
Source File: StreamingExamples.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 158
Source File: YarnScheduler.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 159
Source File: ClientArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.annotation.tailrec import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
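A short sketch of what the jar-URL check accepts and rejects. Since the object is private[deploy], these calls would have to live inside the org.apache.spark.deploy package, and the URLs are illustrative:

ClientArguments.isValidJarUrl("http://repo.example.com/jars/app.jar")  // true: scheme, path, .jar suffix
ClientArguments.isValidJarUrl("hdfs://namenode/jars/app")              // false: path does not end in .jar
ClientArguments.isValidJarUrl("not a valid uri.jar")                   // false: URI parsing fails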
Example 160
Source File: CypherParser.scala From ingraph with Eclipse Public License 1.0 | 5 votes |
package ingraph.compiler.cypher2gplan import java.io.ByteArrayInputStream import ingraph.compiler.exceptions.CompilerException import org.apache.log4j.{Level, Logger} import org.eclipse.emf.common.util.URI import org.eclipse.emf.ecore.resource.Resource import org.eclipse.xtext.diagnostics.Severity import org.eclipse.xtext.resource.{XtextResource, XtextResourceSet} import org.eclipse.xtext.util.CancelIndicator import org.eclipse.xtext.validation.CheckMode import org.slizaa.neo4j.opencypher.OpenCypherStandaloneSetup import org.slizaa.neo4j.opencypher.openCypher.Cypher import scala.collection.JavaConverters._ object CypherParser { def parseFile(fileName: String): Cypher = { Logger.getLogger("org.eclipse.xtext").setLevel(Level.ERROR) // https://typefox.io/how-and-why-use-xtext-without-the-ide val injector = new OpenCypherStandaloneSetup().createInjectorAndDoEMFRegistration() val resourceSet = injector.getInstance(classOf[XtextResourceSet]) val filePath = "../queries/" + fileName + ".cypher" val resource = resourceSet.getResource(URI.createFileURI(filePath), true) validateAndThrowError(resource) resource.getContents.get(0).asInstanceOf[Cypher] } def parseString(queryString: String): Cypher = { Logger.getLogger("org.eclipse.xtext").setLevel(Level.ERROR) // https://wiki.eclipse.org/Xtext/FAQ val injector = new OpenCypherStandaloneSetup().createInjectorAndDoEMFRegistration() val resourceSet = injector.getInstance(classOf[XtextResourceSet]) val resource = resourceSet.createResource(URI.createURI("http:/example.cypher")) val in = new ByteArrayInputStream(queryString.getBytes()) resource.load(in, resourceSet.getLoadOptions()) validateAndThrowError(resource) resource.getContents.get(0).asInstanceOf[Cypher] } def validateAndThrowError(resource: Resource) { var seenError = false var firstError: String = null val validator = resource.asInstanceOf[XtextResource].getResourceServiceProvider.getResourceValidator val issues = validator.validate(resource, CheckMode.ALL, CancelIndicator.NullImpl).asScala for (issue <- issues) { if (issue.getSeverity == Severity.ERROR && !seenError) { seenError = true firstError = issue.getMessage } println(issue.getMessage) } if (seenError) { throw new CompilerException(s"Error during cypher parse, the first error was: ${firstError}") } } }
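A one-line usage sketch of parsing an in-memory query with the API shown above; the query text is illustrative:

val ast = CypherParser.parseString("MATCH (n:Person) RETURN n.name")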
Example 161
Source File: LocalSparkContext.scala From sandpiper with Apache License 2.0 | 5 votes |
package sparkle.util

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.GraphXUtils
import org.apache.spark.{SparkConf, SparkContext}

// The excerpt elides the enclosing declaration; an object named after the file is assumed here.
object LocalSparkContext {

  def withSpark[T](f: SparkContext => T): T = {
    val conf = new SparkConf()
    GraphXUtils.registerKryoClasses(conf)
    val sc = new SparkContext("local", "test", conf)
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    try {
      f(sc)
    } finally {
      sc.stop()
    }
  }
}
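A usage sketch for the loan-pattern helper, assuming the enclosing object is named LocalSparkContext as reconstructed above:

val total = LocalSparkContext.withSpark { sc =>
  sc.parallelize(1 to 100).sum()
}
// The SparkContext is already stopped by the time the result is used.
println(s"sum = $total")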
Example 162
Source File: Driver.scala From OnlineLDA_Spark with Apache License 2.0 | 5 votes |
package com.github.yuhao.yang import java.util.Calendar import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} import scala.collection.mutable.ArrayBuffer object Driver extends Serializable{ def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) val inputDir = args(0) val filePaths = extractPaths(inputDir + "texts", true) val stopWordsPath = inputDir + "stop.txt" val vocabPath = inputDir + "wordsEn.txt" println("begin: " + Calendar.getInstance().getTime) println("path size: " + filePaths.size) assert(filePaths.size > 0) val conf = new SparkConf().setAppName("online LDA Spark") val sc = new SparkContext(conf) val vocab = Docs2Vec.extractVocab(sc, Seq(vocabPath), stopWordsPath) val vocabArray = vocab.map(_.swap) val K = args(1).toInt // val lda = OnlineLDA_Spark.runBatchMode(sc, filePaths, vocab, K, 50) val lda = OnlineLDA_Spark.runOnlineMode(sc, filePaths, vocab, K, args(2).toInt) println("_lambda:") for(row <- 0 until lda._lambda.rows){ val v = lda._lambda(row, ::).t val topk = lda._lambda(row, ::).t.argtopk(10) val pairs = topk.map(k => (vocabArray(k), v(k))) val sorted = pairs.sortBy(_._2).reverse println(sorted.map(x => (x._1)).mkString(","), sorted.map(x => ("%2.2f".format(x._2))).mkString(",")) } println("end: " + Calendar.getInstance().getTime()) } def extractPaths(path: String, recursive: Boolean = true): Array[String] ={ val docsets = ArrayBuffer[String]() val fileList = new java.io.File(path).listFiles() if(fileList == null) return docsets.toArray for(f <- fileList){ if(f.isDirectory){ if(recursive) docsets ++= extractPaths(f.getAbsolutePath, true) } else{ docsets += f.getAbsolutePath } } docsets.toArray } }
Example 163
Source File: LinearRegression.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater} spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } } // scalastyle:on println
Example 164
Source File: SparseNaiveBayes.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 165
Source File: DenseKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 166
Source File: StreamingExamples.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.spark.Logging

import org.apache.log4j.{Level, Logger}

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 167
Source File: YarnScheduler.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 168
Source File: ClientArguments.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.net.{URI, URISyntaxException} import scala.collection.mutable.ListBuffer import org.apache.log4j.Level import org.apache.spark.util.{IntParam, MemoryParam, Utils} private def printUsageAndExit(exitCode: Int) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. val usage = s""" |Usage: DriverClient [options] launch <active-master> <jar-url> <main-class> [driver options] |Usage: DriverClient kill <active-master> <driver-id> | |Options: | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin // scalastyle:off println System.err.println(usage) // scalastyle:on println System.exit(exitCode) } } private[deploy] object ClientArguments { val DEFAULT_CORES = 1 val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // MB val DEFAULT_SUPERVISE = false def isValidJarUrl(s: String): Boolean = { try { val uri = new URI(s) uri.getScheme != null && uri.getPath != null && uri.getPath.endsWith(".jar") } catch { case _: URISyntaxException => false } } }
Example 169
Source File: SparkSessionTestWrapper.scala From spark-stringmetric with MIT License | 5 votes |
package com.github.mrpowers.spark.stringmetric

import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Logger, Level}

trait SparkSessionTestWrapper {

  lazy val spark: SparkSession = {
    Logger.getLogger("org").setLevel(Level.OFF)
    SparkSession
      .builder()
      .master("local")
      .appName("spark session")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()
  }
}
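A minimal ScalaTest sketch that mixes the trait in; the suite name and the assertion are illustrative, and ScalaTest's FunSuite is assumed to be on the test classpath:

import org.scalatest.FunSuite

class WordCountSpec extends FunSuite with SparkSessionTestWrapper {

  test("groups duplicate words") {
    import spark.implicits._
    val df = Seq("hello", "hello", "world").toDF("word")
    // Two distinct words after grouping; logging from the "org" loggers is silenced by the trait.
    assert(df.groupBy("word").count().count() == 2)
  }
}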
Example 170
Source File: SparkLocal.scala From parquet-index with Apache License 2.0 | 5 votes |
package com.github.lightcopy.testutil

import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

  private def localConf: SparkConf = {
    new SparkConf().
      setMaster("local[4]").
      setAppName("spark-local-test").
      set("spark.driver.memory", "1g").
      set("spark.executor.memory", "2g")
  }

  override def createSparkSession(): SparkSession = {
    SparkSession.builder().config(localConf).getOrCreate()
  }
}
Example 171
Source File: SQLAggregationScala.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.examples.twitter import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, usage} import org.infinispan.spark.rdd.InfinispanRDD object SQLAggregationScala { def main(args: Array[String]) { if (args.length < 1) { usage("SQLAggregationScala") } Logger.getLogger("org").setLevel(Level.WARN) val infinispanHost = args(0) // Reduce the log level in the driver Logger.getLogger("org").setLevel(Level.WARN) // Create Spark Context val conf = getSparkConf("spark-infinispan-rdd-aggregation-scala") val sc = new SparkContext(conf) // Populate infinispan properties val config = Sample.getConnectorConf(infinispanHost) // Create RDD from infinispan data val infinispanRDD = new InfinispanRDD[Long, Tweet](sc, config) // Create a SQLContext, register a data frame and a temp table val valuesRDD = infinispanRDD.values val sparkSession = SparkSession.builder().config(conf).getOrCreate() val dataFrame = sparkSession.createDataFrame(valuesRDD, classOf[Tweet]) dataFrame.createOrReplaceTempView("tweets") // Run the Query, collect and print results sparkSession.sql("SELECT country, count(*) as c from tweets WHERE country != 'N/A' GROUP BY country ORDER BY c desc") .collect().take(20).foreach(println) } }
Example 172
Source File: StreamConsumerScala.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.examples.twitter import java.util.concurrent.{Executors, TimeUnit} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.streaming.{Seconds, StreamingContext} import org.infinispan.client.hotrod.RemoteCacheManager import org.infinispan.client.hotrod.configuration.ConfigurationBuilder import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, runAndExit, usageStream} import org.infinispan.spark.examples.util.TwitterDStream import org.infinispan.spark.stream._ import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps object StreamConsumerScala { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) if (args.length < 2) { usageStream("StreamConsumerScala") } val infinispanHost = args(0) val duration = args(1).toLong * 1000 val conf = getSparkConf("spark-infinispan-stream-consumer-scala") val sparkContext = new SparkContext(conf) val streamingContext = new StreamingContext(sparkContext, Seconds(1)) val config = Sample.getConnectorConf(infinispanHost) val remoteCacheManager = new RemoteCacheManager(new ConfigurationBuilder().withProperties(config.getHotRodClientProperties).build()) val cache = remoteCacheManager.getCache[Long, Tweet]("default") val twitterDStream = TwitterDStream.create(streamingContext) val keyValueTweetStream = twitterDStream.map(s => (s.getId, s)) keyValueTweetStream.writeToInfinispan(config) Repeat.every(5 seconds, { val keySet = cache.keySet() val maxKey = keySet.asScala.max println(s"${keySet.size} tweets inserted in the cache") println(s"Last tweet:${Option(cache.get(maxKey)).map(_.getText).getOrElse("<no tweets received so far>")}") println() }) runAndExit(streamingContext, duration) } object Repeat { def every(d: Duration, code: => Unit) = Executors.newSingleThreadScheduledExecutor.scheduleWithFixedDelay(new Runnable { override def run(): Unit = code }, 10, d.toSeconds, TimeUnit.SECONDS) } }
Example 173
Source File: CallRecordGeneratorIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator import java.sql.Timestamp import scala.util.Random import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, SparkSession } import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.LongType import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.sql.SQLImplicits._ import carly.data.CallRecord import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.log4j.{ Level, Logger } case class Rate(timestamp: Timestamp, value: Long) class CallRecordGeneratorIngress extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) val out = AvroOutlet[CallRecord]("out", _.user) val shape = StreamletShape(out) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = RecordsPerSecond.value override def buildStreamingQueries = { val outStream = DataGenerator.mkData(super.session, recordsPerSecond) writeStream(outStream, out, OutputMode.Append).toQueryExecution } } } object DataGenerator { def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = { // do we need to expose this through configuration? val MaxTime = 2.hours.toMillis val MaxUsers = 100000 val TS0 = new java.sql.Timestamp(0) val ZeroTimestampProb = 0.05 // error rate // Random Data Generator val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers)) val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing") // Time-biased randomized filter - 1/2 hour cycles val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI) val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng)) val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒ if (rng < ZeroTimestampProb) { TS0 } else { ts } } val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand()) val sampledData = randomDataset .where(timeFilterUdf($"timestamp", $"rng")) .withColumn("user", usersUdf()) .withColumn("other", usersUdf()) .withColumn("direction", directionUdf()) .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType)) .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng")) .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp")) .as[CallRecord] sampledData } }
Example 174
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.streaming.OutputMode import cloudflow.spark.sql.SQLImplicits._ import org.apache.log4j.{ Level, Logger } import carly.data._ class CallStatsAggregator extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) //tag::docs-schemaAware-example[] val in = AvroInlet[CallRecord]("in") val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString) val shape = StreamletShape(in, out) //end::docs-schemaAware-example[] val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute")) val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute")) override def configParameters = Vector(GroupByWindow, Watermark) override def createLogic = new SparkStreamletLogic { val watermark = Watermark.value val groupByWindow = GroupByWindow.value // val t0 = System.currentTimeMillis() // serialization error! //tag::docs-aggregationQuery-example[] override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Update).toQueryExecution } private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", s"${watermark.toMillis()} milliseconds") .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds")) .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration")) .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType)) query .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration") .as[AggregatedCallStats] } //end::docs-aggregationQuery-example[] } }
Example 175
Source File: CallAggregatorConsoleEgress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import cloudflow.spark.sql.SQLImplicits._ import org.apache.spark.sql.streaming.OutputMode import org.apache.log4j.{ Level, Logger } import carly.data._ class CallAggregatorConsoleEgress extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val in = AvroInlet[AggregatedCallStats]("in") val shape = StreamletShape(in) override def createLogic = new SparkStreamletLogic { override def buildStreamingQueries = readStream(in).writeStream .format("console") .outputMode(OutputMode.Append()) .start() .toQueryExecution } }
Example 176
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.callrecordaggregator import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.streaming.OutputMode import cloudflow.spark.sql.SQLImplicits._ import org.apache.log4j.{ Level, Logger } import carly.data._ class CallStatsAggregator extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) //tag::docs-schemaAware-example[] val in = AvroInlet[CallRecord]("in") val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString) val shape = StreamletShape(in, out) //end::docs-schemaAware-example[] val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute")) val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute")) override def configParameters = Vector(GroupByWindow, Watermark) override def createLogic = new SparkStreamletLogic { val watermark = Watermark.value val groupByWindow = GroupByWindow.value //tag::docs-aggregationQuery-example[] override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Update).toQueryExecution } //end::docs-aggregationQuery-example[] private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", s"${watermark.toMillis()} milliseconds") .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds")) .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration")) .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType)) query .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration") .as[AggregatedCallStats] } } }
Example 177
Source File: SparkFunSuite.scala From spark-ranking-algorithms with Apache License 2.0 | 5 votes |
package org.apache.spark

// scalastyle:off
import org.scalatest.{Outcome, FunSuite}
import org.apache.log4j.{Level, Logger}

// The excerpt elides the enclosing declaration; Spark's own test base class shape is assumed here.
abstract class SparkFunSuite extends FunSuite with Logging {

  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)
      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }
}
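A sketch of a concrete suite built on the fixture, assuming the base class reconstructed above; the suite and test names are illustrative. withFixture silences the "org" and "akka" loggers while each test body runs:

class RankingMetricsSuite extends SparkFunSuite {

  test("sorts scores in descending order") {
    val scores = Seq(0.2, 0.9, 0.5)
    assert(scores.sortBy(-_) == Seq(0.9, 0.5, 0.2))
  }
}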
Example 178
Source File: CSVProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers import io.gzet.profilers.field.{CardinalityProfiler, EmptinessProfiler, MaskBasedProfiler, PredefinedMasks} import io.gzet.profilers.raw.{AsciiProfiler, RowProfiler, StructuralProfiler} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Dataset, SparkSession} import org.elasticsearch.spark.sql._ object CSVProfiler { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("org").setLevel(Level.WARN) val HEADER = Array( "rowId", "firstName", "lastName", "email", "gender", "ipAddress", "shaPass" ) def main(args: Array[String]) { val spark = SparkSession.builder().appName("Profiler").getOrCreate() import spark.implicits._ val rawDf: Dataset[String] = spark.read.text(args.head).map(_.getAs[String](0)) rawDf.cache() rawDf.count() val tabDf: Dataset[Array[String]] = Utils.split(rawDf, delimiter = ",") val sources = spark.sparkContext.broadcast(rawDf.inputFiles) val ingestTime = spark.sparkContext.broadcast(new java.util.Date().getTime) val headers = spark.sparkContext.broadcast(HEADER.zipWithIndex.map(_.swap).toMap) RowProfiler.apply().profile(rawDf).map({ report => ("row.count", report.metricValue, Map[String, String]()) }).union(AsciiProfiler.apply().profile(rawDf).map({ report => ("row.ascii", report.metricValue, Map(Tags.ASCII_NAME -> report.ascii, Tags.ASCII_BINARY -> report.binary)) })).union(StructuralProfiler.apply(delimiter = ",").profile(rawDf).map({ report => ("field.count", report.metricValue, Map(Tags.EXTRA -> report.description, Tags.FIELD_COUNT -> report.fields.toString)) })).union(EmptinessProfiler.apply().profile(tabDf).map({ report => ("field.emptiness", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString)) })).union(CardinalityProfiler.apply(topN = 5).profile(tabDf).map({ report => ("field.cardinality", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_LOWGRAIN).profile(tabDf).map({ report => ("field.ascii.low", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_HIGHGRAIN).profile(tabDf).map({ report => ("field.ascii.high", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.POP_CHECKS).profile(tabDf).map({ report => ("field.pop.check", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.CLASS_FREQS).profile(tabDf).map({ report => ("field.class.freq", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).map({ case (metricName, metricValue, tags) => val newTags = { if (tags.contains(Tags.FIELD_IDX)) { val fieldIdx = tags.get(Tags.FIELD_IDX).get.toInt val fieldName = headers.value.getOrElse(fieldIdx, "NA") tags ++ Map(Tags.FIELD_NAME -> fieldName) } else { tags } } ReportBuilder.create .withName(metricName) .withMetric(metricValue) .withSources(sources.value) .withTime(ingestTime.value) .withTags(newTags) .build }).toDF().saveToEs("profiler/mock") } }
Example 179
Source File: GzetLoader.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import com.typesafe.config.ConfigFactory import io.gzet.community.accumulo.{AccumuloLoader, AccumuloConfig} import io.gzet.community.elasticsearch.{ESReader, ESConfig} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object GzetLoader { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]) = { val spark = SparkSession.builder() .appName("communities-loader") .getOrCreate() val sc = spark.sparkContext val blacklist = args.mkString(" ").split(",").map(_.trim).toSet val config = ConfigFactory.load() val esField = config.getString("io.gzet.elasticsearch.field") val esConf = ESConfig( config.getString("io.gzet.elasticsearch.nodes"), config.getInt("io.gzet.elasticsearch.port"), config.getString("io.gzet.elasticsearch.index") ) val accumuloTable = config.getString("io.gzet.accumulo.table") val accumuloConf = AccumuloConfig( config.getString("io.gzet.accumulo.instance"), config.getString("io.gzet.accumulo.user"), config.getString("io.gzet.accumulo.password"), config.getString("io.gzet.accumulo.zookeeper") ) val reader = new ESReader(esConf) val personsRdd = reader.loadPersons(sc, esField) personsRdd.cache() val writer = new AccumuloLoader(accumuloConf) writer.persist(sc, accumuloTable, personsRdd, blacklist) } }
Example 180
Source File: ESReaderIT.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import io.gzet.community.elasticsearch.{ESReader, ESConfig} import io.gzet.test.SparkFunSuite import org.apache.log4j.{Level, Logger} class ESReaderIT extends SparkFunSuite { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) localTest("Read from ES") { spark => val sc = spark.sparkContext val esConf = ESConfig("localhost", 9200, "gzet/articles") val esField = "persons" val reader = new ESReader(esConf) val esQuery = "?q=persons:'David Bowie'" val tuples = reader.loadPersons(sc, esField, esQuery) tuples.cache assert(tuples.count() > 0L) tuples.take(100).foreach(println) } }
Example 181
Source File: AccumuloIT.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import io.gzet.community.accumulo.{AccumuloAuthorization, AccumuloReader, AccumuloLoader, AccumuloConfig} import io.gzet.test.SparkFunSuite import org.apache.log4j.{Level, Logger} class AccumuloIT extends SparkFunSuite { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) localTest("Write to Accumulo, Read from Accumulo") { spark => val sc = spark.sparkContext val accumuloConf = AccumuloConfig("ACCUMULO_INSTANCE", "root", "secret", "localhost:2181") val accumuloTable = "persons" val writer = new AccumuloLoader(accumuloConf) val persisted = sc.parallelize(Seq(("Antoine Amend", "Matthew Hallett"))) writer.persist(sc, accumuloTable, persisted) val reader = new AccumuloReader(accumuloConf) val retrieved = reader.read(sc, accumuloTable) retrieved.cache() val filtered = retrieved.filter(_.getSourceVertex == "Antoine Amend") filtered.cache() filtered.count should be(1L) filtered.map(_.getDestVertex).first() should be("Matthew Hallett") filtered.map(_.getCount).first() should be(1L) filtered.map(_.toString).take(1).foreach(println) writer.persist(sc, accumuloTable, persisted) val retrieved2 = reader.read(sc, accumuloTable) val filtered2 = retrieved2.filter(_.getSourceVertex == "Antoine Amend") filtered2.cache() filtered2.count should be(1L) filtered2.map(_.getDestVertex).first() should be("Matthew Hallett") filtered2.map(_.getCount).first() should be(2L) filtered2.map(_.toString).take(1).foreach(println) } localTest("Row security") { spark => val accumuloConf = AccumuloConfig("ACCUMULO_INSTANCE", "root", "secret", "localhost:2181") val accumuloTable = "security" val sc = spark.sparkContext val writer = new AccumuloLoader(accumuloConf) val persisted = sc.parallelize( Seq( ("Antoine Amend", "Matthew Hallett"), ("Matthew Hallett", "Antoine Amend"), ("Antoine", "Matthew Hallett")) ) writer.persist(sc, accumuloTable, persisted, Set("Antoine Amend")) println("WITH UNRESTRICTED ACCESS") val reader1 = new AccumuloReader(accumuloConf) val retrieved1 = reader1.read(sc, accumuloTable, Some(AccumuloAuthorization.BLACKLIST)) retrieved1.cache() retrieved1.map(_.toString).foreach(println) assert(retrieved1.count() === 3) println("WITH RESTRICTED ACCESS") val reader2 = new AccumuloReader(accumuloConf) val retrieved2 = reader2.read(sc, accumuloTable) retrieved2.cache() retrieved2.map(_.toString).foreach(println) assert(retrieved2.count() === 1) } }
Example 182
Source File: GzetCommunitiesTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import io.gzet.community.clustering.wcc.WCCDetection import io.gzet.test.SparkFunSuite import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Graph, Edge} import scala.io.Source class GzetCommunitiesTest extends SparkFunSuite { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) localTest("WCC communities") { spark => val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq val sc = spark.sparkContext val edges = sc.parallelize(lines).map({ line => val a = line.split(",").map(_.toLong).sorted Edge(a.head, a.last, 1L) }).distinct() val graph = Graph.fromEdges(edges, 0L) graph.triplets.take(2).foreach(println) val communities = new WCCDetection(1).run(graph, sc) communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5)) } }
Example 183
Source File: GodwinTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.graph import io.gzet.test.SparkFunSuite import org.apache.log4j.{Logger, Level} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.rdd.RDD import scala.io.Source class GodwinTest extends SparkFunSuite { Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("org").setLevel(Level.OFF) def buildEdges() = { Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => { val Array(source, target, weight) = s.split(",") Edge(source.toLong, target.toLong, weight.toDouble) }).toList } localTest("Test Random Walks") { sc => val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1) val godwin = new Godwin(Seq(16)) val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2) println(walks.map(_._1).mkString(" -> ")) walks.last._1 should be(16) } }
Example 184
Source File: myCustomLogwithClosure.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession

object myCustomLogwithClosure extends Serializable {
  def main(args: Array[String]): Unit = {
    val log = LogManager.getRootLogger

    // Everything is printed as INFO once the log level is set to INFO,
    // until the level is changed again (for example to WARN).
    log.setLevel(Level.INFO)
    log.info("Let's get started!")

    // Setting the logger level to WARN: after that, nothing below WARN is printed
    log.setLevel(Level.WARN)

    // Creating the Spark session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    // Start the computation and print the logging information
    log.warn("Started")
    val data = spark.sparkContext.parallelize(0 to 100000)
    data.foreach(i => log.info("My number" + i))
    data.collect()
    log.warn("Finished")
  }
}
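The two setLevel calls above work because log4j levels are ordered (TRACE < DEBUG < INFO < WARN < ERROR < FATAL < OFF): once the root logger sits at WARN, anything below that threshold is discarded. A small standalone sketch of that ordering, using only the log4j 1.x API already shown in these examples:

import org.apache.log4j.{Level, LogManager}

object LevelOrderingSketch extends App {
  val log = LogManager.getRootLogger
  log.setLevel(Level.WARN)

  // Level extends Priority, so levels can be compared directly.
  println(Level.INFO.isGreaterOrEqual(Level.WARN))   // false -> log.info() is suppressed
  println(Level.ERROR.isGreaterOrEqual(Level.WARN))  // true  -> log.error() still prints

  // Level.toLevel parses a level name, falling back to the supplied default
  // for unknown strings.
  println(Level.toLevel("DEBUG"))             // DEBUG
  println(Level.toLevel("bogus", Level.WARN)) // WARN
}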
Example 185
Source File: MakingTaskSerilazible.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting import org.apache.spark.sql.SparkSession import org.apache.log4j.LogManager import org.apache.log4j.Level import org.apache.log4j.Logger class MultiplicaitonOfTwoNumber { def multiply(a: Int, b: Int): Int = { val product = a * b product } } object MakingTaskSerilazible { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() val myRDD = spark.sparkContext.parallelize(0 to 100) myRDD.foreachPartition(s => { val notSerializable = new MultiplicaitonOfTwoNumber println(notSerializable.multiply(s.next(), s.next())) }) } }
Example 186
Source File: myCustomLog.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.spark.sql.SparkSession

object myCustomLogwithoutSerializable {
  def main(args: Array[String]): Unit = {
    val log = LogManager.getRootLogger

    // Everything is printed as INFO once the log level is set to INFO,
    // until the level is changed again (for example to WARN).
    log.setLevel(Level.INFO)
    log.info("Let's get started!")

    // Setting the logger level to WARN: after that, nothing below WARN is printed
    log.setLevel(Level.WARN)

    // Creating the Spark session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    // Start the computation and print the logging information
    log.warn("Started")
    spark.sparkContext.parallelize(1 to 5).foreach(println)
    log.warn("Finished")
  }
}
Example 187
Source File: myCustomLogwithClosureSerializable.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting

import org.apache.log4j.{ Level, LogManager }
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

class MyMapper(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
  def logMapper(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

// Main object
object myCustomLogwithClosureSerializable {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Testing")
      .getOrCreate()

    log.warn("Started")
    val data = spark.sparkContext.parallelize(1 to 100000)
    val mapper = MyMapper(1)
    val other = mapper.logMapper(data)
    other.collect()
    log.warn("Finished")
  }
}
Example 188
Source File: KyroRegistrationDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.{ Level, LogManager, PropertyConfigurator }
import org.apache.spark._
import org.apache.spark.rdd.RDD

// MyMapper2 must be Serializable: its instance is captured by the map closure below.
class MyMapper2(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
  def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper2 {
  def apply(n: Int): MyMapper2 = new MyMapper2(n)
}

// Main object
object KyroRegistrationDemo {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("My App")
      .setMaster("local[*]")
    conf.registerKryoClasses(Array(classOf[MyMapper2]))
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    log.warn("Started")
    val data = sc.parallelize(1 to 100000)
    val mapper = MyMapper2(10)
    val other = mapper.MyMapperDosomething(data)
    other.collect()
    log.warn("Finished")
  }
}
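Registering MyMapper2 with Kryo only affects data serialized through Kryo; Spark can additionally be told to fail fast whenever an unregistered class slips through. A sketch of that stricter configuration, reusing MyMapper2 from the example above (the application name is a placeholder):

import org.apache.spark.SparkConf

object KryoStrictConfSketch {
  def kryoConf(): SparkConf = {
    new SparkConf()
      .setAppName("Kryo registration")   // placeholder name
      .setMaster("local[*]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // With registrationRequired, serializing an unregistered class throws
      // instead of silently falling back to writing full class names.
      .set("spark.kryo.registrationRequired", "true")
      .registerKryoClasses(Array(classOf[MyMapper2]))
  }
}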
Example 189
Source File: MyLog.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger

object MyLog1 extends Serializable {
  def main(args: Array[String]): Unit = {
    // Setting the logger level to WARN
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    @transient lazy val log2 = org.apache.log4j.LogManager.getLogger("myLogger")

    // Creating the Spark context
    val conf = new SparkConf().setAppName("My App").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Start the computation and print the logging information
    // log.warn("Started")
    // val i = 0
    val data = sc.parallelize(0 to 100000)
    data.foreach(i => log.info("My number" + i))
    data.collect()
    log.warn("Finished")
  }
}
Example 190
Source File: MyLogCompleteDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.spark._
import org.apache.spark.rdd.RDD

class MyMapper(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")
  def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

// Main object
object MyLog {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("My App")
      .setMaster("local[*]")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    log.warn("Started")
    val data = sc.parallelize(1 to 100000)
    val mapper = MyMapper(1)
    val other = mapper.MyMapperDosomething(data)
    other.collect()
    log.warn("Finished")
  }
}
Example 191
Source File: SparkPredictionTrainer.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app import java.util.Properties; import java.io.File import java.io.Serializable import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.storage.StorageLevel; import org.apache.spark.streaming._ import io.nats.client.ConnectionFactory._ import java.nio.ByteBuffer import org.apache.log4j.{Level, LogManager, PropertyConfigurator} import com.logimethods.connector.nats.to_spark._ import com.logimethods.scala.connector.spark.to_nats._ import org.apache.spark.ml.classification.MultilayerPerceptronClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import java.util.function._ import java.time.{LocalDateTime, ZoneOffset} import java.time.DayOfWeek._ import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel object SparkPredictionTrainer extends App with SparkPredictionProcessor { log.setLevel(Level.WARN) val (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl) = setup(args) val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt println("STREAMING_DURATION = " + streamingDuration) new Thread(new Runnable { def run() { while( true ){ try { val data = SparkPredictionProcessor.getData(sc, THRESHOLD) val model = trainer.fit(data) model.write.overwrite.save(PREDICTION_MODEL_PATH) println("New model of size " + data.count() + " trained: " + model.uid) Thread.sleep(streamingDuration) } catch { case e: Throwable => log.error(e) } } } }).start() }
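The trainer above periodically overwrites the model at PREDICTION_MODEL_PATH (defined in the SparkPredictionProcessor trait, which is not shown here). On the scoring side the model would be reloaded with the matching load call; a minimal sketch:

import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel

object SparkPredictionConsumerSketch {
  // Reload the most recently saved model; the path would be the same
  // PREDICTION_MODEL_PATH used by the trainer above.
  def loadLatestModel(path: String): MultilayerPerceptronClassificationModel =
    MultilayerPerceptronClassificationModel.load(path)
}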
Example 192
Source File: SparkProcessor.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._

import io.nats.client.Nats._
import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._
import java.time.{LocalDateTime, ZoneOffset}

trait SparkProcessor {
  def setup(args: Array[String]) = {
    val inputSubject = args(0)
    // val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputSubject = args(1)
    // val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")
    println("Will process messages from '" + inputSubject + "' to '" + outputSubject + "'")

    val logLevel = scala.util.Properties.envOrElse("LOG_LEVEL", "INFO")
    println("LOG_LEVEL = " + logLevel)

    val targets = scala.util.Properties.envOrElse("TARGETS", "ALL")
    println("TARGETS = " + targets)

    val cassandraUrl = System.getenv("CASSANDRA_URL")
    println("CASSANDRA_URL = " + cassandraUrl)

    val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
    println("SPARK_MASTER_URL = " + sparkMasterUrl)

    val sparkCoresMax = System.getenv("SPARK_CORES_MAX")
    println("SPARK_CORES_MAX = " + sparkCoresMax)

    val conf = new SparkConf()
      .setAppName(args(2))
      .setMaster(sparkMasterUrl)
      .set("spark.cores.max", sparkCoresMax)
      .set("spark.cassandra.connection.host", cassandraUrl)
    val sc = new SparkContext(conf)

    // val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
    // val ssc = new StreamingContext(sc, new Duration(streamingDuration))
    // ssc.checkpoint("/spark/storage")

    val properties = new Properties()
    val natsUrl = System.getenv("NATS_URI")
    println("NATS_URI = " + natsUrl)
    properties.put("servers", natsUrl)
    properties.put(PROP_URL, natsUrl)

    val clusterId = System.getenv("NATS_CLUSTER_ID")

    val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")

    (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject,
      clusterId, outputNatsStreaming, natsUrl)
  }

  def dataDecoder: Array[Byte] => Tuple2[Long, Float] = bytes => {
    val buffer = ByteBuffer.wrap(bytes)
    val epoch = buffer.getLong()
    val value = buffer.getFloat()
    (epoch, value)
  }
}

trait SparkStreamingProcessor extends SparkProcessor {
  def setupStreaming(args: Array[String]) = {
    val (properties, target, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject,
      clusterId, outputNatsStreaming, natsUrl) = setup(args)

    val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
    println("STREAMING_DURATION = " + streamingDuration)

    val ssc = new StreamingContext(sc, new Duration(streamingDuration))
    // ssc.checkpoint("/spark/storage")

    (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject,
      clusterId, outputNatsStreaming, natsUrl, streamingDuration)
  }
}
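dataDecoder above fixes the wire format of the NATS payload: an 8-byte epoch followed by a 4-byte float value. For reference, a producer-side encoder would be the mirror image; this is a sketch and not part of the original trait:

import java.nio.ByteBuffer

object SmartMeterPayloadSketch {
  // Mirror of dataDecoder: 8-byte epoch (Long) followed by a 4-byte value (Float).
  def dataEncoder: Tuple2[Long, Float] => Array[Byte] = { case (epoch, value) =>
    val buffer = ByteBuffer.allocate(12)
    buffer.putLong(epoch)
    buffer.putFloat(value)
    buffer.array()
  }
}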
Example 193
Source File: SparkTemperatureProcessor.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._

import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.SomeColumns

import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._
import java.time.{LocalDateTime, ZoneOffset}

object SparkTemperatureProcessor extends App with SparkStreamingProcessor {
  val log = LogManager.getRootLogger
  log.setLevel(Level.WARN)

  val (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject,
    clusterId, outputNatsStreaming, natsUrl, streamingDuration) = setupStreaming(args)

  // Temperatures //

  val temperatures =
    if (inputNatsStreaming) {
      NatsToSparkConnector
        .receiveFromNatsStreaming(classOf[Tuple2[Long, Float]], StorageLevel.MEMORY_ONLY, clusterId)
        .withNatsURL(natsUrl)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    } else {
      NatsToSparkConnector
        .receiveFromNats(classOf[Tuple2[Long, Float]], StorageLevel.MEMORY_ONLY)
        .withProperties(properties)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    }

  // Ideally, should be the AVG
  val singleTemperature = temperatures.reduceByKey(Math.max(_, _))

  if (logLevel.contains("TEMPERATURE")) {
    singleTemperature.print()
  }

  singleTemperature.saveToCassandra("smartmeter", "temperature")

  val temperatureReport = singleTemperature.map({ case (epoch, temperature) =>
    s"""{"epoch": $epoch, "temperature": $temperature}"""
  })
  SparkToNatsConnectorPool.newPool()
    .withProperties(properties)
    .withSubjects(outputSubject) // "smartmeter.extract.temperature"
    .publishToNats(temperatureReport)

  // Start //
  ssc.start()
  ssc.awaitTermination()
}
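The inline comment above notes that the per-epoch reduction should ideally be an average rather than Math.max. A sketch of that variant on the same keyed DStream, accumulating a (sum, count) pair per epoch; it is not part of the original job:

import org.apache.spark.streaming.dstream.DStream

object TemperatureAverageSketch {
  // Average temperature per epoch instead of the maximum.
  def averageByEpoch(temperatures: DStream[(Long, Float)]): DStream[(Long, Float)] = {
    temperatures
      .mapValues(t => (t, 1))                                  // seed each value as (sum, count)
      .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))       // add sums and counts per epoch
      .mapValues { case (sum, count) => sum / count }
  }
}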
Example 194
Source File: SparkBatch.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.log4j.Logger

import org.apache.spark.sql.SparkSession

//import com.datastax.spark.connector._
//import com.datastax.spark.connector.cql.CassandraConnector

// @see http://stackoverflow.com/questions/39423131/how-to-use-cassandra-context-in-spark-2-0
// @see https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html
// @see https://dzone.com/articles/cassandra-with-spark-20-building-rest-api
object SparkBatch extends App {
  val logLevel = System.getenv("APP_BATCH_LOG_LEVEL")
  println("APP_BATCH_LOG_LEVEL = " + logLevel)
  if ("DEBUG" != logLevel) {
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  val cassandraUrl = System.getenv("CASSANDRA_URL")
  println("CASSANDRA_URL = " + cassandraUrl)

  val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
  println("SPARK_MASTER_URL = " + sparkMasterUrl)

  val spark = SparkSession
    .builder()
    .master(sparkMasterUrl)
    .appName("Smartmeter Batch")
    .config("spark.cassandra.connection.host", cassandraUrl)
    // .config("spark.sql.warehouse.dir", warehouseLocation)
    // .enableHiveSupport()
    .getOrCreate()

  spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "raw_data"))
    .load
    .createOrReplaceTempView("raw_data")

  val rawVoltageData = spark.sql("select * from raw_data")
  rawVoltageData.show(10)

  // NOTE: the aggregation that produces avgByTransformer is not shown in this excerpt;
  // the query below is an assumed placeholder (the column names are guesses) so the
  // example reads end to end.
  val avgByTransformer = spark.sql(
    "select transformer, avg(voltage) as voltage from raw_data group by transformer")

  // @see http://stackoverflow.com/questions/40324153/what-is-the-best-way-to-insert-update-rows-in-cassandra-table-via-java-spark
  // Save data to Cassandra
  import org.apache.spark.sql.SaveMode
  avgByTransformer
    .write
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "avg_voltage_by_transformer"))
    .mode(SaveMode.Overwrite)
    .save()
}
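The batch job above treats APP_BATCH_LOG_LEVEL as an all-or-nothing switch. If the variable should map onto an actual log4j level instead, Level.toLevel does the conversion with a default for missing or unrecognised values; a small sketch reusing the same variable name:

import org.apache.log4j.{Level, Logger}

object BatchLogLevelSketch {
  def applyLogLevel(): Unit = {
    val requested = System.getenv("APP_BATCH_LOG_LEVEL")
    // Null or unknown values fall back to the supplied default (WARN here).
    val level = Level.toLevel(requested, Level.WARN)
    Logger.getLogger("org").setLevel(level)
    Logger.getLogger("akka").setLevel(level)
  }
}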
Example 195
Source File: Log4jAppender.scala From rollbar-scala with MIT License | 5 votes |
package com.storecove.rollbar.appenders import org.apache.log4j.helpers.LogLog import org.apache.log4j.spi.{LoggingEvent, ThrowableInformation} import org.apache.log4j.{AppenderSkeleton, Level} class Log4jAppender extends AppenderSkeleton with AbstractAppender { override def append(event: LoggingEvent): Unit = { if (enabled) { try { logBuffer.enqueueFinite(this.layout.format(event).trim, limit) if (event.getLevel.isGreaterOrEqual(notifyLevel)) { val hasThrowable = event.getThrowableInformation != null || event.getMessage.isInstanceOf[Throwable] if (!onlyThrowable || hasThrowable) { rollbarNotifier.notify(event.getLevel.toString, event.getMessage.toString, getThrowable(event), getMDCContext) } } } catch { case e: Exception => val stackTrace = e.getStackTrace.map(trace => trace.toString).mkString("\n") LogLog.error("error=" + e.getClass.getName + " with message=" + e.getMessage + "\n" + stackTrace) } } } override def requiresLayout(): Boolean = true override def close(): Unit = {} override def activateOptions(): Unit = { if (this.apiKey == null || this.apiKey.isEmpty) { println("No apiKey set for the appender named [" + getName + "].") } else if (this.environment == null || this.environment.isEmpty) { println("No environment set for the appender named [" + getName + "].") } else { println(s"PARAMETERS SET\n\n$apiKey / $environment\n") super.activateOptions() } } protected def getThrowable(event: LoggingEvent): Option[Throwable] = { event.getThrowableInformation match { case throwableInfo: ThrowableInformation => Some(throwableInfo.getThrowable) case _ => event.getMessage match { case throwable: Throwable => Some(throwable) case _ => None } } } override protected def notifyLevel: Level = Level.toLevel(notifyLevelString) def setNotifyLevel(notifyLevel: String): Unit = notifyLevelString = notifyLevel }
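For completeness, a sketch of attaching this appender to the root logger programmatically rather than through a log4j.properties file. Only methods that appear above or belong to the stock log4j 1.x API are used; the Rollbar apiKey and environment are assumed to be configured on the AbstractAppender side, which this example does not show:

import org.apache.log4j.{Level, Logger, PatternLayout}
import com.storecove.rollbar.appenders.Log4jAppender

object RollbarAppenderWiring {
  def install(): Unit = {
    val appender = new Log4jAppender()
    appender.setName("rollbar")
    appender.setLayout(new PatternLayout("%d{ISO8601} %-5p %c - %m%n"))
    appender.setNotifyLevel("ERROR")   // converted to a Level via Level.toLevel above
    // apiKey / environment would be set through the AbstractAppender configuration
    // (not shown in this example) before activateOptions() is called.
    appender.activateOptions()

    val root = Logger.getRootLogger
    root.setLevel(Level.INFO)
    root.addAppender(appender)
  }
}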
Example 196
Source File: SparkTestContext.scala From scalable-deeplearning with Apache License 2.0 | 5 votes |
package scaladl.util import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.SparkContext import org.scalatest.{BeforeAndAfterAll, Suite} trait SparkTestContext extends BeforeAndAfterAll { self: Suite => @transient var spark: SparkSession = _ @transient var sc: SparkContext = _ @transient var checkpointDir: String = _ override def beforeAll() { super.beforeAll() spark = SparkSession.builder .master("local[2]") .appName("MLlibUnitTest") .config("spark.sql.warehouse.dir", "warehouse-temp") .getOrCreate() sc = spark.sparkContext Logger.getLogger("org").setLevel(Level.WARN) } override def afterAll() { try { SparkSession.clearActiveSession() if (spark != null) { spark.stop() } spark = null } finally { super.afterAll() } } }
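A usage sketch for the trait: mixing it into a ScalaTest suite gives each test a shared SparkSession and SparkContext with framework logging already reduced to WARN. The suite name and assertions below are hypothetical:

import org.scalatest.FunSuite
import scaladl.util.SparkTestContext

class SparkTestContextUsageSuite extends FunSuite with SparkTestContext {
  test("the shared SparkContext is available inside tests") {
    // spark and sc are initialised in beforeAll() and torn down in afterAll().
    assert(sc.parallelize(1 to 3).count() === 3L)
    assert(spark.range(5).count() === 5L)
  }
}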
Example 197
Source File: SparkTransformerBenchmark.scala From mleap with Apache License 2.0 | 5 votes |
package com.truecar.mleap.spark.benchmark

import java.io.{FileInputStream, File}

import com.esotericsoftware.kryo.io.Input
import com.truecar.mleap.runtime.LocalLeapFrame
import com.truecar.mleap.spark.benchmark.util.SparkSerializer
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.ml.Transformer
import org.scalameter.Bench

import scala.collection.JavaConverters._
import org.scalameter.api._
import org.scalameter.picklers.Implicits._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import com.truecar.mleap.spark.MleapSparkSupport._
import spray.json._
import com.truecar.mleap.serialization.mleap.v1.MleapJsonSupport._

object SparkTransformerBenchmark extends Bench.ForkedTime {
  lazy override val executor = {
    SeparateJvmsExecutor(
      Executor.Warmer.Zero,
      Aggregator.min[Double],
      new Measurer.Default)
  }

  val classLoader = getClass.getClassLoader
  val regressionFile = new File("/tmp/spark.transformer.kryo")
  val frameFile = new File("/tmp/frame.json")

  val inputStream = new FileInputStream(regressionFile)
  val input = new Input(inputStream)

  val regression: Transformer = SparkSerializer().read(input)
  val lines = scala.io.Source.fromFile(frameFile).mkString
  val frame = lines.parseJson.convertTo[LocalLeapFrame]

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  val sparkConf = new SparkConf()
    .setAppName("Spark Transformer Benchmark")
    .setMaster("local[1]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)

  val rdd = frame.dataset.data.map(a => Row(a.toSeq: _*)).toList.asJava
  val schema = frame.schema.toSpark
  val sparkFrame = sqlContext.createDataFrame(rdd, schema)

  val ranges = for {
    size <- Gen.range("size")(1000, 10000, 1000)
  } yield 0 until size

  measure method "transform" in {
    using(ranges) in { size =>
      size.foreach { _ =>
        regression.transform(sparkFrame).head
      }
    }
  }

  // sc.stop()
}
Example 198
Source File: package.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com import org.apache.kafka.common.serialization.StringDeserializer import org.apache.spark.streaming.{Seconds, StreamingContext} package object example { def setupLogging(): Unit = { import org.apache.log4j.{Level, Logger} val rootLogger = Logger.getRootLogger rootLogger.setLevel(Level.ERROR) } def kafkaParams = Map[String, Object]( "bootstrap.servers" -> "127.0.0.1:9092", "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> "mygroup1", "auto.offset.reset" -> "latest", "enable.auto.commit" -> (false: java.lang.Boolean) ) def launchWithCheckpointing(logic: StreamingContext => Unit, appName:String, checkpointPath:String): Unit = { val streamingContext = new StreamingContext("local[*]", appName, Seconds(2)) setupLogging() logic.apply(streamingContext) streamingContext.checkpoint(checkpointPath) streamingContext.start() streamingContext.awaitTermination() } def launchWithItself(logic: StreamingContext => Unit, appName:String): Unit = { val streamingContext = new StreamingContext("local[*]", appName, Seconds(2)) setupLogging() logic.apply(streamingContext) streamingContext.start() streamingContext.awaitTermination() } }
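A sketch of an application driving these helpers end to end. The topic name and checkpoint path are placeholders, and the stream is built with the spark-streaming-kafka-0-10 API that the kafkaParams map above targets:

import com.example.{kafkaParams, launchWithCheckpointing}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object KafkaWordCountApp extends App {
  launchWithCheckpointing(ssc => {
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("sample_topic"), kafkaParams))

    // Count words per micro-batch from the record values.
    stream.map(_.value)
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .print()
  }, appName = "kafka-word-count", checkpointPath = "/tmp/kafka-word-count-checkpoint")
}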
Example 199
Source File: KafkaFlowExampleTest.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.flow import org.apache.log4j.{Level, Logger} import org.specs2.mutable._ import scala.Predef.{conforms => _} class KafkaFlowExampleTest extends Specification { Logger.getRootLogger.setLevel(Level.ERROR) sequential "the transformStream method" should { implicit val fun = KafkaFlowExample.transformStream _ "with 10 identical records" should { val records = Seq.fill(10)("""{"item_id":"abc123","amount":1.23,"time":1431504603105}""") "return a single record with the correct total" in new SparkStreamingSpec(records) { collector.length mustEqual 1 val output = collector.head output.total mustEqual BigDecimal(12.3) } } "with invalid records" should { val records = Seq("this is not json", """{"this":"isn't in the right format"}""") "output no records" in new SparkStreamingSpec(records) { collector.length mustEqual 0 } } } }
Example 200
Source File: package.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com import org.apache.spark.streaming.{Seconds, StreamingContext} package object example { def setupLogging(): Unit = { import org.apache.log4j.{Level, Logger} val rootLogger = Logger.getRootLogger rootLogger.setLevel(Level.ERROR) } def launch(logic: StreamingContext => Unit, appName:String, checkpointPath:String): Unit = { val streamingContext = new StreamingContext("local[*]", appName, Seconds(2)) setupLogging() logic.apply(streamingContext) streamingContext.checkpoint(checkpointPath) streamingContext.start() streamingContext.awaitTermination() } }