org.apache.log4j.Logger Scala Examples
The following examples show how to use org.apache.log4j.Logger in Scala. Each example is taken from an open-source project; the source file, project, license, and vote count are noted above the code.
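Before the project examples, here is a minimal, self-contained sketch of the pattern that recurs throughout this page: obtaining a log4j Logger for the current class, adjusting log levels, and emitting messages. The object name and the log messages are illustrative only.

import org.apache.log4j.{Level, Logger}

object LoggerUsageSketch {
  // Obtain a logger named after the current class, as most examples below do.
  val logger: Logger = Logger.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // Silence noisy third-party packages by raising their log level...
    Logger.getLogger("org").setLevel(Level.ERROR)
    // ...or raise the root logger level for everything at once.
    Logger.getRootLogger.setLevel(Level.WARN)

    logger.info("application started")
    logger.debug("only printed when DEBUG is enabled")
  }
}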
Example 1
Source File: WortschatzParser.scala From dbpedia-spotlight-model with Apache License 2.0 | 7 votes |
package org.dbpedia.spotlight.io

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger

import scala.io.Source

object WortschatzParser {

  val LOG = Logger.getLogger(this.getClass)

  def parse(filename: String): CompactHashSet[String] = {
    parse(filename, count => true)
  }

  def parse(filename: String, minimumCount: Int): CompactHashSet[String] = {
    parse(filename, count => count > minimumCount)
  }

  def parse(filename: String, minimumCount: Int, maximumCount: Int): CompactHashSet[String] = {
    parse(filename, count => (count > minimumCount) && (count < maximumCount))
  }

  def parse(filename: String, condition: Int => Boolean): CompactHashSet[String] = {
    LOG.info(" parsing common words file ")
    // get lines, split in three fields, get the middle one (word)
    val commonWords = new CompactHashSet[String]()
    Source.fromFile(filename, "iso-8859-1").getLines.foreach(line => {
      if (line.trim() != "") {
        val fields = line.split("\\s")
        if (condition(fields(2).toInt)) commonWords.add(fields(1))
      }
    })
    commonWords
  }
}
Example 2
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 3
Source File: HttpUtil.scala From sparta with Apache License 2.0 | 6 votes |
package com.stratio.benchmark.generator.utils import org.apache.http.HttpStatus import org.apache.http.client.methods.{HttpDelete, HttpGet, HttpPost, HttpPut} import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.util.EntityUtils import org.apache.log4j.Logger import org.json4s.DefaultFormats import org.json4s.native.JsonMethods._ import scala.io.Source trait HttpUtil { private val logger = Logger.getLogger(this.getClass) def createPolicy(policyContent: String, endpoint: String)(implicit defaultFormats: DefaultFormats): String = { val policyName = (parse(policyContent) \ "name").extract[String] // If the policy exists when it launches the benchmark, it should stop and delete it. getPolicyId(policyName, endpoint) match { case Some(id) => stopPolicy(id, endpoint) deletePolicy(id, endpoint) case None => logger.debug(s"No policy with name $policyName exists in Sparta yet.") } val client = HttpClientBuilder.create().build() val post = new HttpPost(s"$endpoint/policyContext") post.setHeader("Content-type", "application/json") post.setEntity(new StringEntity(policyContent)) val response = client.execute(post) if(response.getStatusLine.getStatusCode != HttpStatus.SC_OK) throw new IllegalStateException(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") else { val entity = response.getEntity val policyId = (parse(EntityUtils.toString(entity)) \ "policyId").extract[String] policyId } } def getPolicyId(name: String, endpoint: String)(implicit defaultFormats: DefaultFormats): Option[String] = { val client = HttpClientBuilder.create().build() val get = new HttpGet(s"$endpoint/policy/findByName/$name") val response = client.execute(get) response.getStatusLine.getStatusCode match { case HttpStatus.SC_OK => Option((parse(EntityUtils.toString(response.getEntity)) \ "id").extract[String]) case _ => None } } def stopPolicy(id: String, endpoint: String): Unit = { val client = HttpClientBuilder.create().build() val put = new HttpPut(s"$endpoint/policyContext") put.setHeader("Content-Type", "application/json") val entity = new StringEntity(s"""{"id":"$id", "status":"Stopping"}""") put.setEntity(entity) val response = client.execute(put) if(response.getStatusLine.getStatusCode != HttpStatus.SC_CREATED) { logger.info(Source.fromInputStream(response.getEntity.getContent).mkString("")) logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") } } def deletePolicy(id: String, endpoint: String): Unit = { val client = HttpClientBuilder.create().build() val delete = new HttpDelete(s"$endpoint/policy/$id") val response = client.execute(delete) if(response.getStatusLine.getStatusCode != HttpStatus.SC_OK) logger.info(s"Sparta status code is not OK: ${response.getStatusLine.getStatusCode}") } }
Example 4
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 5
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 6
Source File: JDBCSink.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
 * Writes data produced by Structured Streaming into MySQL.
 */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"

    println("process step two")
    try {
      // check whether the record already exists
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      case ex: SQLException => println("SQLException")
      case ex: Exception => println("Exception")
      case ex: RuntimeException => println("RuntimeException")
      case ex: Throwable => println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    // release JDBC resources only if they were actually created
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
Example 7
Source File: MySqlPool.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, DriverManager}
import java.util

import org.apache.log4j.{LogManager, Logger}

/**
 * Obtains JDBC connections from a simple MySQL connection pool.
 */
class MySqlPool(url: String, user: String, pwd: String) extends Serializable {
  // maximum number of connections the pool may create
  private val max = 3
  // number of connections created per request
  private val connectionNum = 1
  // number of connections created so far
  private var conNum = 0
  // the pool itself
  private val pool = new util.LinkedList[Connection]()

  val LOGGER: Logger = LogManager.getLogger("vita")

  // get a connection
  def getJdbcConn(): Connection = {
    LOGGER.info("getJdbcConn")
    // synchronized block; AnyRef is the base class of all reference types, AnyVal of all value types
    AnyRef.synchronized({
      if (pool.isEmpty) {
        // load the driver
        preGetConn()
        for (i <- 1 to connectionNum) {
          val conn = DriverManager.getConnection(url, user, pwd)
          pool.push(conn)
          conNum += 1
        }
      }
      pool.poll()
    })
  }

  // release a connection back to the pool
  def releaseConn(conn: Connection): Unit = {
    pool.push(conn)
  }

  // load the driver
  private def preGetConn(): Unit = {
    // throttle connection creation
    if (conNum < max && !pool.isEmpty) {
      LOGGER.info("Jdbc Pool has no connection now, please wait a moments!")
      Thread.sleep(2000)
      preGetConn()
    } else {
      Class.forName("com.mysql.jdbc.Driver")
    }
  }
}
Example 8
Source File: StructuredStreamingOffset.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // Kafka topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init Redis
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)

    // read the last committed offset from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]
    val content = lines.map(x => readLogs(x._1, x._2.toString))
    val count = content.toDF("context", "offset")

    // foreach sink that records the Kafka offset
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 9
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils

          spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --algorithm LR --regType L2 --regParam 1.0 \ | data/mllib/sample_binary_classification_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"BinaryClassification with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val model = params.algorithm match { case LR => val algorithm = new LogisticRegressionWithLBFGS() algorithm.optimizer .setNumIterations(params.numIterations) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() case SVM => val algorithm = new SVMWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() } val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val metrics = new BinaryClassificationMetrics(predictionAndLabel) println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") sc.stop() } } // scalastyle:on println
Example 11
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 12
Source File: StreamingExamples.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Example 13
Source File: YarnScheduler.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 14
Source File: DLClassifierLeNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _} import com.intel.analytics.bigdl.dlframes.DLClassifier import com.intel.analytics.bigdl.models.lenet.LeNet5 import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.ClassNLLCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext object DLClassifierLeNet { LoggerFilter.redirectSparkInfoLogs() def main(args: Array[String]): Unit = { val inputs = Array[String]("Feature data", "Label data") trainParser.parse(args, new TrainParams()).foreach(param => { val conf = Engine.createSparkConf() .setAppName("MLPipeline Example") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) val sqLContext = SQLContext.getOrCreate(sc) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1) val trainingRDD : RDD[Data[Float]] = trainSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => { val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) }) val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*) val model = LeNet5(classNum = 10) val criterion = ClassNLLCriterion[Float]() val featureSize = Array(28, 28) val estimator = new DLClassifier[Float](model, criterion, featureSize) .setFeaturesCol(inputs(0)) .setLabelCol(inputs(1)) .setBatchSize(param.batchSize) .setMaxEpoch(param.maxEpoch) val transformer = estimator.fit(trainingDF) val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1) val validationRDD: RDD[Data[Float]] = validationSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch => val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) } val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*) val transformed = transformer.transform(validationDF) transformed.show() sc.stop() }) } } private case class Data[T](featureData : Array[T], labelData : Array[T])
Example 15
Source File: ImagePredictor.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification import java.nio.file.Paths import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dlframes.DLClassifierModel import com.intel.analytics.bigdl.example.imageclassification.MlUtils._ import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object ImagePredictor { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO) def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).map(param => { val conf = Engine.createSparkConf() conf.setAppName("Predict with trained model") val sc = new SparkContext(conf) Engine.init val sqlContext = new SQLContext(sc) val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val model = loadModel(param) val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize)) .setBatchSize(param.batchSize) .setFeaturesCol("features") .setPredictionCol("predict") val valRDD = if (param.isHdfs) { // load image set from hdfs imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true) } else { // load image set from local val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false) sc.parallelize(imagesLoad(paths, 256), partitionNum) } val transf = RowToByteRecords() -> BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize) -> BGRImgNormalizer(testMean, testStd) -> BGRImgToImageVector() val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf) valTrans.transform(valDF) .select("imageName", "predict") .collect() .take(param.showNum) .foreach(println) sc.stop() }) } }
Example 16
Source File: RowToByteRecords.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification

import com.intel.analytics.bigdl.dataset.{ByteRecord, Transformer}
import org.apache.log4j.Logger
import org.apache.spark.sql.Row

import scala.collection.Iterator

object RowToByteRecords {
  val logger = Logger.getLogger(getClass)

  def apply(colName: String = "data"): RowToByteRecords = {
    new RowToByteRecords(colName)
  }
}

class RowToByteRecords(colName: String) extends Transformer[Row, ByteRecord] {

  override def apply(prev: Iterator[Row]): Iterator[ByteRecord] = {
    prev.map(
      img => {
        ByteRecord(img.getAs[Array[Byte]](colName), -1.0f)
      }
    )
  }
}
Example 17
Source File: ImageNetInference.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8

import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object ImageNetInference {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger: Logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach(param => {
      val conf = Engine.createSparkConf()
        .setAppName("Test model on ImageNet2012 with Int8")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize)
        .toDistributed().data(train = false)

      val model = Module.loadModule[Float](param.model).quantize()
      model.evaluate()

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
}
Example 18
Source File: GenerateInt8Scales.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.mkldnn.int8 import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.models.resnet.ImageNetDataSet import com.intel.analytics.bigdl.nn.{Graph, Module} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object GenerateInt8Scales { val logger: Logger = Logger.getLogger(getClass) Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def genereateInt8Scales(model: Graph[Float], modelName: String, evaluationSet: RDD[MiniBatch[Float]]): Unit = { model.evaluate() model.setInputDimMask(0, true) model.setOutputDimMask(0, true) model.setWeightDimMask(1, true) logger.info(s"Generate the scales for $modelName ...") val samples = evaluationSet .repartition(1) // repartition (shuffle) will have better accuracy .take(1) // only split one batch to sample .map(_.getInput().toTensor[Float]) samples.foreach { sample => model.forward(sample) model.calcScales(sample) } // we should clean the state, such as output model.clearState() logger.info(s"Generate the scales for $modelName done.") } def saveQuantizedModel(model: Graph[Float], modelName: String): Unit = { val suffix = ".bigdl" val prefix = modelName.stripSuffix(suffix) val name = prefix.concat(".quantized").concat(suffix) logger.info(s"Save the quantized model $name ...") // it will force overWrite the existed model file model.saveModule(name, overWrite = true) logger.info(s"Save the quantized model $name done.") } def main(args: Array[String]): Unit = { genInt8ScalesParser.parse(args, GenInt8ScalesParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Quantize the model") .set("spark.akka.frameSize", 64.toString) .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() val imageFrame = DataSet.SeqFileFolder.filesToImageFrame(param.folder, sc, 1000, partitionNum = Option(partitionNum)) // the transformer is the same as as that in validation during training val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize).toDistributed().data(train = false) // Currently, we only support the graph model, so we add a `toGraph` // if the model is already graph, you can need not to it. val model = Module.loadModule[Float](param.model).toGraph() genereateInt8Scales(model, param.model, evaluationSet) saveQuantizedModel(model, param.model) } } }
Example 19
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.{DataSet, SampleToBatch}
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val evaluationSet = DataSet.array(load(validationData, validationLabel)) ->
        BytesToGreyImg(28, 28) ->
        GreyImgNormalizer(trainMean, trainStd) ->
        GreyImgToSample() ->
        SampleToBatch(batchSize = param.batchSize, None, None, None, partitionNum = Some(1))

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet.toLocal(),
        Array(new Top1Accuracy[Float].asInstanceOf[ValidationMethod[Float]]))
      result.foreach(r => println(s"${r._2} is ${r._1}"))
    }
  }
}
Example 20
Source File: Predict.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal

import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.dataset.Sample
import com.intel.analytics.bigdl.optim.LocalPredictor
import org.apache.log4j.{Level, Logger}

import scala.collection.mutable.ArrayBuffer

object Predict {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    predictParser.parse(args, new PredictParams()).foreach { param =>
      System.setProperty("bigdl.localMode", "true")
      System.setProperty("bigdl.coreNumber", param.coreNumber.toString)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val rawData = load(validationData, validationLabel)
      val iter = rawData.iterator
      val sampleIter = GreyImgToSample()(
        GreyImgNormalizer(trainMean, trainStd)(
          BytesToGreyImg(28, 28)(iter)))
      var samplesBuffer = ArrayBuffer[Sample[Float]]()
      while (sampleIter.hasNext) {
        val elem = sampleIter.next().clone()
        samplesBuffer += elem
      }
      val samples = samplesBuffer.toArray

      val model = Module.load[Float](param.model)
      val localPredictor = LocalPredictor(model)
      val result = localPredictor.predict(samples)
      val result_class = localPredictor.predictClass(samples)
      result_class.foreach(r => println(s"${r}"))
    }
  }
}
Example 21
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.lenetLocal import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import com.intel.analytics.bigdl.models.lenet.LeNet5 import org.apache.log4j.{Level, Logger} object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { System.setProperty("bigdl.localMode", "true") System.setProperty("bigdl.coreNumber", param.coreNumber.toString) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { LeNet5(classNum = 10) } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = ClassNLLCriterion[Float]()) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel)) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() }) } }
Example 22
Source File: Utils.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset

import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.Logger

object Utils {
  private val logger = Logger.getLogger(getClass)

  def getBatchSize(batchSize: Int, totalPartition: Option[Int] = None): Int = {
    val nodeNumber = Engine.nodeNumber()
    val partitionNum = totalPartition.getOrElse(nodeNumber)
    logger.debug(s"partition number: $partitionNum, node number: $nodeNumber")
    require(partitionNum > 0,
      s"Utils.getBatchSize: partitionNum should be larger than 0, but get $partitionNum")
    require(batchSize % partitionNum == 0,
      s"Utils.getBatchSize: total batch size $batchSize " +
        s"should be divided by partitionNum ${partitionNum}")
    val batchPerUnit = batchSize / partitionNum
    logger.debug(s"Batch per unit: $batchPerUnit")
    batchPerUnit
  }
}
Example 23
Source File: BGRImgToImageVector.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.image

import com.intel.analytics.bigdl.dataset.Transformer
import org.apache.log4j.Logger
import org.apache.spark.mllib.linalg.DenseVector

import scala.collection.Iterator

object BGRImgToImageVector {
  val logger = Logger.getLogger(getClass)

  def apply(): BGRImgToImageVector = {
    new BGRImgToImageVector()
  }
}

class BGRImgToImageVector() extends Transformer[LabeledBGRImage, DenseVector] {

  private var featureData: Array[Float] = null

  override def apply(prev: Iterator[LabeledBGRImage]): Iterator[DenseVector] = {
    prev.map(
      img => {
        if (null == featureData) {
          featureData = new Array[Float](3 * img.height() * img.width())
        }
        img.copyTo(featureData, 0, true)
        new DenseVector(featureData.map(_.toDouble))
      }
    )
  }
}
Example 24
Source File: LocalImageFiles.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.image

import java.awt.color.ColorSpace
import java.nio.file.{Files, Path}

import org.apache.log4j.Logger

object LocalImageFiles {
  Class.forName("javax.imageio.ImageIO")
  Class.forName("java.awt.color.ICC_ColorSpace")
  // Class.forName("sun.java2d.cmm.lcms.LCMS")
  ColorSpace.getInstance(ColorSpace.CS_sRGB).toRGB(Array[Float](0, 0, 0))

  val logger = Logger.getLogger(getClass)

  private[bigdl] def readPaths(path: Path, hasLabel: Boolean = true)
  : Array[LocalLabeledImagePath] = {
    if (hasLabel) readPathsWithLabel(path) else readPathsNoLabel(path)
  }
}
Example 25
Source File: FeatureTransformer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image

import com.intel.analytics.bigdl.dataset.{ChainedTransformer, Transformer}
import com.intel.analytics.bigdl.opencv.OpenCV
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger

class ChainedFeatureTransformer(first: FeatureTransformer, last: FeatureTransformer)
  extends FeatureTransformer {

  override def transform(prev: ImageFeature): ImageFeature = {
    last.transform(first.transform(prev))
  }

  override def enableIgnoreException(): this.type = {
    first.enableIgnoreException()
    last.enableIgnoreException()
    this
  }
}
Example 26
Source File: ChannelScaledNormalizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.transform.vision.image.augmentation

import com.intel.analytics.bigdl.dataset.image.LabeledBGRImage
import com.intel.analytics.bigdl.dataset.{LocalDataSet, Transformer}
import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature}
import com.intel.analytics.bigdl.transform.vision.image.opencv.OpenCVMat
import org.apache.log4j.Logger

import scala.collection.Iterator

object ChannelScaledNormalizer {
  def apply(meanR: Int, meanG: Int, meanB: Int, scale: Double): ChannelScaledNormalizer = {
    new ChannelScaledNormalizer(meanR, meanG, meanB, scale)
  }
}

class ChannelScaledNormalizer(meanR: Int, meanG: Int, meanB: Int, scale: Double)
  extends FeatureTransformer {

  override protected def transformMat(feature: ImageFeature): Unit = {
    val mat = feature.opencvMat()
    val toFloats = OpenCVMat.toFloatPixels(mat)
    val content = toFloats._1
    require(content.length % 3 == 0, "Content should be multiple of 3 channels")
    var i = 0
    val frameLength = content.length / 3
    val height = toFloats._2
    val width = toFloats._3
    val bufferContent = new Array[Float](width * height * 3)

    val channels = 3
    val mean = Array(meanR, meanG, meanB)
    var c = 0
    while (c < channels) {
      i = 0
      while (i < frameLength) {
        val data_index = c * frameLength + i
        bufferContent(data_index) = ((content(data_index) - mean(c)) * scale).toFloat
        i += 1
      }
      c += 1
    }
    if (mat != null) {
      mat.release()
    }
    val newMat = OpenCVMat.fromFloats(bufferContent, height, width)
    feature(ImageFeature.mat) = newMat
  }
}
Example 27
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl.Module
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.models.resnet.Utils._
import com.intel.analytics.bigdl.optim.{Top1Accuracy, ValidationMethod, ValidationResult}
import com.intel.analytics.bigdl.dataset.image.{BGRImgNormalizer, BGRImgToSample, BytesToBGRImg}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    testParser.parse(args, TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test ResNet on Cifar10")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(loadTest(param.folder), partitionNum)
      val transformer = BytesToBGRImg() ->
        BGRImgNormalizer(Cifar10DataSet.trainMean, Cifar10DataSet.trainStd) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      println(model)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))

      result.foreach(r => println(s"${r._2} is ${r._1}"))
      sc.stop()
    }
  }
}
Example 28
Source File: TestImageNet.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.CropCenter
import com.intel.analytics.bigdl.models.resnet.ResNet.DatasetType
import com.intel.analytics.bigdl.nn.{Module, StaticGraph}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.transform.vision.image.{ImageFeature, MTImageFeatureToBatch, MatToTensor, PixelBytesToMat}
import com.intel.analytics.bigdl.transform.vision.image.augmentation.{ChannelScaledNormalizer, RandomCropper, RandomResize}
import com.intel.analytics.bigdl.utils._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object TestImageNet {
  LoggerFilter.redirectSparkInfoLogs()
  Logger.getLogger("com.intel.analytics.bigdl.optim").setLevel(Level.INFO)

  val logger = Logger.getLogger(getClass)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Test model on ImageNet2012")
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val model = Module.loadModule[Float](param.model)
      val evaluationSet = ImageNetDataSet.valDataSet(param.folder, sc, 224, param.batchSize)
        .toDistributed().data(train = false)

      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))

      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    })
  }
}
Example 29
Source File: TrainCIFAR10.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.resnet import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Module} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.models.resnet.ResNet.{DatasetType, ShortcutType} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ object TrainCIFAR10 { LoggerFilter.redirectSparkInfoLogs() import Utils._ def cifar10Decay(epoch: Int): Double = if (epoch >= 122) 2.0 else if (epoch >= 81) 1.0 else 0.0 def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train ResNet on Cifar10") // Will throw exception without this config when has only one executor .set("spark.rpc.message.maxSize", "200") val sc = new SparkContext(conf) Engine.init val batchSize = param.batchSize val (imageSize, lrSchedule, maxEpoch, dataSet) = (32, DatasetType.CIFAR10, param.nepochs, Cifar10DataSet) val trainDataSet = dataSet.trainDataSet(param.folder, sc, imageSize, batchSize) val validateSet = dataSet.valDataSet(param.folder, sc, imageSize, batchSize) val shortcut: ShortcutType = param.shortcutType match { case "A" => ShortcutType.A case "B" => ShortcutType.B case _ => ShortcutType.C } val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { val curModel = if (param.graphModel) { ResNet.graph(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } else { ResNet(param.classes, T("shortcutType" -> shortcut, "depth" -> param.depth, "optnet" -> param.optnet)) } if (param.optnet) { ResNet.shareGradInput(curModel) } ResNet.modelInit(curModel) curModel } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0, weightDecay = param.weightDecay, momentum = param.momentum, dampening = param.dampening, nesterov = param.nesterov, learningRateSchedule = SGD.EpochDecay(cifar10Decay)) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new CrossEntropyCriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float])) .setEndWhen(Trigger.maxEpoch(maxEpoch)) .optimize() sc.stop() }) } }
Example 30
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.inception

import com.intel.analytics.bigdl.dataset.{ByteRecord, DataSet}
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.{Top1Accuracy, Top5Accuracy, Validator}
import com.intel.analytics.bigdl.utils.Engine
import org.apache.hadoop.io.Text
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Options._

  val imageSize = 224

  def main(args: Array[String]) {
    testParser.parse(args, new TestParams()).foreach { param =>
      val batchSize = param.batchSize.getOrElse(128)
      val conf = Engine.createSparkConf().setAppName("Test Inception on ImageNet")
      val sc = new SparkContext(conf)
      Engine.init

      // We set partition number to be node*core, actually you can also assign other partitionNum
      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rawData = sc.sequenceFile(param.folder, classOf[Text], classOf[Text], partitionNum)
        .map(image => {
          ByteRecord(image._2.copyBytes(), DataSet.SeqFileFolder.readLabel(image._1).toFloat)
        }).coalesce(partitionNum, true)

      val rddData = DataSet.SeqFileFolder.filesToRdd(param.folder, sc, 1000)
      val transformer = BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize, CropCenter) ->
        HFlip(0.5) -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225) -> BGRImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float], new Top5Accuracy[Float]), param.batchSize)
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
}
Example 31
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet

import java.nio.file.Paths

import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToSample}
import com.intel.analytics.bigdl.nn.Module
import com.intel.analytics.bigdl.optim.Top1Accuracy
import com.intel.analytics.bigdl.utils.Engine
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Test {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)
  Logger.getLogger("breeze").setLevel(Level.ERROR)

  import Utils._

  def main(args: Array[String]): Unit = {
    testParser.parse(args, new TestParams()).foreach { param =>
      val conf = Engine.createSparkConf().setAppName("Test Lenet on MNIST")
        .set("spark.akka.frameSize", 64.toString)
        .set("spark.task.maxFailures", "1")
      val sc = new SparkContext(conf)
      Engine.init

      val validationData = param.folder + "/t10k-images-idx3-ubyte"
      val validationLabel = param.folder + "/t10k-labels-idx1-ubyte"

      val partitionNum = Engine.nodeNumber() * Engine.coreNumber()
      val rddData = sc.parallelize(load(validationData, validationLabel), partitionNum)
      val transformer = BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()
      val evaluationSet = transformer(rddData)

      val model = Module.load[Float](param.model)
      val result = model.evaluate(evaluationSet,
        Array(new Top1Accuracy[Float]), Some(param.batchSize))
      result.foreach(r => println(s"${r._2} is ${r._1}"))

      sc.stop()
    }
  }
}
Example 32
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.lenet import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, CrossEntropyCriterion, Module} import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.utils._ import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Train { LoggerFilter.redirectSparkInfoLogs() import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf() .setAppName("Train Lenet on MNIST") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) { LeNet5.graph(classNum = 10) } else { Engine.getEngineType() match { case MklBlas => LeNet5(10) case MklDnn => LeNet5.dnnGraph(param.batchSize / Engine.nodeNumber(), 10) } } } val criterion = Engine.getEngineType() match { case MklBlas => ClassNLLCriterion() case MklDnn => CrossEntropyCriterion() } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new SGD[Float](learningRate = param.learningRate, learningRateDecay = param.learningRateDecay) } val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch( param.batchSize) val optimizer = Optimizer( model = model, dataset = trainSet, criterion = criterion) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } if(param.overWriteCheckpoint) { optimizer.overWriteCheckpoint() } val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch( param.batchSize) optimizer .setValidation( trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy, new Top5Accuracy[Float], new Loss[Float])) .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 33
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.autoencoder import java.nio.file.Paths import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch, Transformer} import com.intel.analytics.bigdl.nn.{MSECriterion, Module} import com.intel.analytics.bigdl.optim._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._ import com.intel.analytics.bigdl.utils.{Engine, OptimizerV1, OptimizerV2, T, Table} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import scala.reflect.ClassTag object toAutoencoderBatch { def apply(): toAutoencoderBatch[Float] = new toAutoencoderBatch[Float]() } class toAutoencoderBatch[T: ClassTag](implicit ev: TensorNumeric[T] )extends Transformer[MiniBatch[T], MiniBatch[T]] { override def apply(prev: Iterator[MiniBatch[T]]): Iterator[MiniBatch[T]] = { prev.map(batch => { MiniBatch(batch.getInput().toTensor[T], batch.getInput().toTensor[T]) }) } } object Train { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]): Unit = { trainParser.parse(args, new TrainParams()).map(param => { val conf = Engine.createSparkConf().setAppName("Train Autoencoder on MNIST") val sc = new SparkContext(conf) Engine.init val trainData = Paths.get(param.folder, "/train-images-idx3-ubyte") val trainLabel = Paths.get(param.folder, "/train-labels-idx1-ubyte") val trainDataSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(param.batchSize) -> toAutoencoderBatch() val model = if (param.modelSnapshot.isDefined) { Module.load[Float](param.modelSnapshot.get) } else { if (param.graphModel) Autoencoder.graph(classNum = 32) else Autoencoder(classNum = 32) } if (param.optimizerVersion.isDefined) { param.optimizerVersion.get.toLowerCase match { case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1) case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2) } } val optimMethod = if (param.stateSnapshot.isDefined) { OptimMethod.load[Float](param.stateSnapshot.get) } else { new Adagrad[Float](learningRate = 0.01, learningRateDecay = 0.0, weightDecay = 0.0005) } val optimizer = Optimizer( model = model, dataset = trainDataSet, criterion = new MSECriterion[Float]() ) if (param.checkpoint.isDefined) { optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch) } optimizer .setOptimMethod(optimMethod) .setEndWhen(Trigger.maxEpoch(param.maxEpoch)) .optimize() sc.stop() }) } }
Example 34
Source File: Test.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg import com.intel.analytics.bigdl.dataset.DataSet import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.optim.{Top1Accuracy, Validator} import com.intel.analytics.bigdl.utils.Engine import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object Test { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) Logger.getLogger("breeze").setLevel(Level.ERROR) import Utils._ def main(args: Array[String]) { testParser.parse(args, new TestParams()).foreach { param => val conf = Engine.createSparkConf().setAppName("Test Vgg on Cifar10") .set("spark.akka.frameSize", 64.toString) val sc = new SparkContext(conf) Engine.init val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val rddData = sc.parallelize(Utils.loadTest(param.folder), partitionNum) val transformer = BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) -> BGRImgToSample() val evaluationSet = transformer(rddData) val model = Module.load[Float](param.model) val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(param.batchSize)) result.foreach(r => println(s"${r._2} is ${r._1}")) sc.stop() } } }
Example 35
Source File: Train.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.models.vgg

import java.text.SimpleDateFormat
import java.util.Date

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.DataSet
import com.intel.analytics.bigdl.dataset.image._
import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric._
import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, OptimizerV1, OptimizerV2, T, Table}
import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext

object Train {
  LoggerFilter.redirectSparkInfoLogs()

  import Utils._

  def main(args: Array[String]): Unit = {
    trainParser.parse(args, new TrainParams()).map(param => {
      val conf = Engine.createSparkConf().setAppName("Train Vgg on Cifar10")
        // Will throw an exception without this config when there is only one executor
        .set("spark.rpc.message.maxSize", "200")
      val sc = new SparkContext(conf)
      Engine.init

      val trainDataSet = DataSet.array(Utils.loadTrain(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(trainMean, trainStd) ->
        BGRImgToBatch(param.batchSize)

      val model = if (param.modelSnapshot.isDefined) {
        Module.load[Float](param.modelSnapshot.get)
      } else {
        if (param.graphModel) VggForCifar10.graph(classNum = 10)
        else VggForCifar10(classNum = 10)
      }

      if (param.optimizerVersion.isDefined) {
        param.optimizerVersion.get.toLowerCase match {
          case "optimizerv1" => Engine.setOptimizerVersion(OptimizerV1)
          case "optimizerv2" => Engine.setOptimizerVersion(OptimizerV2)
        }
      }

      val optimMethod = if (param.stateSnapshot.isDefined) {
        OptimMethod.load[Float](param.stateSnapshot.get)
      } else {
        new SGD[Float](learningRate = param.learningRate, learningRateDecay = 0.0,
          weightDecay = param.weightDecay, momentum = 0.9, dampening = 0.0, nesterov = false,
          learningRateSchedule = SGD.EpochStep(25, 0.5))
      }

      val optimizer = Optimizer(
        model = model,
        dataset = trainDataSet,
        criterion = new ClassNLLCriterion[Float]()
      )

      val validateSet = DataSet.array(Utils.loadTest(param.folder), sc) ->
        BytesToBGRImg() -> BGRImgNormalizer(testMean, testStd) ->
        BGRImgToBatch(param.batchSize)

      if (param.checkpoint.isDefined) {
        optimizer.setCheckpoint(param.checkpoint.get, Trigger.everyEpoch)
      }

      if (param.overWriteCheckpoint) {
        optimizer.overWriteCheckpoint()
      }

      if (param.summaryPath.isDefined) {
        val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val timeStamp = sdf.format(new Date())
        val trainSummary = new TrainSummary(param.summaryPath.get,
          s"vgg-on-cifar10-train-$timeStamp")
        optimizer.setTrainSummary(trainSummary)
        val validationSummary = new ValidationSummary(param.summaryPath.get,
          s"vgg-on-cifar10-val-$timeStamp")
        optimizer.setValidationSummary(validationSummary)
      }

      optimizer
        .setValidation(Trigger.everyEpoch, validateSet, Array(new Top1Accuracy[Float]))
        .setOptimMethod(optimMethod)
        .setEndWhen(Trigger.maxEpoch(param.maxEpoch))
        .optimize()

      sc.stop()
    })
  }
}
Example 36
Source File: Validator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.utils.Engine import com.intel.analytics.bigdl.dataset.{DistributedDataSet, LocalDataSet, MiniBatch} import org.apache.log4j.Logger abstract class Validator[T, D]( model: Module[T], dataSet: DataSet[D] ) { def test(vMethods: Array[ValidationMethod[T]]): Array[(ValidationResult, ValidationMethod[T])] } @deprecated( "Validator(model, dataset) is deprecated. Please use model.evaluate instead", "0.2.0") object Validator { private val logger = Logger.getLogger(getClass) def apply[T, D](model: Module[T], dataset: DataSet[D]): Validator[T, D] = { logger.warn("Validator(model, dataset) is deprecated. Please use model.evaluate instead") dataset match { case d: DistributedDataSet[_] => new DistriValidator[T]( model = model, dataSet = d.asInstanceOf[DistributedDataSet[MiniBatch[T]]] ).asInstanceOf[Validator[T, D]] case d: LocalDataSet[_] => new LocalValidator[T]( model = model, dataSet = d.asInstanceOf[LocalDataSet[MiniBatch[T]]] ).asInstanceOf[Validator[T, D]] case _ => throw new UnsupportedOperationException } } }
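Validator obtains its logger with Logger.getLogger(getClass) and pairs the @deprecated annotation with a runtime logger.warn, so code that was compiled against the old API still surfaces the migration hint in its logs. A minimal sketch of that pairing, with hypothetical names:

import org.apache.log4j.Logger

object LegacyApi {
  private val logger = Logger.getLogger(getClass)

  @deprecated("Use NewApi.run instead", "0.2.0")
  def run(): Unit = {
    // Compile-time warning for new callers, log-time warning for code already deployed.
    logger.warn("LegacyApi.run is deprecated. Please use NewApi.run instead")
  }
}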
Example 37
Source File: DistriValidator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch}
import com.intel.analytics.bigdl.optim.DistriValidator._
import com.intel.analytics.bigdl.utils.{Engine, MklBlas}
import org.apache.log4j.Logger

object DistriValidator {
  val logger = Logger.getLogger(this.getClass)
}

// Class header restored from the Validator factory above, mirroring LocalValidator.
class DistriValidator[T] private[optim](
  model: Module[T],
  dataSet: DistributedDataSet[MiniBatch[T]])
  extends Validator[T, MiniBatch[T]](model, dataSet) {

  override def test(vMethods: Array[ValidationMethod[T]])
  : Array[(ValidationResult, ValidationMethod[T])] = {
    val rdd = dataSet.data(train = false)
    val broadcastModel = rdd.sparkContext.broadcast((model.evaluate(), vMethods))
    val _subModelNumber = Engine.getEngineType match {
      case MklBlas => Engine.coreNumber()
      case _ => throw new IllegalArgumentException
    }
    val nExecutor = Engine.nodeNumber()
    val executorCores = Engine.coreNumber()
    rdd.mapPartitions(dataIter => {
      Engine.setNodeAndCore(nExecutor, executorCores)
      val localModel = broadcastModel.value._1
      val localMethod = broadcastModel.value._2
      logger.info("model thread pool size is " + Engine.model.getPoolSize)
      val workingModels = (1 to _subModelNumber)
        .map(_ => localModel.cloneModule().evaluate()).toArray
      val vMethodsArr = (1 to _subModelNumber).map(i => localMethod.map(_.clone())).toArray
      dataIter.map(batch => {
        val stackSize = batch.size() / _subModelNumber
        val extraSize = batch.size() % _subModelNumber
        val parallelism = if (stackSize == 0) extraSize else _subModelNumber
        Engine.default.invokeAndWait(
          (0 until parallelism).map(b =>
            () => {
              val offset = b * stackSize + math.min(b, extraSize) + 1
              val length = stackSize + (if (b < extraSize) 1 else 0)
              val currentMiniBatch = batch.slice(offset, length)
              val input = currentMiniBatch.getInput()
              val target = currentMiniBatch.getTarget()
              val output = workingModels(b).forward(input)
              val validatMethods = vMethodsArr(b)
              validatMethods.map(validation => {
                validation(output, target)
              })
            }
          )
        ).reduce((left, right) => {
          left.zip(right).map { case (l, r) => l + r }
        })
      })
    }).reduce((left, right) => {
      left.zip(right).map { case (l, r) => l + r }
    }).zip(vMethods)
  }
}
Example 38
Source File: LocalValidator.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{LocalDataSet, MiniBatch} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, MklBlas} import org.apache.log4j.Logger object LocalValidator { val logger = Logger.getLogger(getClass) } class LocalValidator[T] private[optim](model: Module[T], dataSet: LocalDataSet[MiniBatch[T]]) extends Validator[T, MiniBatch[T]](model, dataSet) { val logger = LocalValidator.logger private val coreNumber = Engine.coreNumber() private val subModelNumber = Engine.getEngineType match { case MklBlas => coreNumber case _ => throw new IllegalArgumentException } private val workingModels = (1 to subModelNumber).map(_ => model.cloneModule().evaluate()).toArray override def test(vMethods: Array[ValidationMethod[T]]) : Array[(ValidationResult, ValidationMethod[T])] = { val dataIter = dataSet.data (train = false) var count = 0 val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray logger.info("model thread pool size is " + Engine.model.getPoolSize) dataIter.map(batch => { val stackSize = batch.size() / subModelNumber val extraSize = batch.size() % subModelNumber val parallelism = if (stackSize == 0) extraSize else subModelNumber val start = System.nanoTime() val result = Engine.default.invokeAndWait( (0 until parallelism).map(b => () => { val offset = b * stackSize + math.min(b, extraSize) + 1 val length = stackSize + (if (b < extraSize) 1 else 0) val currentMiniBatch = batch.slice(offset, length) val input = currentMiniBatch.getInput() val target = currentMiniBatch.getTarget() val output = workingModels(b).forward(input) val validatMethods = vMethodsArr(b) validatMethods.map(validation => { validation(output.asInstanceOf[Tensor[T]], target) }) } ) ).reduce((left, right) => { left.zip(right).map { case (l, r) => l + r } }) count += batch.size() logger.info(s"[Validation] $count/${dataSet.size()} Throughput is ${ batch.size() / ((System.nanoTime() - start) / 1e9) } record / sec") result }).reduce((left, right) => { left.zip(right).map { case (l, r) => l + r } }).zip(vMethods) } }
Example 39
Source File: BigDLSpecHelper.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils import java.io.{File => JFile} import org.apache.log4j.Logger import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer abstract class BigDLSpecHelper extends FlatSpec with Matchers with BeforeAndAfter { protected val logger = Logger.getLogger(getClass) private val tmpFiles : ArrayBuffer[JFile] = new ArrayBuffer[JFile]() protected def createTmpFile(): JFile = { val file = java.io.File.createTempFile("UnitTest", "BigDLSpecBase") logger.info(s"created file $file") tmpFiles.append(file) file } protected def getFileFolder(path: String): String = { path.substring(0, path.lastIndexOf(JFile.separator)) } protected def getFileName(path: String): String = { path.substring(path.lastIndexOf(JFile.separator) + 1) } def doAfter(): Unit = {} def doBefore(): Unit = {} before { doBefore() } after { doAfter() tmpFiles.foreach(f => { if (f.exists()) { require(f.isFile, "cannot clean folder") f.delete() logger.info(s"deleted file $f") } }) } }
Example 40
Source File: RefLocalOptimizer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.DataSet import com.intel.analytics.bigdl.dataset.MiniBatch import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.optim.DistriOptimizer.getClass import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import org.apache.log4j.Logger import scala.reflect.ClassTag class RefLocalOptimizer[T: ClassTag]( model: Module[T], dataset: DataSet[MiniBatch[T]], criterion: Criterion[T] )(implicit ev: TensorNumeric[T]) extends Optimizer[T, MiniBatch[T]](model, dataset, criterion) { val logger: Logger = Logger.getLogger(getClass) val (w, g) = model.getParameters() override def optimize(): Module[T] = { val data = dataset.toLocal().data(train = true) var count = 0 state("epoch") = state.get[Int]("epoch").getOrElse(1) state("neval") = state.get[Int]("neval").getOrElse(1) while (!endWhen(state)) { val batch = data.next() val input = batch.getInput val target = batch.getTarget model.training() model.zeroGradParameters() val output = model.forward(input).asInstanceOf[Tensor[T]] val loss = criterion.forward(output, target) model.backward(input, criterion.backward(output, target)) optimMethods.head._2.optimize(_ => (loss, g), w, state) count += batch.size() state("neval") = state[Int]("neval") + 1 logger.info(s"loss is $loss") if (count >= dataset.size()) { state("epoch") = state[Int]("epoch") + 1 count = 0 } } model } }
Example 41
Source File: ParallelOptimizerSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{DataSet, MiniBatch} import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, MSECriterion} import com.intel.analytics.bigdl.optim.DistriOptimizerSpecModel.mse import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{Engine, T} import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} @com.intel.analytics.bigdl.tags.Serial class ParallelOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) private var sc: SparkContext = _ before { val conf = Engine.createSparkConf() .setMaster("local[1]").setAppName("ParallelOptimizerSpec") sc = new SparkContext(conf) Engine.init Engine.setCoreNumber(1) } after { if (sc != null) { sc.stop() } } "Train with parallel" should "work properly" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model = Linear[Float](10, 2) model.getParameters()._1.fill(1.0f) val optimMethod = new SGD[Float]() val dataSet = DataSet.array(Array(miniBatch), sc) val optimizer = new DistriOptimizer[Float](model, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) optimizer.optimize() } "Train with parallel" should "have same results as DistriOptimizer" in { val input = Tensor[Float](1, 10).fill(1.0f) val target = Tensor[Float](1).fill(1.0f) val miniBatch = MiniBatch(input, target) val model1 = Linear[Float](10, 2) model1.getParameters()._1.fill(1.0f) val model2 = Linear[Float](10, 2) model2.getParameters()._1.fill(1.0f) val dataSet = DataSet.array(Array(miniBatch), sc) val parallelOptimizer = new DistriOptimizer[Float](model1, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) parallelOptimizer.optimize val distriOptimizer = new DistriOptimizer[Float](model2, dataSet, new ClassNLLCriterion[Float]()) .setState(T("learningRate" -> 1.0)) .setEndWhen(Trigger.maxIteration(10)) distriOptimizer.optimize model1.getParameters()._1 should be (model2.getParameters()._1) } }
Example 42
Source File: SFObjectWriter.scala From spark-salesforce with Apache License 2.0 | 5 votes |
package com.springml.spark.salesforce import org.apache.log4j.Logger import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SaveMode} import com.springml.salesforce.wave.api.APIFactory import com.springml.salesforce.wave.api.BulkAPI import com.springml.salesforce.wave.util.WaveAPIConstants import com.springml.salesforce.wave.model.JobInfo class SFObjectWriter ( val username: String, val password: String, val login: String, val version: String, val sfObject: String, val mode: SaveMode, val upsert: Boolean, val externalIdFieldName: String, val csvHeader: String ) extends Serializable { @transient val logger = Logger.getLogger(classOf[SFObjectWriter]) def writeData(rdd: RDD[Row]): Boolean = { val csvRDD = rdd.map(row => row.toSeq.map(value => Utils.rowValue(value)).mkString(",")) val jobInfo = new JobInfo(WaveAPIConstants.STR_CSV, sfObject, operation(mode, upsert)) jobInfo.setExternalIdFieldName(externalIdFieldName) val jobId = bulkAPI.createJob(jobInfo).getId csvRDD.mapPartitionsWithIndex { case (index, iterator) => { val records = iterator.toArray.mkString("\n") var batchInfoId : String = null if (records != null && !records.isEmpty()) { val data = csvHeader + "\n" + records val batchInfo = bulkAPI.addBatch(jobId, data) batchInfoId = batchInfo.getId } val success = (batchInfoId != null) // Job status will be checked after completing all batches List(success).iterator } }.reduce((a, b) => a & b) bulkAPI.closeJob(jobId) var i = 1 while (i < 999999) { if (bulkAPI.isCompleted(jobId)) { logger.info("Job completed") return true } logger.info("Job not completed, waiting...") Thread.sleep(200) i = i + 1 } print("Returning false...") logger.info("Job not completed. Timeout..." ) false } // Create new instance of BulkAPI every time because Spark workers cannot serialize the object private def bulkAPI(): BulkAPI = { APIFactory.getInstance().bulkAPI(username, password, login, version) } private def operation(mode: SaveMode, upsert: Boolean): String = { if (upsert) { "upsert" } else if (mode != null && SaveMode.Overwrite.name().equalsIgnoreCase(mode.name())) { WaveAPIConstants.STR_UPDATE } else if (mode != null && SaveMode.Append.name().equalsIgnoreCase(mode.name())) { WaveAPIConstants.STR_INSERT } else { logger.warn("SaveMode " + mode + " Not supported. Using 'insert' operation") WaveAPIConstants.STR_INSERT } } }
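SFObjectWriter marks its logger @transient because log4j Loggers are not serializable and the writer is shipped to Spark executors. A plain @transient val comes back null after deserialization, so a common variant, shown below as an assumption rather than what spark-salesforce necessarily does, is a @transient lazy val that rebuilds the logger on first use on each executor.

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD

// Sketch: a serializable helper whose logger is recreated lazily on each executor.
class PartitionLogger extends Serializable {
  @transient lazy val logger: Logger = Logger.getLogger(classOf[PartitionLogger])

  def logPartitionSizes(rdd: RDD[String]): Unit =
    rdd.foreachPartition { it =>
      // Accessing the lazy val here re-initialises the logger instead of deserializing it.
      logger.info(s"partition holds ${it.size} records")
    }
}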
Example 43
Source File: TopWORDSApp.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession object TopWORDSApp extends Serializable { @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString) def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().getOrCreate() try { TopWORDSParser.parse(args).foreach { args => // remove output location files if exist val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true) // read input corpus val corpus = if (args.numPartitions > 0) spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions) else spark.sparkContext.textFile(args.inputLoc) LOGGER.info("Number of lines of input corpus: " + corpus.count()) // run TopWORDS with the parsed arguments new TopWORDS( tauL = args.tauL, tauF = args.tauF, textLenThld = args.textLenThld, useProbThld = args.useProbThld, numIterations = args.numIterations, convergeTol = args.convergeTol, wordBoundaryThld = args.wordBoundaryThld) .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts") } //exit normally LOGGER.info("Running TopWORDS successfully!") if (spark.sparkContext.master.contains("local")) sys.exit(0) } catch { case ex: Throwable => LOGGER.error("Running TopWORDS fail!", ex) //signal to external process if (spark.sparkContext.master.contains("local")) sys.exit(1) } finally spark.stop() } }
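TopWORDSApp passes the caught Throwable as the second argument to LOGGER.error, which makes log4j append the full stack trace to the log entry rather than just the message. A minimal stand-alone illustration with hypothetical names:

import org.apache.log4j.Logger

object ErrorLoggingSketch {
  private val logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    try {
      throw new IllegalStateException("boom")
    } catch {
      // The (message, throwable) overload also logs the exception's stack trace.
      case ex: Throwable => logger.error("Job failed!", ex)
    }
  }
}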
Example 44
Source File: SparkFunSuite.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package org.apache.spark

// scalastyle:off
import java.io.File

import scala.annotation.tailrec

import org.apache.log4j.{Appender, Level, Logger}
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite}

import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.Tests.IS_TESTING
import org.apache.spark.util.{AccumulatorContext, Utils}

// The class header below is a minimal reconstruction (the extract omits it); the original
// base suite mixes in additional traits beyond the ones shown here.
abstract class SparkFunSuite extends FunSuite with BeforeAndAfterAll with Logging {

  protected def withLogAppender(
      appender: Appender,
      loggerName: Option[String] = None,
      level: Option[Level] = None)(
      f: => Unit): Unit = {
    val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger)
    val restoreLevel = logger.getLevel
    logger.addAppender(appender)
    if (level.isDefined) {
      logger.setLevel(level.get)
    }
    try f finally {
      logger.removeAppender(appender)
      if (level.isDefined) {
        logger.setLevel(restoreLevel)
      }
    }
  }
}
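withLogAppender temporarily attaches an appender (and optionally a level) to a logger, runs the block, and then restores the previous state, which lets a test assert on what was logged. A usage sketch of the same idea with a WriterAppender capturing output in memory; the logger name and messages are arbitrary:

import java.io.StringWriter

import org.apache.log4j.{Level, Logger, SimpleLayout, WriterAppender}

object LogCaptureSketch extends App {
  val buffer = new StringWriter()
  val appender = new WriterAppender(new SimpleLayout(), buffer)
  val logger = Logger.getLogger("my.suite.logger")
  val restoreLevel = logger.getLevel

  logger.addAppender(appender)
  logger.setLevel(Level.WARN)
  try {
    logger.warn("something worth asserting on")
    assert(buffer.toString.contains("something worth asserting on"))
  } finally {
    logger.removeAppender(appender)
    logger.setLevel(restoreLevel)
  }
}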
Example 45
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // The K-means cost: the sum of squared distances of points to their nearest center
    val cost = model.computeCost(examples)

    // Get the cluster centers (k of them)
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")
    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 46
Source File: NonCommonWordSelector.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.spot

import java.io._

import com.officedepot.cdap2.collection.CompactHashSet
import org.apache.log4j.Logger
import org.dbpedia.spotlight.io.WortschatzParser
import org.dbpedia.spotlight.model.SurfaceFormOccurrence

import scala.collection.JavaConversions._

// The class header and the isCommonWord implementation are omitted from this extract;
// the reconstruction below is a minimal assumption built on the imported WortschatzParser.
class NonCommonWordSelector(val filename: String) {

  private val LOG = Logger.getLogger(this.getClass)

  // Assumption: the common-word set is loaded from a Wortschatz word-count file.
  private val commonWords: CompactHashSet[String] = WortschatzParser.parse(filename)

  def isCommonWord(occ: SurfaceFormOccurrence): Boolean =
    commonWords.contains(occ.surfaceForm.name)

  def select(occs: java.util.List[SurfaceFormOccurrence]): java.util.List[SurfaceFormOccurrence] = {
    occs.filter(o => !isCommonWord(o))
  }
}

object NonCommonWordSelector {

  def main(args: Array[String]) {
    def usage = println(" Usage: scala -cp $CP NonCommonWordSelector words.txt ")

    args(0) match {
      case file: String => {
        new NonCommonWordSelector(file)
      }
      case _ => usage
    }
  }
}
Example 47
Source File: SparkSqlUtils.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.utils

import java.io.{BufferedInputStream, BufferedReader, FileInputStream, InputStreamReader}
import java.nio.file.Path

import com.google.common.io.Resources
import org.apache.log4j.{Level, Logger}
import org.apache.mesos.Protos.Resource
import org.apache.spark.sql.SparkSession

import scala.io.Source

// The enclosing object and the getPathByName helper are omitted from this extract;
// both are reconstructed minimally here. Resolving the path from the classpath via
// Guava's Resources is an assumption.
object SparkSqlUtils {

  def getPathByName(name: String): String = {
    Resources.getResource(name).getPath
  }

  def readSqlByPath(sqlPath: String): String = {
    val buf = new StringBuilder
    val path = this.getPathByName(sqlPath)
    val file = Source.fromFile(path)
    for (line <- file.getLines) {
      buf ++= line + "\n"
    }
    file.close
    buf.toString()
  }
}
Example 48
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter import com.amazonaws.services.kinesis.model.Record import com.google.gson.Gson import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} object Trials extends App { import org.apache.log4j.{Level, Logger} Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) //session setup System.setProperty("hadoop.home.dir", "C:\\winutils") val sparkSession = SparkSession.builder() .master("local[*]") .appName("test") .getOrCreate() val sc = sparkSession.sparkContext val ssc = new StreamingContext(sc, Seconds(10)) val sqlContext = sparkSession.sqlContext //creates an array of strings from raw byte array def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",") //converts records to map of key value pair and then json def recordHandler = (record: Record) => { val gson = new Gson val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage) gson.toJson(map) } case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String) val stream_cab = KinesisInputDStream.builder .streamingContext(ssc) .streamName("cab_rides") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) val stream_weather = KinesisInputDStream.builder .streamingContext(ssc) .streamName("weather") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) //creating dataframe, can be stored as temp view val cabSchema = Encoders.product[CabPrice].schema stream_cab.foreachRDD(rdd => { import sqlContext.implicits._ //val xx: Dataset[String] = rdd.toDS() val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS()) df.show() }) ssc.start() ssc.awaitTermination() }
Example 49
Source File: MetricsTest.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.metrics import org.apache.log4j.Logger import org.scalatest.FunSuite class MetricsTest extends FunSuite { implicit val logger = Logger.getLogger(this.getClass) test("Write a String metric") { Metrics.gauge("foo", "bar") } test("Write a Long metric") { Metrics.gauge("long_measure_i", 10) } test("Time a function") { val result = Metrics.time("timed_function") { Thread.sleep(10) "ok" } assertResult("ok")(result) } test("Write an Int gauge") { Metrics.gauge("int_measure_l", 10L) } test("Write an Double gauge") { Metrics.gauge("double_measure_l", 10.0F) } test("Write an Float gauge") { Metrics.gauge("float_measure_l", 10.0D) } }
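MetricsTest declares its logger as an implicit val, which suggests the Metrics helpers take the Logger as an implicit parameter so call sites stay terse. The SimpleMetrics object below sketches that shape; it is an assumption, not pulse's actual Metrics implementation.

import org.apache.log4j.Logger

// Hypothetical metrics facade that picks up the caller's logger implicitly.
object SimpleMetrics {
  def gauge(name: String, value: Any)(implicit logger: Logger): Unit =
    logger.info(s"metric $name=$value")

  def time[T](name: String)(block: => T)(implicit logger: Logger): T = {
    val start = System.nanoTime()
    try block
    finally logger.info(s"metric $name took ${(System.nanoTime() - start) / 1e6} ms")
  }
}

// Usage, mirroring the test above:
//   implicit val logger = Logger.getLogger(this.getClass)
//   SimpleMetrics.gauge("foo", "bar")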
Example 50
Source File: StandaloneAppExample.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.example import javax.naming.NamingException import org.apache.log4j.{ Logger, MDC, NDC } object StandaloneAppExample { private val log = Logger.getLogger(this.getClass) def main(args: Array[String]): Unit = { MDC.put("hostname", java.net.InetAddress.getLocalHost().getHostName()) NDC.push("ndc message") val numEvents = args(0).toInt val sleepMillis = args(1).toInt 0 to numEvents map { num => val uuid = java.util.UUID.randomUUID.toString() log.info(s"info message $uuid") Thread.sleep(sleepMillis) if (num % 300 == 0) { log.error(s"error happened $uuid", new Exception()) } } // Throw an exception so when we turn on `log4j.debug=true` we can see where events stop posting to the log collector // it's a <code>NamingException</code> so we can catch it in the test for this class throw new NamingException("exiting") } }
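The MDC.put and NDC.push calls above only show up in the output if the active layout references them: with log4j 1.x PatternLayout, %X{key} prints an MDC entry and %x prints the NDC stack. The appender and conversion pattern below are illustrative assumptions, not taken from this project's log4j configuration.

import org.apache.log4j.{ConsoleAppender, Logger, MDC, NDC, PatternLayout}

object MdcLayoutSketch extends App {
  MDC.put("hostname", java.net.InetAddress.getLocalHost.getHostName)
  NDC.push("ndc message")

  // %X{hostname} pulls the MDC value, %x prints the NDC stack.
  val layout = new PatternLayout("%d{ISO8601} [%X{hostname}] %x %-5p %c - %m%n")
  Logger.getRootLogger.addAppender(new ConsoleAppender(layout))

  Logger.getLogger(getClass).info("message with diagnostic context")
}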
Example 51
Source File: SocialGraphJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.social import com.github.graphx.pregel.social.SocialGraph import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext object SocialGraphJob { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "GraphX") val graph = new SocialGraph(sc) println("Top 10 most-connected users:") graph.getMostConnectedUsers(10) foreach println println("Computing degrees of separation for user Arch") graph.degreeOfSeparationSingleUser(5306) foreach println println("Computing degrees of separation for user Arch and Fred") graph.degreeOfSeparationTwoUser(5306, 14) foreach println println("Connected component") graph.connectedComponentGroupedByUsers .sortBy ( {case (_, lowestVertexId) => lowestVertexId}, ascending = false).take(10) foreach println sc.stop() } }
Example 52
Source File: ShortestPathProblemJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.ssp import com.github.graphx.pregel.ssp.ShortestPathProblem import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.graphx.VertexId object ShortestPathProblemJob extends App { Logger.getLogger("org").setLevel(Level.ERROR) val sc = new SparkContext("local[*]", "ShortestPathProblemDemo") val ssp = new ShortestPathProblem(sc) val sourceIdForTest: VertexId = 3 val sourceIdForRandom: VertexId = 75 val testGraph = ssp.testGraph val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest) println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" + s"Distances on the test graph $resultOnTestGraph\n") val randomGraph = ssp.randomGraph val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom) println(s"Distances on the random graph $resultOnRandomGraph\n") }
Example 53
Source File: SocialPageRankJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pagerank

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexRDD

object SocialPageRankJob {

  // The ranks helper was omitted from this extract; dynamic PageRank, run until the
  // per-vertex change falls below the given tolerance, is the assumed implementation.
  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tolerance).vertices

  def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.staticPageRank(numIter = 20).vertices

  def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = {
    socialGraph.verts.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy({ case (_, rank) => rank }, ascending = false).take(10)
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "PageRank")
    val socialGraph: SocialGraph = new SocialGraph(sc)

    val TOLERANCE: Double = 0.0001

    import scala.compat.Platform.{EOL => D}
    val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D)
    val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D)

    println(s"Top 10 users in the network, ranked dynamically until convergence $TOLERANCE - $D $topUsersDynamically")
    println(s"Top 10 users in the network, ranked iteratively - $D $topUsersIterative")

    sc.stop()
  }
}
Example 54
Source File: CheckDirEndPointImpl.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import com.google.protobuf.{RpcCallback, RpcController, Service}
import org.apache.hadoop.hbase.coprocessor._
import org.apache.hadoop.hbase.{Coprocessor, CoprocessorEnvironment}
import org.apache.log4j.Logger

class CheckDirEndPointImpl
  extends CheckDirProtos.CheckDirService with Coprocessor with CoprocessorService {

  private lazy val logger = Logger.getLogger(getClass.getName)
  private var env: RegionCoprocessorEnvironment = null

  override def start(env: CoprocessorEnvironment) = {
    env match {
      case e: RegionCoprocessorEnvironment => this.env = e
      case _ => throw new CoprocessorException("Must be loaded on a table region!")
    }
  }

  override def stop(env: CoprocessorEnvironment) = {}

  override def getService: Service = this

  override def getCheckResult(controller: RpcController,
                              request: CheckDirProtos.CheckRequest,
                              done: RpcCallback[CheckDirProtos.CheckResponse]) = {
    val isDir = new java.io.File(".").isDirectory
    if (!isDir) {
      logger.warn(
        """Current directory is not accessible,
          |please add 'cd ~' before starting the regionserver in your regionserver start script.
        """.stripMargin)
    }
    val response: CheckDirProtos.CheckResponse = {
      CheckDirProtos.CheckResponse.newBuilder().setAccessible(isDir).build()
    }
    done.run(response)
  }
}
Example 55
Source File: AkkaUtils.scala From DataXServer with Apache License 2.0 | 5 votes |
package org.tianlangstudio.data.hamal.yarn.util

import akka.actor.{ActorSystem, ExtendedActorSystem}
import com.typesafe.config.ConfigFactory
import org.apache.log4j.{Level, Logger}
import org.tianlangstudio.data.hamal.core.{Constants, HamalConf}

// The enclosing object is omitted from this extract and is reconstructed here; the
// maximum frame size constant is assumed to follow Akka's Int.MaxValue byte limit.
object AkkaUtils {

  private val AKKA_MAX_FRAME_SIZE_IN_MB = Int.MaxValue / 1024 / 1024

  def maxFrameSizeBytes(conf: HamalConf): Int = {
    val frameSizeInMB = conf.getInt("datax.akka.frameSize", 128)
    if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) {
      throw new IllegalArgumentException(
        s"datax.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB")
    }
    frameSizeInMB * 1024 * 1024
  }

  def protocol(actorSystem: ActorSystem): String = {
    val akkaConf = actorSystem.settings.config
    val sslProp = "akka.remote.netty.tcp.enable-ssl"
    protocol(akkaConf.hasPath(sslProp) && akkaConf.getBoolean(sslProp))
  }

  def protocol(ssl: Boolean = false): String = {
    if (ssl) {
      "akka.ssl.tcp"
    } else {
      "akka.tcp"
    }
  }

  def address(
      protocol: String,
      systemName: String,
      host: String,
      port: Int,
      actorName: String): String = {
    address(protocol, systemName, s"$host:$port", actorName)
  }

  def address(
      protocol: String,
      systemName: String,
      hostPort: String,
      actorName: String): String = {
    s"$protocol://$systemName@$hostPort/user/$actorName"
  }
}
Example 56
Source File: ModelSerialization.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.features.FeatureEngineering import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} object ModelSerialization { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) samples.printSchema() samples.show(5, false) //model training println("Neural Network Ctr Prediction Model:") val innModel = new InnerProductNNCtrModel() innModel.train(samples) val transformedData = innModel.transform(samples) transformedData.show(1,false) //model serialization by mleap val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer() mleapModelSerializer.serializeModel(innModel._pipelineModel, "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData) //model serialization by JPMML val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer() jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData) } }
Example 57
Source File: ModelSelection.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example import com.ggstar.ctrmodel._ import com.ggstar.evaluation.Evaluator import com.ggstar.features.FeatureEngineering import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.apache.log4j.{Level, Logger} object ModelSelection { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val conf = new SparkConf() .setMaster("local") .setAppName("ctrModel") .set("spark.submit.deployMode", "client") val spark = SparkSession.builder.config(conf).getOrCreate() val resourcesPath = this.getClass.getResource("/samples.snappy.orc") val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath) rawSamples.printSchema() rawSamples.show(10) //transform array to vector for following vectorAssembler val samples = FeatureEngineering.transferArray2Vector(rawSamples) //split samples into training samples and validation samples val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3)) val evaluator = new Evaluator } }
Example 58
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
      (implicit recOpts: RecommendLogOptions): Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))
  }
}
// scalastyle:on println
Example 59
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.reduce((x, y) => x + y)}""") } } // scalastyle:on println
Example 60
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object StatsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("StatsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array.range(1, 11)) val stats = nums.stats() println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""count: ${stats.count}""") println(s"""mean: ${stats.mean}""") println(s"""stdev: ${stats.stdev}""") } } // scalastyle:on println
Example 61
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""") } } // scalastyle:on println
Example 62
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object OrderExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""top3: ${nums.top(3).mkString(", ")}""")
    println(s"""takeOrdered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}
// scalastyle:on println
Example 63
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  private[basic_action] def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""avg: $avg""")
  }
}
// scalastyle:on println
Example 64
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CollectAsMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CollectAsMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1) ), 3 ) val fruitsAsMap = fruits.collectAsMap() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitsAsMap: $fruitsAsMap""") } } // scalastyle:on println
Example 65
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)
    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
    persistedLines.collect()
  }
}
Example 66
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
Example 67
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object PartitionExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("Partition") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1) println(s"""nums:\n ${nums.collect().mkString(", ")}""") println() println("original:") nums.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar3 = nums.repartition(3) println("repartition to 3:") numsPar3.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar2 = numsPar3.coalesce(2) println("coalesce to 2:") numsPar2.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } // scalastyle:on println
Example 68
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object WordCountExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)
    run(sc, args(0))
    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}
// scalastyle:on println
Example 69
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object AggregateByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("AggregateByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))( seqOp = (partAcc, n) => partAcc += n, combOp = (acc1, acc2) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 70
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapValuesExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapValuesExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1))) val plusOnes = fruits.mapValues(v => v + 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""") } } // scalastyle:on println
Example 71
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SortByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyDesc = fruits.sortByKey(ascending = false)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyDesc: ${sortedByKeyDesc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 72
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CoGroupExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CoGroupExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val grouped = persons.map(_.swap).cogroup(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""grouped:\n${grouped.collect().mkString("\n")}""") } } // scalastyle:on println
Example 73
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object JoinExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("JoinExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val leftJoined = persons.map(_.swap).join(cities) val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities) val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities) val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""") println() println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""") println() println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""") println() println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""") } } // scalastyle:on println
Example 74
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object GroupByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("GroupByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val grouped = fruits.groupByKey() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""grouped: ${grouped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 75
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.reduceByKey((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 76
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CombineByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CombineByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.combineByKey( createCombiner = (v: Int) => Acc(v.toDouble, 1), mergeValue = (partAcc: Acc, n: Int) => partAcc += n, mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 77
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.foldByKey(0)((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 78
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapPartitionsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple", "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple", "num": 2}""",
      """{"name": "Peach", "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
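The point of mapPartitions here is that the Jackson ObjectMapper is constructed once per partition rather than once per record. A sketch of the plain map alternative (assuming the same jsonLines RDD and imports as above) shows the per-record overhead that mapPartitions avoids:

// With map, a new ObjectMapper would be built for every single record.
val parsedPerRecord = jsonLines.map { line =>
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)
  val f = mapper.readValue(line, classOf[Map[String, String]])
  (f("name"), f("num"))
}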
Example 79
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FlatMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))
    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 80
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SetOperationsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 81
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 82
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ZipExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)
    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped: ${zipped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 83
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object DistinctExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 84
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SampleExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 85
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FilterExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 86
Source File: SparkFunSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark

// scalastyle:off
import org.apache.log4j.{Level, Logger}
import org.scalatest.{FunSuite, Outcome}

import org.apache.spark.Logging

// Note: the class declaration is elided in this listing; it is presumably along
// the lines of `abstract class SparkFunSuite extends FunSuite with Logging {`.

  final protected override def withFixture(test: NoArgTest): Outcome = {
    val testName = test.text
    val suiteName = this.getClass.getName
    val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s")
    try {
      Logger.getLogger("org").setLevel(Level.OFF)
      Logger.getLogger("akka").setLevel(Level.OFF)
      logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n")
      test()
    } finally {
      logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n")
    }
  }
}
Example 87
Source File: Logging.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata.objects

import org.apache.log4j.{Level, LogManager, Logger}

trait Logging {

  // Method to get the logger name for this object
  protected def logName: String = {
    // Ignore trailing $'s in the class names for Scala objects
    this.getClass.getName.stripSuffix("$")
  }

  private val log: Logger = LogManager.getLogger(logName)

  // Set logger level
  protected def setLogLevel(level: Level): Unit = log.setLevel(level)

  // Log methods that take only a String
  protected def logInfo(msg: => String) {
    if (log.isInfoEnabled) log.info(msg)
  }

  protected def logDebug(msg: => String) {
    if (log.isDebugEnabled) log.debug(msg)
  }

  protected def logTrace(msg: => String) {
    if (log.isTraceEnabled) log.trace(msg)
  }

  protected def logWarning(msg: => String) {
    log.warn(msg)
  }

  protected def logError(msg: => String) {
    log.error(msg)
  }

  // Log methods that take Throwables (Exceptions/Errors) too
  protected def logInfo(msg: => String, throwable: Throwable) {
    if (log.isInfoEnabled) log.info(msg, throwable)
  }

  protected def logDebug(msg: => String, throwable: Throwable) {
    if (log.isDebugEnabled) log.debug(msg, throwable)
  }

  protected def logTrace(msg: => String, throwable: Throwable) {
    if (log.isTraceEnabled) log.trace(msg, throwable)
  }

  protected def logWarning(msg: => String, throwable: Throwable) {
    log.warn(msg, throwable)
  }

  protected def logError(msg: => String, throwable: Throwable) {
    log.error(msg, throwable)
  }

  protected def isTraceEnabled: Boolean = {
    log.isTraceEnabled
  }
}
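A minimal usage sketch (the CopyJob class is hypothetical, not part of spark-distcp): mix the trait into any class and call the level-guarded helpers. Because msg is a by-name parameter, the message string is only built when the corresponding log level is enabled.

import org.apache.log4j.Level
import com.coxautodata.objects.Logging

class CopyJob extends Logging {
  def run(): Unit = {
    setLogLevel(Level.INFO)
    logInfo(s"starting copy at ${System.currentTimeMillis()}")
    logDebug("not even constructed unless DEBUG is enabled") // guarded by isDebugEnabled
  }
}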
Example 88
Source File: StressReceiver.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.log4j.Logger

class StressReceiver[T](
    index: Int,
    rowGenerator: RowGenerator[T],
    config: Config,
    blockIntervalInMs: Int,
    storageLevel: StorageLevel)
  extends Receiver[T](storageLevel) {

  class EmitterThread(receiver: StressReceiver[_]) extends Thread(s"Emitter$index") {
    override def run(): Unit = {
      val rowIterator = rowGenerator.generatePartition(config.seed, index)
      val throughPutPerBlockInterval =
        (blockIntervalInMs / (config.streamingBatchIntervalSeconds * 1000.0) *
          config.receiverThroughputPerBatch).toLong
      while (rowIterator.hasNext) {
        val batchBegin = System.currentTimeMillis()
        for (x <- 1L to throughPutPerBlockInterval if rowIterator.hasNext) {
          store(rowIterator.next())
        }
        val batchEnd = System.currentTimeMillis()
        val napTime = blockIntervalInMs - (batchEnd - batchBegin)
        if (napTime > 0) Thread.sleep(napTime)
      }
      receiver.stop("Iterator Empty")
    }
  }

  def onStart() = {
    new EmitterThread(this).start()
  }

  def onStop() = {
  }
}
Example 89
Source File: MCLModelSuite.scala From MCL_spark with MIT License | 5 votes |
package org.apache.spark.mllib.clustering import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.util.Utils class MCLModelSuite extends MCLFunSuite{ // Disable Spark messages when running program Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) test("model save/load", UnitTest){ val users: RDD[(VertexId, String)] = sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) val relationships: RDD[Edge[Double]] = sc.parallelize( Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), Edge(0, 2, 1.0), Edge(2, 0, 1.0), Edge(0, 3, 1.0), Edge(3, 0, 1.0), Edge(1, 2, 1.0), Edge(2, 1, 1.0), Edge(1, 3, 1.0), Edge(3, 1, 1.0), Edge(2, 3, 1.0), Edge(3, 2, 1.0), Edge(4, 5, 1.0), Edge(5, 4, 1.0), Edge(4, 6, 1.0), Edge(6, 4, 1.0), Edge(4, 7, 1.0), Edge(7, 4, 1.0), Edge(5, 6, 1.0), Edge(6, 5, 1.0), Edge(5, 7, 1.0), Edge(7, 5, 1.0), Edge(6, 7, 1.0), Edge(7, 6, 1.0), Edge(3, 8, 1.0), Edge(8, 3, 1.0), Edge(9, 8, 1.0), Edge(8, 9, 1.0), Edge(9, 10, 1.0), Edge(10, 9, 1.0), Edge(4, 10, 1.0), Edge(10, 4, 1.0) )) val graph = Graph(users, relationships) val model: MCLModel = MCL.train(graph) // Check number of clusters model.nbClusters shouldEqual 3 // Check save and load methods val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString Array(true, false).foreach { case selector => // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = MCLModel.load(sc, path) assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id")) } finally { Utils.deleteRecursively(tempDir) } } } test("nodes assignments", UnitTest) { val nodeId = 1.0.toLong val cluster = 2.0.toLong val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster)) newAssignment.id shouldEqual nodeId newAssignment.cluster shouldEqual cluster } }
Example 90
Source File: PCA.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.unsupervised.pca

import scala.util.Try

import org.apache.log4j.Logger
import org.apache.commons.math3.linear._
import org.apache.commons.math3.stat.correlation.Covariance

import org.scalaml.stats.TSeries
import org.scalaml.Predef._
import org.scalaml.Predef.Context._
import org.scalaml.core.ITransform
import org.scalaml.util.LoggingUtils._
import TSeries._

case class PCAModel(covariance: DblMatrix, eigenvalues: Array[Double])

// Note: the enclosing PCA class declaration (and the definitions of xt, model,
// dimension and margin used below) is elided in this listing.

  override def |> : PartialFunction[Array[T], Try[Double]] = {
    case x: Array[T] if x.length == dimension(xt) && model.isDefined =>
      Try(margin(x, model.get.eigenvalues))
  }

  override def toString: String = model.map(m => {
    val covStr = m.covariance./:(new StringBuilder)((b, r) => b.append(s"${r.mkString(" ")}\n")).toString()
    s"""\nEigenvalues:\n${m.eigenvalues.mkString(" ,")}\n\nCovariance matrix\n | $covStr""".stripMargin
  }).getOrElse("PCA model undefined")
}
//-------------------------------- EOF -------------------------------------------------------------------------
Example 91
Source File: ClusteringModule.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.workflow.module

import org.apache.log4j.Logger

import org.scalaml.Predef.Context.ToDouble
import org.scalaml.Predef.VSeries

// Note: the enclosing module trait and clustering class declarations (including
// the em, show, error and logger members used below) are elided in this listing.

    override def execute(xt: Array[T]): Unit = {
      try {
        val results = em |> xt
        show(results, logger)
      } catch {
        case e: MatchError =>
          val errMsg = s"${e.getMessage()} caused by ${e.getCause.toString}"
          error(s"ClusteringModule.MultivariateEM $errMsg", logger)
        case e: Throwable => error("ClusteringModule.Kmeans", logger, e)
      }
    }
  }
}
// --------------------------------------- EOF ------------------------------------------------------
Example 92
Source File: BiasVariance.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.stats

// 3rd party classes
import org.apache.log4j.Logger
// Scala for Machine Learning classes and singletons
import org.scalaml.Predef._

// Note: the enclosing companion object declaration (and the NUMVALUES_LIMITS
// constant it defines) is elided in this listing.

  def apply(emul: Double => Double, nValues: Int): BiasVariance = new BiasVariance(emul, nValues)

  private def check(nValues: Int): Unit = {
    require(
      nValues > NUMVALUES_LIMITS._1 && nValues < NUMVALUES_LIMITS._2,
      s"BiasVarianceEmulator.check Size of training sets $nValues is out of range"
    )
  }
}
// ----------------------- EOF --------------------------------------
Example 93
Source File: FileUtils.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.util

import org.apache.log4j.Logger

import scala.io.Source._
import scala.util.{Failure, Success, Try}

// Note: the enclosing object declaration (and the logger it defines) is elided
// in this listing.

  def write(content: String, pathName: String, className: String): Boolean = {
    import java.io.PrintWriter
    import DisplayUtils._

    var printWriter: Option[PrintWriter] = None
    var status = false
    Try {
      printWriter = Some(new PrintWriter(pathName))
      printWriter.foreach(_.write(content))
      printWriter.foreach(_.flush)
      printWriter.foreach(_.close)
      status = true
    } match {
      // Catch and display exception description and return false
      case Failure(e) =>
        error(s"$className.write failed for $pathName", logger, e)
        if (printWriter.isDefined) printWriter.foreach(_.close)
        status
      case Success(s) => status
    }
  }
}
// --------------------------------- EOF -------------------------------------
Example 94
Source File: KmeansTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mllib import org.apache.log4j.{Level, Logger} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import org.scalaml.{Logging, Resource} import org.scalaml.Predef._ import org.scalaml.stats.TSeries._ import org.scalaml.trading.YahooFinancials import org.scalaml.workflow.data.DataSource import org.scalatest.FunSuite import org.scalatest.concurrent.ScalaFutures import scala.concurrent.Future final class KmeansTest extends FunSuite with ScalaFutures with Logging with Resource { import scala.concurrent.ExecutionContext.Implicits.global protected[this] val name = "Spark MLlib K-Means" private val K = 8 private val NRUNS = 4 private val MAXITERS = 60 private val PATH = "spark/CSCO.csv" private val CACHE = false test(s"$name evaluation") { show(s"Evaluation") Logger.getRootLogger.setLevel(Level.ERROR) // The Spark configuration has to be customize to your environment val sparkConf = new SparkConf().setMaster("local") .setAppName("Kmeans") .set("spark.executor.memory", "4096m") implicit val sc = SparkContext.getOrCreate(sparkConf) // no need to load additional jar file val kmeanClustering: Option[Kmeans] = extract.map(input => { val volatilityVol = zipToSeries(input._1, input._2).take(500) val config = new KmeansConfig(K, MAXITERS, NRUNS) val rddConfig = RDDConfig(CACHE, StorageLevel.MEMORY_ONLY) Kmeans(config, rddConfig, volatilityVol) }) // Wraps into a future to enforce time out in case of a straggler val ft = Future[Boolean] { predict(kmeanClustering) } whenReady(ft) { result => assert(result) } sc.stop } private def predict(kmeanClustering: Option[Kmeans]): Boolean = { kmeanClustering.map(kmeansCluster => { val obs = Array[Double](0.1, 0.9) val clusterId1 = kmeansCluster |> obs show(s"(${obs(0)},${obs(1)}) => Cluster #$clusterId1") val obs2 = Array[Double](0.56, 0.11) val clusterId2 = kmeansCluster |> obs2 val result = s"(${obs2(0)},${obs2(1)}) => Cluster #$clusterId2" show(s"$name result: $result") }) true } private def extract: Option[(DblVec, DblVec)] = { import scala.util._ val extractors = List[Array[String] => Double]( YahooFinancials.volatility, YahooFinancials.volume ) DataSource(getPath(PATH).get, true).map(_.|>) match { case Success(pfnSrc) => pfnSrc(extractors).map(res => ((res(0).toVector, res(1).toVector))).toOption case Failure(e) => failureHandler(e) None } } } // --------------------------------- EOF -------------------------------------------------
Example 95
Source File: StreamsTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.scalability.scala

import java.lang.ref._

import org.apache.log4j.Logger
import org.scalaml.Logging
import org.scalaml.Predef._
import org.scalatest.{FlatSpec, Matchers}

import scala.math._

case class DataPoint(x: DblVec, y: Double)

final class StreamsTest extends FlatSpec with Matchers with Logging {
  import scala.util.Random

  protected[this] val name = "Scala streams"

  it should s"$name huge list" in {
    show(s"$name huge list")
    val input = (0 until 1000000000).toStream
    input(10) should be(10)
  }

  it should s"$name recursion" in {
    show(s"$name recursion")

    def mean(strm: => Stream[Double]): Double = {
      @scala.annotation.tailrec
      def mean(z: Double, count: Int, strm: Stream[Double]): (Double, Int) =
        if (strm.isEmpty) (z, count)
        else mean((1.0 - 1.0 / count) * z + strm.head / count, count + 1, strm.tail)
      mean(0.0, 1, strm)._1
    }

    val input = List[Double](2.0, 5.0, 3.5, 2.0, 5.7, 1.0, 8.0)
    val ave: Double = mean(input.toStream)
    ave should be(3.88 +- 0.05)
  }

  it should s"$name with recycled memory blocks" in {
    show(s"$name with recycled memory blocks") // fixed: the original listing omitted the s interpolator

    type DblVec = Vector[Double]
    val DATASIZE = 20000

    val dot = (s: Double, xy: (Double, Double)) => s + xy._1 * xy._2
    val diff = (x: DblVec, y: DblVec) => x.zip(y).aggregate(0.0)(dot, _ + _)
    val weights = Vector[Double](0.5, 0.7)
    val lossFunction = new LossFunction(diff, weights, DATASIZE)

    // Create a stream of weak references to 10 stream segments of size DATASIZE/10
    val stream = () => new WeakReference(
      Stream.tabulate(DATASIZE)(n => DataPoint(
        Vector[Double](n.toDouble, n * (n.toDouble)),
        n.toDouble * weights(0) + n * (n.toDouble) * weights(1) + 0.1 * Random.nextDouble
      ))
    )

    // Compute a simple distance using the dot product
    val totalLoss = sqrt(lossFunction.compute(stream))
    show(s"$name totalLoss ${totalLoss / DATASIZE}")

    val averageLoss = totalLoss / DATASIZE
    averageLoss should be(0.0 +- 0.001)
  }
}
// -------------------------- EOF --------------------------------
Example 96
Source File: Application.scala From retail_analytics with Apache License 2.0 | 5 votes |
package controllers

import scalaz._
import Scalaz._
import scalaz.EitherT._
import scalaz.Validation
//import scalaz.Validation.FlatMap._
import scalaz.NonEmptyList._

import play.api.mvc._
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level

import models._
import models.stack._

import play.api.libs.json._

object Application extends Controller {

  def index() = Action { implicit request =>
    Ok(views.html.index("Megam Analytics."))
  }

  def upload = Action(parse.multipartFormData) { implicit request =>
    request.body.file("picture").map { picture =>
      import java.io.File
      val filename = picture.filename
      val contentType = picture.contentType
      picture.ref.moveTo(new File("/tmp/" + filename))
      models.HDFSFileService.saveFile("/tmp/" + filename) match {
        case Success(succ) => {
          val fu = List(("success" -> succ))
          Redirect("/").flashing(fu: _*)
        }
        case Failure(err) => {
          val fu = List(("error" -> "File doesn't get uploaded"))
          Redirect("/").flashing(fu: _*)
        }
      }
    }.getOrElse {
      val fu = List(("error" -> "File doesn't get uploaded.."))
      Redirect("/").flashing(fu: _*)
    }
  }

  def analysis() = Action { implicit request =>
    val tuple_res = models.Retail.buyingbehaviour(MConfig.recommand_ID.toInt, MConfig.retailfile)
    println("BACK==========================>>>")
    println(tuple_res._1)
    //val finalJson = {
    //  for {
    //    product <- productList
    //  } yield Json.parse(product).as[JsObject]
    //  }
    Ok(views.html.finalProducts(tuple_res._1, tuple_res._2))
  }
}
Example 97
Source File: InductiveClassifier.scala From scala-cp with Apache License 2.0 | 5 votes |
package se.uu.it.cp

import org.apache.log4j.Logger

// Note: the enclosing classifier class declaration (and the alphas, log and
// mondrianPv members used below) is elided in this listing.

  def predict(features: Seq[Double], significance: Double) = {
    // Validate input
    require(significance > 0 && significance < 1, s"significance $significance is not in (0,1)")
    alphas.foreach { a =>
      if (a.length < 1 / significance - 1) {
        log.warn(s"too few calibration samples (${a.length}) for significance $significance")
      }
    }
    // Compute prediction set
    mondrianPv(features).zipWithIndex.map {
      case (pVal, c) =>
        if (pVal > significance) {
          Set(c.toDouble)
        } else {
          Set[Double]()
        }
    }.reduce(_ ++ _)
  }
}
Example 98
Source File: FileOutputIT.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta import java.sql.Timestamp import java.util.UUID import com.github.nscala_time.time.Imports._ import com.stratio.sparta.sdk.pipeline.output.{Output, OutputFormatEnum, SaveModeEnum} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.scalatest._ import scala.reflect.io.File class FileOutputIT extends FlatSpec with ShouldMatchers with BeforeAndAfterAll { self: FlatSpec => @transient var sc: SparkContext = _ override def beforeAll { Logger.getRootLogger.setLevel(Level.ERROR) sc = FileOutputIT.getNewLocalSparkContext(1, "test") } override def afterAll { sc.stop() System.clearProperty("spark.driver.port") } trait CommonValues { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val time = new Timestamp(DateTime.now.getMillis) val data = sc.parallelize(Seq(Person("Kevin", 18, time), Person("Kira", 21, time), Person("Ariadne", 26, time))).toDF val tmpPath: String = s"/tmp/sparta-test/${UUID.randomUUID().toString}" } trait WithEventData extends CommonValues { val properties = Map("path" -> tmpPath, "createDifferentFiles" -> "false") val output = new FileOutput("file-test", properties) } "FileOutputIT" should "save a dataframe" in new WithEventData { output.save(data, SaveModeEnum.Append, Map(Output.TimeDimensionKey -> "minute", Output.TableNameKey -> "person")) val source = new java.io.File(tmpPath).listFiles() val read = sqlContext.read.json(tmpPath).toDF read.count shouldBe(3) File("/tmp/sparta-test").deleteRecursively } } object FileOutputIT { def getNewLocalSparkContext(numExecutors: Int = 1, title: String): SparkContext = { val conf = new SparkConf().setMaster(s"local[$numExecutors]").setAppName(title) SparkContext.getOrCreate(conf) } } case class Person(name: String, age: Int, minute: Timestamp) extends Serializable
Example 99
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object GamerSparkSQLExample { def main(args:Array[String]): Unit = { if (args.length == 0) { println("{kudumaster} {runLocal}") return } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val runLocal = args(1).equals("l") println("Loading Spark Context") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } println("Loading Spark Context: Finished") println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") println("Query 1: SELECT count(*) FROM gamer") val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT * FROM gamer limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") val startTimeQ3 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") val startTimeQ4 = System.currentTimeMillis() sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) Vectors.dense(array) }) val dataCount = parsedData.count() if (dataCount > 0) { val clusters = KMeans.train(parsedData, 3, 5) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) } //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } }
Example 100
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
Example 101
Source File: CommandLineClient.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest.minicluster

import org.apache.log4j.Logger

import org.apache.gearpump.cluster.ApplicationStatus
import org.apache.gearpump.integrationtest.Docker

class CommandLineClient(host: String) {

  private val LOG = Logger.getLogger(getClass)

  def listApps(): Array[String] = {
    gearCommand(host, "gear info").split("\n").filter(
      _.startsWith("application: ")
    )
  }

  def listRunningApps(): Array[String] =
    listApps().filter(_.contains(s", status: ${ApplicationStatus.ACTIVE}"))

  def queryApp(appId: Int): String = try {
    listApps().filter(_.startsWith(s"application: $appId")).head
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      ""
  }

  def submitAppAndCaptureOutput(jar: String, executorNum: Int, args: String = ""): String = {
    gearCommand(host, s"gear app -verbose true -jar $jar -executors $executorNum $args")
  }

  def submitApp(jar: String, args: String = ""): Int = {
    LOG.debug(s"|=> Submit Application $jar...")
    submitAppUse("gear app", jar, args)
  }

  private def submitAppUse(launcher: String, jar: String, args: String = ""): Int = try {
    gearCommand(host, s"$launcher -jar $jar $args").split("\n")
      .filter(_.contains("The application id is ")).head.split(" ").last.toInt
  } catch {
    case ex: Throwable =>
      LOG.warn(s"swallowed an exception: $ex")
      -1
  }

  def killApp(appId: Int): Boolean = {
    tryGearCommand(host, s"gear kill -appid $appId")
  }

  private def gearCommand(container: String, command: String): String = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.execute(container, s"/opt/start $command")
  }

  private def tryGearCommand(container: String, command: String): Boolean = {
    LOG.debug(s"|=> Gear command $command in container $container...")
    Docker.executeSilently(container, s"/opt/start $command")
  }
}
Example 102
Source File: Util.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

import org.apache.log4j.Logger

object Util {

  private val LOG = Logger.getLogger(getClass)

  def encodeUriComponent(s: String): String = {
    try {
      java.net.URLEncoder.encode(s, "UTF-8")
        .replaceAll("\\+", "%20")
        .replaceAll("\\%21", "!")
        .replaceAll("\\%27", "'")
        .replaceAll("\\%28", "(")
        .replaceAll("\\%29", ")")
        .replaceAll("\\%7E", "~")
    } catch {
      case ex: Throwable => s
    }
  }

  def retryUntil(
      condition: () => Boolean,
      conditionDescription: String,
      maxTries: Int = 15,
      interval: Duration = 10.seconds): Unit = {
    var met = false
    var tries = 0

    while (!met && tries < maxTries) {
      met = Try(condition()) match {
        case Success(true) => true
        case Success(false) => false
        case Failure(ex) => false
      }
      tries += 1

      if (!met) {
        LOG.error(s"Failed due to (false == $conditionDescription), " +
          s"retrying for the ${tries} times...")
        Thread.sleep(interval.toMillis)
      } else {
        LOG.info(s"Success ($conditionDescription) after ${tries} retries")
      }
    }

    if (!met) {
      throw new Exception(s"Failed after ${tries} retries, ($conditionDescription) == false")
    }
  }
}
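A hypothetical usage of retryUntil (the client and appId names are made up for illustration): poll an external condition until it holds, sleeping between attempts, and fail with an exception once maxTries is exhausted.

import scala.concurrent.duration._

// Wait up to 30 * 5 seconds for the application to report an active status.
Util.retryUntil(
  condition = () => client.queryApp(appId).contains("status: active"),
  conditionDescription = s"application $appId is active",
  maxTries = 30,
  interval = 5.seconds)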
Example 103
Source File: ShellExec.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._
import scala.sys.process._

import org.apache.log4j.Logger
import org.apache.storm.shade.org.eclipse.jetty.util.QuotedStringTokenizer

// Note: the enclosing object declaration (and the LOG and PROCESS_TIMEOUT
// members referenced below) is elided in this listing.

  private def splitQuotedString(str: String): List[String] = {
    val splitter = new QuotedStringTokenizer(str, " \t\n\r")
    splitter.asInstanceOf[java.util.Enumeration[String]].asScala.toList
  }

  def exec(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT): Boolean = {
    LOG.debug(s"$sender => `$command`")

    val p = splitQuotedString(command).run()
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          LOG.error(s"timeout to execute command `$command`")
          p.destroy()
          p.exitValue()
      }
    }
    LOG.debug(s"$sender <= exit $retval")
    retval == 0
  }

  def execAndCaptureOutput(command: String, sender: String, timeout: Duration = PROCESS_TIMEOUT)
    : String = {
    LOG.debug(s"$sender => `$command`")

    val buf = new StringBuilder
    val processLogger = ProcessLogger(
      (o: String) => buf.append(o).append("\n"),
      (e: String) => buf.append(e).append("\n"))
    val p = splitQuotedString(command).run(processLogger)
    val f = Future(blocking(p.exitValue())) // wrap in Future
    val retval = {
      try {
        Await.result(f, timeout)
      } catch {
        case _: TimeoutException =>
          p.destroy()
          p.exitValue()
      }
    }

    val output = buf.toString().trim
    val PREVIEW_MAX_LENGTH = 200
    val preview = if (output.length > PREVIEW_MAX_LENGTH) {
      output.substring(0, PREVIEW_MAX_LENGTH) + "..."
    } else {
      output
    }
    LOG.debug(s"$sender <= `$preview` exit $retval")

    if (retval != 0) {
      throw new RuntimeException(
        s"exited ($retval) by executing `$command`")
    }
    output
  }
}
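Assuming the elided enclosing `object ShellExec`, a usage sketch of the two entry points (the commands are arbitrary examples): exec returns whether the command exited with 0, while execAndCaptureOutput returns the combined output and throws on a non-zero exit.

// Run a command and only check its exit status.
val pulled = ShellExec.exec("docker pull alpine:latest", sender = "it-setup")

// Run a command and capture its output (throws if it exits non-zero).
val kernel = ShellExec.execAndCaptureOutput("uname -r", sender = "it-setup")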
Example 104
Source File: NumericalDataProducer.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.integrationtest.kafka

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.ByteArraySerializer
import org.apache.log4j.Logger

import org.apache.gearpump.streaming.serializer.ChillSerializer

class NumericalDataProducer(topic: String, bootstrapServers: String) {

  private val LOG = Logger.getLogger(getClass)
  private val producer = createProducer
  private val WRITE_SLEEP_NANOS = 10
  private val serializer = new ChillSerializer[Int]
  var lastWriteNum = 0

  def start(): Unit = {
    produceThread.start()
  }

  def stop(): Unit = {
    if (produceThread.isAlive) {
      produceThread.interrupt()
      produceThread.join()
    }
    producer.close()
  }

  def producedNumbers: Range = {
    Range(1, lastWriteNum + 1)
  }

  private def createProducer: KafkaProducer[Array[Byte], Array[Byte]] = {
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", bootstrapServers)
    new KafkaProducer[Array[Byte], Array[Byte]](properties, new ByteArraySerializer,
      new ByteArraySerializer)
  }

  private val produceThread = new Thread(new Runnable {
    override def run(): Unit = {
      try {
        while (!Thread.currentThread.isInterrupted) {
          lastWriteNum += 1
          val msg = serializer.serialize(lastWriteNum)
          val record = new ProducerRecord[Array[Byte], Array[Byte]](topic, msg)
          producer.send(record)
          Thread.sleep(0, WRITE_SLEEP_NANOS)
        }
      } catch {
        case ex: InterruptedException =>
          LOG.error("message producing is stopped by an interrupt")
      }
    }
  })
}
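A usage sketch based on the public API above (the topic name and broker address are placeholders): start the producer, let it publish for a while, stop it, and then use producedNumbers to know which values a downstream test should verify.

val producer = new NumericalDataProducer("gearpump-it-topic", "localhost:9092")
producer.start()
Thread.sleep(5000) // let it publish for a few seconds
producer.stop()
println(s"wrote numbers ${producer.producedNumbers.start} to ${producer.producedNumbers.end - 1}")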
Example 105
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.linalg._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming.MemoryStream import org.scalatest.FunSuite import org.apache.log4j.{Level, Logger} case class TestRow(features: Vector) class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase { override def beforeAll(): Unit = { super.beforeAll() Logger.getLogger("org").setLevel(Level.OFF) } test("streaming model with one center should converge to true center") { import spark.implicits._ val k = 1 val dim = 5 val clusterSpread = 0.1 val seed = 63 // TODO: this test is very flaky. The centers do not converge for some // (most?) random seeds val (batches, trueCenters) = StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed) val inputStream = MemoryStream[TestRow] val ds = inputStream.toDS() val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01) val query = skm.evilTrain(ds.toDF()) val streamingModels = batches.map { batch => inputStream.addData(batch) query.processAllAvailable() skm.getModel } // TODO: use spark's testing suite streamingModels.last.centers.zip(trueCenters).foreach { case (center, trueCenter) => val centers = center.toArray.mkString(",") val trueCenters = trueCenter.toArray.mkString(",") println(s"${centers} | ${trueCenters}") assert(center.toArray.zip(trueCenter.toArray).forall( x => math.abs(x._1 - x._2) < 0.1)) } query.stop() } def compareBatchAndStreaming( batchModel: KMeansModel, streamingModel: StreamingKMeansModel, validationData: DataFrame): Unit = { assert(batchModel.clusterCenters === streamingModel.centers) // TODO: implement prediction comparison } } object StreamingKMeansSuite { def generateBatches( numPoints: Int, numBatches: Int, k: Int, d: Int, r: Double, seed: Int, initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = { val rand = scala.util.Random rand.setSeed(seed) val centers = initCenters match { case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))) case _ => initCenters } val data = (0 until numBatches).map { i => (0 until numPoints).map { idx => val center = centers(idx % k) val vec = Vectors.dense( Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r)) TestRow(vec) } } (data, centers) } }
Example 106
Source File: SuspiciousConnects.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.slf4j.LoggerFactory

import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
import org.opennetworkinsight.dns.DNSSuspiciousConnects
import org.opennetworkinsight.netflow.FlowSuspiciousConnects
import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis

// Note: the enclosing object declaration (and the OutputDelimiter it references)
// is elided in this listing.

  def main(args: Array[String]) {

    val parser = SuspiciousConnectsArgumentParser.parser

    parser.parse(args, SuspiciousConnectsConfig()) match {

      case Some(config) =>
        val logger = LoggerFactory.getLogger(this.getClass)
        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)

        val analysis = config.analysis
        val sparkConfig = new SparkConf().setAppName("ONI ML: " + analysis + " lda")
        val sparkContext = new SparkContext(sparkConfig)
        val sqlContext = new SQLContext(sparkContext)
        implicit val outputDelimiter = OutputDelimiter

        analysis match {
          case "flow" => FlowSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "dns" => DNSSuspiciousConnects.run(config, sparkContext, sqlContext, logger)
          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
          case _ => println("ERROR: unsupported (or misspelled) analysis: " + analysis)
        }

        sparkContext.stop()

      case None => println("Error parsing arguments")
    }

    System.exit(0)
  }
}
Example 107
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 108
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
Example 109
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 110
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 111
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 112
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 113
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 114
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.
    svmModel.clearThreshold()

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
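Since svmPipeline only needs a SparkContext (the TSV path is hard-coded inside the method), a minimal driver might look like the following sketch; the master, app name, and object name are placeholders:

package org.sparksamples.classification.stumbleupon

import org.apache.spark.{SparkConf, SparkContext}

object SVMPipelineRunner {
  def main(args: Array[String]): Unit = {
    // Local master and app name are placeholders.
    val conf = new SparkConf().setMaster("local[2]").setAppName("SVMPipelineRunner")
    val sc = new SparkContext(conf)
    SVMPipeline.svmPipeline(sc)
    sc.stop()
  }
}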
Example 115
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 116
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
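TrainValidationSplit returns the pipeline fitted with the best parameter combination from the grid. A small sketch of how the chosen hyper-parameters could be read back from the fitted model; the object and method names are hypothetical, and the logistic regression stage is assumed to be the last pipeline stage, as in the example above:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.tuning.TrainValidationSplitModel

object TrainValidationSplitInspector {
  // Report the hyper-parameters chosen by TrainValidationSplit.
  def reportBestParams(tvsModel: TrainValidationSplitModel): Unit = {
    val bestPipeline = tvsModel.bestModel.asInstanceOf[PipelineModel]
    val bestLr = bestPipeline.stages.last.asInstanceOf[LogisticRegressionModel]
    println(s"best regParam        = ${bestLr.getRegParam}")
    println(s"best elasticNetParam = ${bestLr.getElasticNetParam}")
    println(s"best fitIntercept    = ${bestLr.getFitIntercept}")
  }
}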
Example 117
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
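Beyond accuracy, a fitted random forest exposes feature importances. A hedged sketch of printing them for the fitted PipelineModel; the helper name is mine, the classifier is assumed to be the last pipeline stage, and featureNames is assumed to follow the VectorAssembler's input column order:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.RandomForestClassificationModel

object RandomForestInspector {
  // Print the ten most important features of the fitted model.
  def printFeatureImportances(model: PipelineModel, featureNames: Array[String]): Unit = {
    val rfModel = model.stages.last.asInstanceOf[RandomForestClassificationModel]
    rfModel.featureImportances.toArray.zip(featureNames)
      .sortBy { case (importance, _) => -importance }
      .take(10)
      .foreach { case (importance, name) => println(f"$name%-30s $importance%.4f") }
  }
}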
Example 118
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 119
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 120
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
Example 121
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 122
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 123
Source File: GeneralizedLinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{SparkSession, _} object GeneralizedLinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new GeneralizedLinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def genLinearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new GeneralizedLinearRegression() .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) // Fit the model val model = lr.fit(training) // Print the coefficients and intercept for generalized linear regression model println(s"Coefficients: ${model.coefficients}") println(s"Intercept: ${model.intercept}") // Summarize the model over the training set and print out some metrics val summary = model.summary println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}") println(s"T Values: ${summary.tValues.mkString(",")}") println(s"P Values: ${summary.pValues.mkString(",")}") println(s"Dispersion: ${summary.dispersion}") println(s"Null Deviance: ${summary.nullDeviance}") println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}") println(s"Deviance: ${summary.deviance}") println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}") println(s"AIC: ${summary.aic}") println("Deviance Residuals: ") summary.residuals().show() } }
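A minimal driver for the LIBSVM variant, assuming a local SparkSession and the lsvmHours.txt path referenced inside the method; the object name is hypothetical:

package org.sparksamples.regression.bikesharing

import org.apache.spark.sql.SparkSession

object GeneralizedLinearRegressionRunner {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("GeneralizedLinearRegressionRunner")
      .getOrCreate()
    // Reads the LIBSVM path hard-coded inside the method.
    GeneralizedLinearRegressionPipeline.genLinearRegressionWithSVMFormat(spark)
    spark.stop()
  }
}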
Example 124
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, SparkSession} object LinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new LinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setRegParam(0.1) .setElasticNetParam(1.0) .setMaxIter(10) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def linearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new LinearRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) // Fit the model val lrModel = lr.fit(training) // Print the coefficients and intercept for linear regression println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // Summarize the model over the training set and print out some metrics val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") } }
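The vector-format entry point expects a VectorAssembler and a VectorIndexer whose final output column is "features". A sketch of that wiring, assuming a hypothetical CSV of numeric bike-sharing features with a "label" column; the path, object name, and maxCategories value are illustrative:

package org.sparksamples.regression.bikesharing

import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.sql.SparkSession

object LinearRegressionRunner {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("LinearRegressionRunner")
      .getOrCreate()

    // Placeholder input: numeric feature columns plus a "label" column.
    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("/path/to/bike_sharing_features.csv")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns.filter(_ != "label"))
      .setOutputCol("rawFeatures")
    val indexer = new VectorIndexer()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
      .setMaxCategories(24)

    LinearRegressionPipeline.linearRegressionWithVectorFormat(assembler, indexer, df)
    spark.stop()
  }
}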
Example 125
Source File: SparseNaiveBayes.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
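The same flow can be condensed without the scopt parser. A sketch assuming a local master and a placeholder LIBSVM input path; the object name is hypothetical:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayesSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("SparseNaiveBayesSketch"))

    // Placeholder path to a LIBSVM-formatted file.
    val data = MLUtils.loadLibSVMFile(sc, "/path/to/sample_libsvm_data.txt").cache()
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 11L)

    val model = NaiveBayes.train(training, lambda = 1.0)
    val accuracy = test.map(p => if (model.predict(p.features) == p.label) 1.0 else 0.0).mean()
    println(s"Test accuracy = $accuracy")
    sc.stop()
  }
}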
Example 126
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
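Once trained, the KMeansModel can score new points and expose its centers. A small, hypothetical helper (names and the toy three-dimensional point are mine) for inspecting the model returned by run above:

import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

object KMeansModelInspector {
  def describe(model: KMeansModel): Unit = {
    // Assign a toy three-dimensional point to its nearest cluster.
    val cluster = model.predict(Vectors.dense(1.0, 2.0, 3.0))
    println(s"Toy point assigned to cluster $cluster")

    model.clusterCenters.zipWithIndex.foreach { case (center, i) =>
      println(s"Cluster $i center: $center")
    }
  }
}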
Example 127
Source File: StreamingExamples.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
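The same idea works outside Spark's Logging trait with plain log4j calls. A minimal sketch that quietens a few noisy packages; the object name and package names are illustrative:

import org.apache.log4j.{Level, Logger}

object QuietLogs {
  def quieten(): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    Seq("org.apache.spark", "org.eclipse.jetty", "akka").foreach { name =>
      Logger.getLogger(name).setLevel(Level.WARN)
    }
  }
}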
Example 128
Source File: YarnScheduler.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.util.RackResolver
import org.apache.log4j.{Level, Logger}

import org.apache.spark._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.Utils

private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
  if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
    Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
  }

  // By default, rack is unknown
  override def getRackForHost(hostPort: String): Option[String] = {
    val host = Utils.parseHostPort(hostPort)._1
    Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
  }
}
Example 129
Source File: BeforeAndAfterWithContext.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import eleflow.uberdata.core.IUberdataContext import eleflow.uberdata.core.util.ClusterSettings import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkEnv} import org.scalatest.{BeforeAndAfterEach, Suite} object TestSparkConf { def conf = { val sconf = new SparkConf() sconf.set("spark.app.name", "teste") sconf } val separator = "," } trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite => val defaultFilePath = "src/test/resources/" import TestSparkConf._ ClusterSettings.master = Some("local[*]") conf.set("spark.driver.allowMultipleContexts", "true") @transient val context = IUberdataContext.getUC(conf) override def beforeEach() = { setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) } def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { loggers.map { loggerName => val logger = Logger.getLogger(loggerName) val prevLevel = logger.getLevel logger.setLevel(level) loggerName -> prevLevel }.toMap } override def afterEach() = { val get = SparkEnv.get val rpcEnv = if (get != null) { Some(get.rpcEnv) } else None context.clearContext() //rpcEnv.foreach( // _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown()) System.clearProperty("spark.master.port") } }
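The setLogLevels helper above changes logger levels but never restores them. A sketch of the same idea as a save-and-restore wrapper; the object name and method shape are mine, not part of the project:

import org.apache.log4j.{Level, Logger}

object LogLevelUtil {
  // Run `body` with the given loggers temporarily set to `level`, then restore
  // whatever level (possibly null, i.e. inherited) each logger had before.
  def withLogLevel[T](level: Level, loggerNames: Seq[String])(body: => T): T = {
    val saved = loggerNames.map { name =>
      val logger = Logger.getLogger(name)
      val previous = logger.getLevel
      logger.setLevel(level)
      name -> previous
    }
    try body
    finally saved.foreach { case (name, previous) => Logger.getLogger(name).setLevel(previous) }
  }
}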
Example 130
Source File: MLLibSuite.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} class MLLibSuite extends FunSuite with BeforeAndAfterAll { private var sparkSession: SparkSession = _ var savedLevels: Map[String, Level] = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate() // Travis limits the size of the log file produced by a build. Because we do run a small // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the // log level to ERROR, just for this suite, to avoid displeasing travis. savedLevels = Seq("akka", "org", "com.databricks").map { name => val logger = Logger.getLogger(name) val curLevel = logger.getLevel logger.setLevel(Level.ERROR) name -> curLevel }.toMap } override def afterAll(): Unit = { savedLevels.foreach { case (name, level) => Logger.getLogger(name).setLevel(level) } try { if (sparkSession != null) { sparkSession.stop() } // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") sparkSession = null } finally { super.afterAll() } } test("test MlLib benchmarks with mllib-small.yaml.") { val results = MLLib.run(yamlConfig = MLLib.smallConfig) val failures = results.na.drop(Seq("failure")) if (failures.count() > 0) { failures.select("name", "failure.*").collect().foreach { case Row(name: String, error: String, message: String) => println( s"""There as a failure in the benchmark for $name: | $error ${message.replace("\n", "\n ")} """.stripMargin) } fail("Unable to run all benchmarks successfully, see console output for more info.") } } test("test before benchmark methods for pipeline benchmarks.") { val benchmarks = MLLib.getBenchmarks(MLLib.getConf(yamlConfig = MLLib.smallConfig)) benchmarks.foreach { b => b.beforeBenchmark() } } }
Example 131
Source File: DeleteFromTableEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events

import scala.collection.JavaConverters._

import org.apache.log4j.Logger
import org.apache.spark.internal.Logging
import org.apache.spark.sql.CarbonEnv
import org.apache.spark.sql.hive.CarbonRelation
import org.apache.spark.sql.index.CarbonIndexUtil
import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore
import org.apache.spark.sql.secondaryindex.util.SecondaryIndexUtil

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.events.{DeleteFromTablePostEvent, DeleteFromTablePreEvent, Event, OperationContext, OperationEventListener}

class DeleteFromTableEventListener extends OperationEventListener with Logging {

  val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  override def onEvent(event: Event, operationContext: OperationContext): Unit = {
    event match {
      case deleteFromTablePreEvent: DeleteFromTablePreEvent =>
        LOGGER.info("Delete from table pre event listener called")
        val carbonTable = deleteFromTablePreEvent.carbonTable
        // Should not allow delete on index table
        if (carbonTable.isIndexTable) {
          sys.error(s"Delete is not permitted on Index Table " +
            s"[${carbonTable.getDatabaseName}.${carbonTable.getTableName}]")
        }
      case deleteFromTablePostEvent: DeleteFromTablePostEvent =>
        LOGGER.info("Delete from table post event listener called")
        val parentCarbonTable = deleteFromTablePostEvent.carbonTable
        val sparkSession = deleteFromTablePostEvent.sparkSession
        CarbonInternalMetastore.refreshIndexInfo(parentCarbonTable.getDatabaseName,
          parentCarbonTable.getTableName,
          parentCarbonTable)(sparkSession)
        val indexTableList = CarbonIndexUtil.getSecondaryIndexes(parentCarbonTable)
        if (!indexTableList.isEmpty) {
          val indexCarbonTableList = indexTableList.asScala.map { indexTableName =>
            CarbonEnv.getInstance(sparkSession).carbonMetaStore
              .lookupRelation(Option(parentCarbonTable.getDatabaseName), indexTableName)(
                sparkSession)
              .asInstanceOf[CarbonRelation].carbonTable
          }.toList
          SecondaryIndexUtil
            .updateTableStatusForIndexTables(parentCarbonTable, indexCarbonTableList.asJava)
        }
    }
  }
}
Example 132
Source File: AlterTableDropColumnEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.execution.command.AlterTableDropColumnModel import org.apache.spark.sql.execution.command.index.DropIndexCommand import org.apache.spark.sql.hive.CarbonRelation import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.events.{AlterTableDropColumnPreEvent, Event, OperationContext, OperationEventListener} class AlterTableDropColumnEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case alterTableDropColumnPreEvent: AlterTableDropColumnPreEvent => LOGGER.info("alter table drop column event listener called") val carbonTable = alterTableDropColumnPreEvent.carbonTable val dbName = carbonTable.getDatabaseName val tableName = carbonTable.getTableName val tablePath = carbonTable.getTablePath val sparkSession = alterTableDropColumnPreEvent.sparkSession val alterTableDropColumnModel = alterTableDropColumnPreEvent.alterTableDropColumnModel dropApplicableSITables(dbName, tableName, tablePath, alterTableDropColumnModel)(sparkSession) } } private def dropApplicableSITables(dbName: String, tableName: String, tablePath: String, alterTableDropColumnModel: AlterTableDropColumnModel) (sparkSession: SparkSession) { var indexTableToDrop: Seq[String] = Seq.empty val catalog = CarbonEnv.getInstance(sparkSession).carbonMetaStore val parentCarbonTable = catalog.lookupRelation(Some(dbName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable val secondaryIndexMap = parentCarbonTable.getIndexesMap.get(IndexType.SI.getIndexProviderName) if (null == secondaryIndexMap) { // if secondary index map is empty, return return } secondaryIndexMap.asScala.foreach(indexTable => { val indexColumns = indexTable._2.asScala(CarbonCommonConstants.INDEX_COLUMNS).split(",") val colSize = alterTableDropColumnModel.columns.intersect(indexColumns).size if (colSize > 0) { if (colSize == indexColumns.size) { indexTableToDrop ++= Seq(indexTable._1) } else { sys .error(s"Index Table [${ indexTable._1 }] exists with combination of provided drop column(s) and other columns, drop " + s"index table & retry") } } }) indexTableToDrop.foreach { indexTable => DropIndexCommand(ifExistsSet = true, Some(dbName), tableName, indexTable.toLowerCase, needLock = false).run(sparkSession) } } }
Example 133
Source File: DeleteSegmentByDateListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.api.CarbonStore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{DeleteSegmentByDatePostEvent, Event, OperationContext, OperationEventListener} class DeleteSegmentByDateListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case deleteSegmentPostEvent: DeleteSegmentByDatePostEvent => LOGGER.info("Delete segment By date post event listener called") val carbonTable = deleteSegmentPostEvent.carbonTable val loadDates = deleteSegmentPostEvent.loadDates val sparkSession = deleteSegmentPostEvent.sparkSession CarbonIndexUtil.getSecondaryIndexes(carbonTable).asScala.foreach { tableName => val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table = metastore .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable CarbonStore .deleteLoadByDate(loadDates, carbonTable.getDatabaseName, table.getTableName, table) } } } }
Example 134
Source File: DropCacheSIEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.cache.CarbonDropCacheCommand import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener} object DropCacheSIEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override protected def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case dropCacheEvent: DropTableCacheEvent => val carbonTable = dropCacheEvent.carbonTable val sparkSession = dropCacheEvent.sparkSession val internalCall = dropCacheEvent.internalCall if (carbonTable.isIndexTable && !internalCall) { throw new UnsupportedOperationException("Operation not allowed on child table.") } val allIndexTables = carbonTable.getIndexTableNames( IndexType.SI.getIndexProviderName) val dbName = carbonTable.getDatabaseName for (indexTableName <- allIndexTables.asScala) { try { val dropCacheCommandForChildTable = CarbonDropCacheCommand( TableIdentifier(indexTableName, Some(dbName)), internalCall = true) dropCacheCommandForChildTable.processMetadata(sparkSession) } catch { case e: Exception => LOGGER.error(s"Clean cache for SI table $indexTableName failed. ", e) } } } } }
Example 135
Source File: ShowCacheSIEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.CarbonEnv import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, ShowTableCacheEvent} object ShowCacheSIEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case showTableCacheEvent: ShowTableCacheEvent => val carbonTable = showTableCacheEvent.carbonTable val sparkSession = showTableCacheEvent.sparkSession val internalCall = showTableCacheEvent.internalCall if (carbonTable.isIndexTable && !internalCall) { throw new UnsupportedOperationException("Operation not allowed on index table.") } val childTables = operationContext.getProperty(carbonTable.getTableUniqueName) .asInstanceOf[List[(String, String)]] val indexTables = carbonTable.getIndexTableNames( IndexType.SI.getIndexProviderName).asScala // if there are no index tables for a given fact table do not perform any action operationContext.setProperty(carbonTable.getTableUniqueName, indexTables.map { indexTable => val indexCarbonTable = CarbonEnv.getCarbonTable(Some(carbonTable.getDatabaseName), indexTable)(sparkSession) (carbonTable.getDatabaseName + "-" + indexTable, "Secondary Index", indexCarbonTable.getTableId) }.toList ++ childTables) } } }
Example 136
Source File: UpdateTablePreEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{Event, OperationContext, OperationEventListener, UpdateTablePreEvent} class UpdateTablePreEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case updateTablePreEvent: UpdateTablePreEvent => LOGGER.info("Update table pre event listener called") val carbonTable = updateTablePreEvent.carbonTable // Should not allow update on index table if (carbonTable.isIndexTable) { sys .error(s"Update is not permitted on Index Table [${ carbonTable .getDatabaseName }.${ carbonTable.getTableName }]") } else if (!carbonTable.getIndexesMap.isEmpty) { sys .error(s"Update is not permitted on table that contains secondary index [${ carbonTable .getDatabaseName }.${ carbonTable.getTableName }]. Drop all indexes and retry") } } } }
Example 137
Source File: AlterTableRenameEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive._ import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.events.{AlterTableRenamePostEvent, Event, OperationContext, OperationEventListener} class AlterTableRenameEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case alterTableRenamePreEvent: AlterTableRenamePostEvent => LOGGER.info("alter table rename Pre event listener called") val alterTableRenameModel = alterTableRenamePreEvent.alterTableRenameModel val carbonTable = alterTableRenamePreEvent.carbonTable val sparkSession = alterTableRenamePreEvent.sparkSession val oldDatabaseName = carbonTable.getDatabaseName val newTableName = alterTableRenameModel.newTableIdentifier.table val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table: CarbonTable = metastore .lookupRelation(Some(oldDatabaseName), newTableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable table.getIndexTableNames(IndexType.SI.getIndexProviderName) .asScala.map { entry => CarbonSessionCatalogUtil.getClient(sparkSession).runSqlHive( s"ALTER TABLE $oldDatabaseName.${ entry } " + s"SET SERDEPROPERTIES ('parentTableName'='$newTableName')") } } } }
Example 138
Source File: CleanFilesPostEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.optimizer.CarbonFilters import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.mutate.CarbonUpdateUtil import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.events.{CleanFilesPostEvent, Event, OperationContext, OperationEventListener} class CleanFilesPostEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case cleanFilesPostEvent: CleanFilesPostEvent => LOGGER.info("Clean files post event listener called") val carbonTable = cleanFilesPostEvent.carbonTable val indexTables = CarbonIndexUtil .getIndexCarbonTables(carbonTable, cleanFilesPostEvent.sparkSession) indexTables.foreach { indexTable => val partitions: Option[Seq[PartitionSpec]] = CarbonFilters.getPartitions( Seq.empty[Expression], cleanFilesPostEvent.sparkSession, indexTable) SegmentStatusManager.deleteLoadsAndUpdateMetadata( indexTable, true, partitions.map(_.asJava).orNull) CarbonUpdateUtil.cleanUpDeltaFiles(indexTable, true) } } } }
Example 139
Source File: DeleteSegmentByIdListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.carbondata.api.CarbonStore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.util.path.CarbonTablePath import org.apache.carbondata.events.{DeleteSegmentByIdPostEvent, Event, OperationContext, OperationEventListener} class DeleteSegmentByIdListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case deleteSegmentPostEvent: DeleteSegmentByIdPostEvent => LOGGER.info("Delete segment By id post event listener called") val carbonTable = deleteSegmentPostEvent.carbonTable val loadIds = deleteSegmentPostEvent.loadIds val sparkSession = deleteSegmentPostEvent.sparkSession val siIndexesMap = carbonTable.getIndexesMap .get(IndexType.SI.getIndexProviderName) if (null != siIndexesMap) { siIndexesMap.keySet().asScala.foreach { tableName => val metastore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val table = metastore .lookupRelation(Some(carbonTable.getDatabaseName), tableName)(sparkSession) .asInstanceOf[CarbonRelation].carbonTable val tableStatusFilePath = CarbonTablePath.getTableStatusFilePath(table.getTablePath) // this check is added to verify if the table status file for the index table exists // or not. Delete on index tables is only to be called if the table status file exists. if (FileFactory.isFileExist(tableStatusFilePath)) { CarbonStore .deleteLoadById(loadIds, carbonTable.getDatabaseName, table.getTableName, table) } } } } } }
Example 140
Source File: CreateCarbonRelationEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{CreateCarbonRelationPostEvent, Event, OperationContext, OperationEventListener} class CreateCarbonRelationEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case createCarbonRelationPostEvent: CreateCarbonRelationPostEvent => LOGGER.debug("Create carbon relation post event listener called") val carbonTable = createCarbonRelationPostEvent.carbonTable val databaseName = createCarbonRelationPostEvent.carbonTable.getDatabaseName val tableName = createCarbonRelationPostEvent.carbonTable.getTableName val sparkSession = createCarbonRelationPostEvent.sparkSession CarbonInternalMetastore .refreshIndexInfo(databaseName, tableName, carbonTable, createCarbonRelationPostEvent.needLock)(sparkSession) } } }
Example 141
Source File: SILoadEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.hive.CarbonRelation import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.secondaryindex.command.IndexModel import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.indextable.IndexMetadata import org.apache.carbondata.events._ import org.apache.carbondata.processing.loading.events.LoadEvents.LoadTablePreStatusUpdateEvent class SILoadEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case _: LoadTablePreStatusUpdateEvent => if (operationContext.getProperty("isAddLoad") != null && operationContext.getProperty("isAddLoad").toString.toBoolean) { return } LOGGER.info("Load pre status update event-listener called") val loadTablePreStatusUpdateEvent = event.asInstanceOf[LoadTablePreStatusUpdateEvent] val carbonLoadModel = loadTablePreStatusUpdateEvent.getCarbonLoadModel val sparkSession = SparkSession.getActiveSession.get // when Si creation and load to main table are parallel, get the carbonTable from the // metastore which will have the latest index Info val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val carbonTable = metaStore .lookupRelation(Some(carbonLoadModel.getDatabaseName), carbonLoadModel.getTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable val indexMetadata = carbonTable.getIndexMetadata val secondaryIndexProvider = IndexType.SI.getIndexProviderName if (null != indexMetadata && null != indexMetadata.getIndexesMap && null != indexMetadata.getIndexesMap.get(secondaryIndexProvider)) { val indexTables = indexMetadata.getIndexesMap .get(secondaryIndexProvider).keySet().asScala // if there are no index tables for a given fact table do not perform any action if (indexTables.nonEmpty) { indexTables.foreach { indexTableName => val secondaryIndex = IndexModel(Some(carbonTable.getDatabaseName), indexMetadata.getParentTableName, indexMetadata .getIndexColumns(secondaryIndexProvider, indexTableName).split(",").toList, indexTableName) val metaStore = CarbonEnv.getInstance(sparkSession).carbonMetaStore val indexTable = metaStore .lookupRelation(Some(carbonLoadModel.getDatabaseName), indexTableName)(sparkSession).asInstanceOf[CarbonRelation].carbonTable CarbonIndexUtil .LoadToSITable(sparkSession, carbonLoadModel, indexTableName, isLoadToFailedSISegments = false, secondaryIndex, carbonTable, indexTable) } } else { logInfo(s"No index tables found for table: ${carbonTable.getTableName}") } } else { logInfo(s"Index information is null for table: ${carbonTable.getTableName}") } } } }
Example 142
Source File: SIRefreshEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.secondaryindex.hive.CarbonInternalMetastore import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.events.{Event, LookupRelationPostEvent, OperationContext, OperationEventListener} class SIRefreshEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case lookupRelationPostEvent: LookupRelationPostEvent => LOGGER.debug("SI Refresh post event listener called") val carbonTable = lookupRelationPostEvent.carbonTable val databaseName = lookupRelationPostEvent.carbonTable.getDatabaseName val tableName = lookupRelationPostEvent.carbonTable.getTableName val sparkSession = lookupRelationPostEvent.sparkSession CarbonInternalMetastore.refreshIndexInfo(databaseName, tableName, carbonTable)(sparkSession) } } }
Example 143
Source File: CarbonDropMVCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.view import org.apache.log4j.Logger import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.AtomicRunnableCommand import org.apache.spark.sql.execution.command.table.CarbonDropTableCommand import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.events.{OperationContext, OperationListenerBus} import org.apache.carbondata.view.{MVCatalogInSpark, MVManagerInSpark, UpdateMVPostExecutionEvent, UpdateMVPreExecutionEvent} case class CarbonDropMVCommand( databaseNameOption: Option[String], name: String, ifExistsSet: Boolean, forceDrop: Boolean = false) extends AtomicRunnableCommand { private val logger = CarbonDropMVCommand.LOGGER private var dropTableCommand: CarbonDropTableCommand = _ override def processMetadata(session: SparkSession): Seq[Row] = { setAuditInfo(Map("mvName" -> name)) val viewManager = MVManagerInSpark.get(session) try { logger.info("Trying to drop materialized view schema") val databaseName = databaseNameOption.getOrElse(session.sessionState.catalog.getCurrentDatabase) val schema = viewManager.getSchema(databaseName, name) if (schema != null) { // Drop mv status. val databaseLocation = viewManager.getDatabaseLocation(databaseName) val systemDirectoryPath = CarbonProperties.getInstance() .getSystemFolderLocationPerDatabase(FileFactory .getCarbonFile(databaseLocation) .getCanonicalPath) val identifier = TableIdentifier(name, Option(databaseName)) val operationContext = new OperationContext() OperationListenerBus.getInstance().fireEvent( UpdateMVPreExecutionEvent(session, systemDirectoryPath, identifier), operationContext) viewManager.onDrop(databaseName, name) OperationListenerBus.getInstance().fireEvent( UpdateMVPostExecutionEvent(session, systemDirectoryPath, identifier), operationContext) // Drop mv table. val dropTableCommand = CarbonDropTableCommand( ifExistsSet = true, Option(databaseName), name, dropChildTable = true, isInternalCall = true) dropTableCommand.processMetadata(session) // Drop mv schema. try { viewManager.deleteSchema(databaseName, name) } finally { val viewCatalog = viewManager.getCatalog() .asInstanceOf[MVCatalogInSpark] if (viewCatalog != null) { viewCatalog.deregisterSchema(schema.getIdentifier) } } this.dropTableCommand = dropTableCommand } } catch { case exception: Exception => if (!ifExistsSet) { throw exception } } Seq.empty } override def processData(sparkSession: SparkSession): Seq[Row] = { // delete the table folder if (this.dropTableCommand != null) { this.dropTableCommand.processData(sparkSession) } Seq.empty } override protected def opName: String = "DROP MATERIALIZED VIEW" } object CarbonDropMVCommand { private val LOGGER: Logger = LogServiceFactory.getLogService( classOf[CarbonDropMVCommand].getCanonicalName) }
Example 144
Source File: CacheUtil.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.cache import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.util.SparkSQLUtil import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.index.Segment import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, IndexSchema} import org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.index.bloom.{BloomCacheKeyValue, BloomCoarseGrainIndexFactory} import org.apache.carbondata.indexserver.IndexServer import org.apache.carbondata.processing.merger.CarbonDataMergerUtil object CacheUtil { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) def getAllIndexFiles(carbonTable: CarbonTable)(sparkSession: SparkSession): List[String] = { if (carbonTable.isTransactionalTable) { val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier val validAndInvalidSegmentsInfo = new SegmentStatusManager(absoluteTableIdentifier) .getValidAndInvalidSegments(carbonTable.isMV) // Fire a job to clear the invalid segments cached in the executors. if (CarbonProperties.getInstance().isDistributedPruningEnabled(carbonTable.getDatabaseName, carbonTable.getTableName)) { val invalidSegmentIds = validAndInvalidSegmentsInfo.getInvalidSegments.asScala .map(_.getSegmentNo).toArray try { IndexServer.getClient .invalidateSegmentCache(carbonTable, invalidSegmentIds, SparkSQLUtil.getTaskGroupId(sparkSession)) } catch { case e: Exception => LOGGER.warn("Failed to clear cache from executors. ", e) } } validAndInvalidSegmentsInfo.getValidSegments.asScala.flatMap { segment => segment.getCommittedIndexFile.keySet().asScala }.map { indexFile => indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR, CarbonCommonConstants.FILE_SEPARATOR) }.toList } else { val tablePath = carbonTable.getTablePath val readCommittedScope = new LatestFilesReadCommittedScope(tablePath, FileFactory.getConfiguration) readCommittedScope.getSegmentList.flatMap { load => val seg = new Segment(load.getLoadName, null, readCommittedScope) seg.getCommittedIndexFile.keySet().asScala }.map { indexFile => indexFile.replace(CarbonCommonConstants.WINDOWS_FILE_SEPARATOR, CarbonCommonConstants.FILE_SEPARATOR) }.toList } } def getBloomCacheKeys(carbonTable: CarbonTable, indexSchema: IndexSchema): List[String] = { val segments = CarbonDataMergerUtil.getValidSegmentList(carbonTable).asScala // Generate shard Path for the indexSchema val shardPaths = segments.flatMap { segment => BloomCoarseGrainIndexFactory.getAllShardPaths(carbonTable.getTablePath, segment.getSegmentNo, indexSchema.getIndexName).asScala } // get index columns val indexColumns = carbonTable.getIndexedColumns(indexSchema.getIndexColumns).asScala.map { entry => entry.getColName } // generate cache key using shard path and index columns on which bloom was created. val indexKeys = shardPaths.flatMap { shardPath => indexColumns.map { indexCol => new BloomCacheKeyValue.CacheKey(shardPath, indexCol).toString } } indexKeys.toList } }
Example 145
Source File: DropCacheEventListeners.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.listeners import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.log4j.Logger import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.command.cache.{CacheUtil, CarbonDropCacheCommand} import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.metadata.index.IndexType import org.apache.carbondata.core.metadata.schema.table.IndexSchema import org.apache.carbondata.core.view.MVSchema import org.apache.carbondata.events.{DropTableCacheEvent, Event, OperationContext, OperationEventListener} import org.apache.carbondata.view.MVManagerInSpark object DropCacheMVEventListener extends OperationEventListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override protected def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case dropCacheEvent: DropTableCacheEvent => val carbonTable = dropCacheEvent.carbonTable val cache = CacheProvider.getInstance().getCarbonCache val indexProviderMap = carbonTable.getIndexesMap val bloomIndexProvider = IndexType.BLOOMFILTER.getIndexProviderName if (!indexProviderMap.isEmpty && null != indexProviderMap.get(bloomIndexProvider)) { val bloomIndexes = indexProviderMap.get(bloomIndexProvider) val bloomIndexIterator = bloomIndexes.entrySet().iterator() while (bloomIndexIterator.hasNext) { val bloomIndexEntry = bloomIndexIterator.next() val index = new IndexSchema(bloomIndexEntry.getKey, bloomIndexProvider) index.setProperties(bloomIndexEntry.getValue) try { // Get index keys val indexKeys = CacheUtil.getBloomCacheKeys(carbonTable, index) // remove index keys from cache cache.removeAll(indexKeys.asJava) } catch { case e: Exception => LOGGER.warn( s"Clean cache for Bloom index ${ index.getIndexName } failed.", e) } } } } } }
Example 146
Source File: CompactionTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import java.util import org.apache.log4j.Logger import org.apache.spark.TaskContext import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonCompactionTaskCompletionListener import org.apache.spark.sql.execution.command.management.CommonLoadUtils import org.apache.spark.util.CollectionAccumulator import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.scan.result.iterator.RawResultIterator import org.apache.carbondata.core.segmentmeta.SegmentMetaDataInfo import org.apache.carbondata.processing.loading.TableProcessingOperations import org.apache.carbondata.processing.loading.model.CarbonLoadModel import org.apache.carbondata.processing.merger.{AbstractResultProcessor, CarbonCompactionExecutor, CarbonCompactionUtil} class CompactionTaskCompletionListener( carbonLoadModel: CarbonLoadModel, exec: CarbonCompactionExecutor, processor: AbstractResultProcessor, rawResultIteratorMap: util.Map[String, util.List[RawResultIterator]], segmentMetaDataAccumulator: CollectionAccumulator[Map[String, SegmentMetaDataInfo]], queryStartTime: Long) extends CarbonCompactionTaskCompletionListener { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getName) override def onTaskCompletion(context: TaskContext): Unit = { deleteLocalDataFolders() // close all the query executor service and clean up memory acquired during query processing if (null != exec) { LOGGER.info("Cleaning up query resources acquired during compaction") exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.UNSORTED_IDX), queryStartTime) exec.close(rawResultIteratorMap.get(CarbonCompactionUtil.SORTED_IDX), queryStartTime) } // clean up the resources for processor if (null != processor) { LOGGER.info("Closing compaction processor instance to clean up loading resources") processor.close() } // fill segment metadata to accumulator CommonLoadUtils.fillSegmentMetaDataInfoToAccumulator(carbonLoadModel.getTableName, carbonLoadModel.getSegmentId, segmentMetaDataAccumulator) } private def deleteLocalDataFolders(): Unit = { try { LOGGER.info("Deleting local folder store location") val isCompactionFlow = true TableProcessingOperations .deleteLocalDataLoadFolderLocation(carbonLoadModel, isCompactionFlow, false) } catch { case e: Exception => LOGGER.error(e) } } }
Example 147
Source File: QueryExecutorWithLogging.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark.utils import java.io.{File, FileOutputStream, PrintWriter} import java.util.Calendar import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.{ConnectDriver, EngineConnection, QueryResult} import net.jcazevedo.moultingyaml._ import net.jcazevedo.moultingyaml.DefaultYamlProtocol import net.jcazevedo.moultingyaml.DefaultYamlProtocol._ import org.apache.log4j.Logger import pl.edu.pw.ii.zsibio.dwh.benchmark.dao.ConnectDriver.Value import pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryType.QueryType case class Query(queryId:String, queryType:String, queryEngine:String, storageFormat:String,queryDesc:String, statement:String) object QueryType extends Enumeration { type QueryType = Value val SELECT, CREATE, UPDATE = Value } object QueryExecutorWithLogging { val log = Logger.getLogger("pl.edu.pw.ii.zsibio.dwh.benchmark.utils.QueryExecutorWithLogging") object QueryYamlProtocol extends DefaultYamlProtocol { implicit val queryFormat = yamlFormat6(Query) } def runStatement(query: Query, conn:EngineConnection, logFile:String, dryRun: Boolean) = { log.info(s"Running ${query.queryId} ... using ${query.queryEngine} engine") log.debug(s"Executing query: ${query.statement}") query.queryType.toLowerCase() match { case "select" => logQuery(conn, query, logFile, dryRun) case _ => conn.executeUpdate(query.statement.toLowerCase) } } def parseQueryYAML(file:String,storageType:String,connString:String, kuduMaster:String, dbName:String, ifExplain:Boolean = false) : Query ={ log.info(s"Parsing ${file}") val lines = scala.io.Source.fromFile(file).mkString val yml = lines.stripMargin.parseYaml import QueryYamlProtocol._ queryPreprocess(yml.convertTo[Query], storageType, connString, kuduMaster, dbName, ifExplain) } private def logQuery(conn:EngineConnection, query: Query, logFile:String, dryRun:Boolean) ={ val rs = conn.executeQuery(query.statement.toLowerCase,true) //rs.rs.next() val result = s"${Calendar.getInstance().getTime().toString},${query.queryId}," + s"${query.queryEngine},${query.storageFormat},${rs.timing.get.getTiming()},${dryRun.toString}\n" log.info(s"Result: ${result}") val writer = new PrintWriter(new FileOutputStream(new File(logFile),true)) writer.write(result) writer.flush() writer.close() } private def queryPreprocess(query: Query, storageType: String, connString: String, kuduMaster: String, dbName: String, ifExplain: Boolean) = { def replaceVars(property:String) ={ property .replaceAll("\\{\\{DATA_FORMAT\\}\\}",storageType.toLowerCase) .replaceAll("\\{\\{DB_NAME\\}\\}",dbName.toLowerCase) .replaceAll("\\{\\{KUDU_MASTER\\}\\}",kuduMaster ) .replaceAll("\\{\\{IF_EXPLAIN\\}\\}", if(ifExplain) "EXPLAIN " else "") .replaceAll("\\{\\{PERCENTILE_APPROX\\}\\}", if(query.queryEngine.toLowerCase=="presto") "approx_percentile" else "percentile_approx") } query.copy( queryId = replaceVars(query.queryId), queryDesc = replaceVars(query.queryDesc), storageFormat = replaceVars(query.storageFormat), statement = replaceVars(query.statement.replaceAll(",",",\n").replaceAll("\\(","\\( ")) ) } }
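The logQuery helper above appends one comma-separated timing line per query to a results file, in addition to the normal log4j output. A stripped-down sketch of that append-and-flush pattern (file name and fields are illustrative only, not the benchmark's exact format):

import java.io.{File, FileOutputStream, PrintWriter}
import java.util.Calendar
import org.apache.log4j.Logger

object TimingLogSketch {
  val log = Logger.getLogger(getClass.getName)

  def appendTiming(logFile: String, queryId: String, millis: Long): Unit = {
    val line = s"${Calendar.getInstance().getTime},$queryId,$millis\n"
    log.info(s"Result: $line")
    // Open in append mode so earlier results are preserved.
    val writer = new PrintWriter(new FileOutputStream(new File(logFile), true))
    try writer.write(line) finally { writer.flush(); writer.close() }
  }
}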
Example 148
Source File: SomeSQLOnTitanic.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.machinelearning.titanic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SomeSQLOnTitanic { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main (args:Array[String]): Unit = { val testFile = args(0) val trainFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } import sparkSession.implicits._ //Load Data val trainDs = sparkSession.read.option("header", "true") .option("charset", "UTF8") .option("delimiter",",") .csv(trainFile) trainDs.createOrReplaceTempView("train") println("Sex -> Servived") sparkSession.sql("select Sex, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Sex").collect().foreach(println) println("Cabin -> Servived") sparkSession.sql("select substring(Cabin,1,1), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("Age -> Servived") sparkSession.sql("select round(cast(Age as Int) / 10) as age_block, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("PClass -> Servived") sparkSession.sql("select pclass, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by pclass order by 1").collect().foreach(println) println("Embarked -> Servived") sparkSession.sql("select Embarked, sum(Survived), count(*), (sum(Survived)/count(*)) from train group by Embarked order by 1").collect().foreach(println) println("Fare -> Servived") sparkSession.sql("select round((Fare / 10)), sum(Survived), count(*), (sum(Survived)/count(*)) from train group by 1 order by 1").collect().foreach(println) println("Servived -> Servived") sparkSession.sql("select sum(Survived), count(*) from train order by 1").collect().foreach(println) sparkSession.stop() } }
Example 149
Source File: ManyToManyNormalJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNormalJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[((Long, Long), Int)] actions.foreach(a => { val aValue = a.getAs[Long]("action") actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { resultList.+=(((aValue, bValue), 1)) } }) }) resultList.toSeq }).reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } }
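The flatMap above turns every record's action list into unordered pairs (keeping only a < b so each pair is emitted once) and then counts pairs with reduceByKey. The pair-generation and counting steps can be checked locally with plain Scala collections; the sketch below uses a toy list of action ids rather than the JSON rows from the example:

object PairCountSketch {
  def main(args: Array[String]): Unit = {
    val records: Seq[Seq[Long]] = Seq(Seq(1L, 2L, 3L), Seq(2L, 3L), Seq(1L, 3L))

    val pairCounts: Map[(Long, Long), Int] = records
      .flatMap { actions =>
        for {
          a <- actions
          b <- actions
          if a < b            // emit each unordered pair exactly once
        } yield ((a, b), 1)
      }
      .groupBy(_._1)
      .map { case (pair, hits) => (pair, hits.map(_._2).sum) }

    pairCounts.toSeq.sortBy(_._1).foreach(println)
    // ((1,2),1)  ((1,3),2)  ((2,3),2)
  }
}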
Example 150
Source File: ManyToManyNestedJoin.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.manytomany import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import scala.collection.mutable object ManyToManyNestedJoin { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val nGramWordCount = jsonDf.rdd.flatMap(r => { val actions = r.getAs[mutable.WrappedArray[Row]]("actions") val resultList = new mutable.MutableList[(Long, NestedCount)] actions.foreach(a => { val aValue = a.getAs[Long]("action") val aNestedCount = new NestedCount actions.foreach(b => { val bValue = b.getAs[Long]("action") if (aValue < bValue) { aNestedCount.+=(bValue, 1) } }) resultList.+=((aValue, aNestedCount)) }) resultList.toSeq }).reduceByKey((a, b) => a + b) //.reduceByKey(_ + _) nGramWordCount.collect().foreach(r => { println(" - " + r) }) } } //1,2 //1,3 //1,4 //1 (2, 3, 4) class NestedCount() extends Serializable{ val map = new mutable.HashMap[Long, Long]() def += (key:Long, count:Long): Unit = { val currentValue = map.getOrElse(key, 0l) map.put(key, currentValue + count) } def + (other:NestedCount): NestedCount = { val result = new NestedCount other.map.foreach(r => { result.+=(r._1, r._2) }) this.map.foreach(r => { result.+=(r._1, r._2) }) result } override def toString(): String = { val stringBuilder = new StringBuilder map.foreach(r => { stringBuilder.append("(" + r._1 + "," + r._2 + ")") }) stringBuilder.toString() } }
Example 151
Source File: SaltedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.salted import java.util.Random import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SaltedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDfLeft = sparkSession.read.json(jsonPath) val saltedLeft = jsonDfLeft.rdd.flatMap(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") Seq((group + "_" + 0, value),(group + "_" + 1, value)) }) val jsonDfRight = sparkSession.read.json(jsonPath) val saltedRight = jsonDfRight.rdd.mapPartitions(it => { val random = new Random() it.map(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") (group + "_" + random.nextInt(2), value) }) }) jsonDfLeft.join(jsonDfRight).collect().foreach(r => { println("Normal.result:" + r) }) println("----") saltedLeft.join(saltedRight).collect().foreach(r => { println("Salted.result:" + r) }) } }
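The salting trick above spreads a skewed join key: every left-side record is replicated once per salt value (0 and 1 here), while each right-side record appends a single random salt, so matching rows for a hot key are split across two reducer keys instead of piling onto one. A minimal sketch of just the key transformation on plain Scala pairs, assuming a salt factor of 2 and invented sample data:

import scala.util.Random

object SaltedKeySketch {
  val SaltFactor = 2

  def main(args: Array[String]): Unit = {
    val left  = Seq(("hotKey", 1L), ("hotKey", 2L), ("coldKey", 3L))
    val right = Seq(("hotKey", 10L), ("coldKey", 30L))

    // Left side: replicate every record under each possible salt.
    val saltedLeft = left.flatMap { case (k, v) =>
      (0 until SaltFactor).map(salt => (s"${k}_$salt", v))
    }

    // Right side: each record picks one salt at random.
    val random = new Random()
    val saltedRight = right.map { case (k, v) => (s"${k}_${random.nextInt(SaltFactor)}", v) }

    // A toy hash join on the salted keys; in Spark this would be an RDD join.
    val rightByKey = saltedRight.groupBy(_._1)
    val joined = saltedLeft.flatMap { case (k, lv) =>
      rightByKey.getOrElse(k, Nil).map { case (_, rv) => (k, (lv, rv)) }
    }
    joined.foreach(println)
  }
}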
Example 152
Source File: SmallWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.small import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SmallWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val timeDifRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") //(key , value) (group, (time, value)) }).groupByKey().flatMap{case (group, records) => var lastValue = 0l val localList = records.toSeq println("localList.size:" + localList.size) localList.sortBy(_._1).map{case (time, value) => val dif = value - lastValue lastValue = value (group, time, value, dif) } } timeDifRdd.take(10).foreach(r => { println(r) }) sparkSession.stop() } }
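The groupByKey/flatMap above computes, per group, the difference between each value and the previous one after sorting by time; it only works when a single group's records fit comfortably in one executor's memory, which is why the package calls it "small" windowing. The per-group logic can be sketched with plain collections (sample data below is made up):

object RunningDiffSketch {
  def main(args: Array[String]): Unit = {
    val records = Seq(("g1", 1L, 10L), ("g1", 2L, 15L), ("g1", 3L, 12L), ("g2", 1L, 7L))

    val withDiffs = records.groupBy(_._1).toSeq.flatMap { case (group, rows) =>
      var lastValue = 0L
      rows.sortBy(_._2).map { case (_, time, value) =>
        val dif = value - lastValue   // difference against the previous value in time order
        lastValue = value
        (group, time, value, dif)
      }
    }
    withDiffs.foreach(println)
  }
}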
Example 153
Source File: SuperBigWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.superbig import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SuperBigWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val jsonPath = args(0) val pageSize = args(1).toInt val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = spark.read.json(jsonPath) import spark.implicits._ val diffDs = jsonDf.flatMap(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") val timePage = time / pageSize if (time % pageSize == 0) { //Am I on the edge of the page Seq((timePage, (time, value)), (timePage + 1, (time, value))) } else { Seq((timePage, (time, value))) } }).groupByKey(r => r._1).flatMapGroups((k, it) => { var lastValue = 0l it.toSeq. sortBy{case (page, (time, value)) => time}. map{case (page, (time, value)) => val dif = value - lastValue lastValue = value (time, value, dif) } }) diffDs.collect().foreach(r => println(" - " + r)) spark.stop() } }
Example 154
Source File: SessionWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import scala.collection.mutable object SessionWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val sessionJson = args(0) val timeGap = args(1).toInt val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val sessionDs = sparkSession.read.json(sessionJson).as[JsonLeadLag] sessionDs.createOrReplaceTempView("session_table") sparkSession.sql("select * from session_table").collect().foreach(println) val sessionDefinitinonDf = sessionDs.rdd.map(r => { (r.group, r) }).groupByKey().flatMap{ case (group, jsonObjIt) => var lastStart:Long = -1 var lastEnd:Long = -1 var sessionCount = 1 var eventsInASession = 0 val sessionList = new mutable.MutableList[SessionDefinition] jsonObjIt.toSeq.sortBy(r => r.ts).foreach(record => { val ts = record.ts eventsInASession += 1 if (lastStart == -1) { lastStart = ts } else if (ts > lastEnd + timeGap) { sessionList += SessionDefinition(group, lastStart, lastEnd, lastEnd - lastStart, eventsInASession) lastStart = ts eventsInASession = 0 } lastEnd = ts }) sessionList } sessionDefinitinonDf.collect().foreach(println) } } case class SessionDefinition(group:String, sessionStart:Long, sessionEnd:Long, sessionLength:Long, sessionEvents:Int)
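The sessionization above walks each group's events in timestamp order and closes a session whenever the gap to the previous event exceeds timeGap (note that, as written, the listing only appends a SessionDefinition when the next session begins, so each group's final in-progress session never reaches sessionList). A compact sketch of the gap-splitting step over one already-sorted list of timestamps, with a hypothetical gap of 30 time units:

object SessionGapSketch {
  case class Session(start: Long, end: Long, events: Int)

  // Split a sorted list of timestamps into sessions separated by gaps larger than `gap`.
  def sessionize(sortedTs: Seq[Long], gap: Long): Seq[Session] =
    sortedTs.foldLeft(List.empty[Session]) {
      case (Nil, ts) => List(Session(ts, ts, 1))
      case (current :: done, ts) if ts - current.end <= gap =>
        current.copy(end = ts, events = current.events + 1) :: done
      case (acc, ts) => Session(ts, ts, 1) :: acc
    }.reverse

  def main(args: Array[String]): Unit = {
    val ts = Seq(100L, 110L, 125L, 200L, 205L, 400L)
    sessionize(ts, gap = 30L).foreach(println)
    // Session(100,125,3), Session(200,205,2), Session(400,400,1)
  }
}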
Example 155
Source File: LeadLagExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object LeadLagExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "LEAD(value) OVER (PARTITION BY group ORDER BY ts) as v_after, " + "LAG(value) OVER (PARTITION BY group ORDER BY ts) as v_before " + "FROM leadlag") leadLagDf.collect().foreach(println) leadLagDf.createOrReplaceTempView("leadlag_stage2") leadLagDf.printSchema() sparkSession.sql("select " + "group, ts, v_now, v_after, v_before, " + "case " + " when v_now < v_after and v_now < v_before then 'valley'" + " when v_now > v_after and v_now > v_before then 'peak'" + " else 'n/a' " + "end " + "from leadlag_stage2").collect().foreach(println) } } case class JsonLeadLag(group:String, ts:Long, value:Long)
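The second query above classifies each row as a peak or a valley by comparing v_now against the LEAD and LAG of value within its group. The same classification can be sketched without Spark by sliding a window of three over the time-ordered values; the series below is invented for illustration:

object PeakValleySketch {
  def classify(before: Long, now: Long, after: Long): String =
    if (now < before && now < after) "valley"
    else if (now > before && now > after) "peak"
    else "n/a"

  def main(args: Array[String]): Unit = {
    val series = Seq(1L, 5L, 2L, 2L, 8L, 3L)   // already ordered by ts
    series.sliding(3).foreach {
      case Seq(before, now, after) => println(s"$now -> ${classify(before, now, after)}")
      case _                       => ()        // windows shorter than 3 at the tail
    }
  }
}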
Example 156
Source File: TumblingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object TumblingWindows { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, " + "round(ts / 3), " + "avg(value), " + "max(value), " + "min(value) " + "FROM leadlag " + "group by 1,2") leadLagDf.collect().foreach(println) } }
Example 157
Source File: InfectionPointWindow.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object InfectionPointWindow { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val inflectionPointJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val inflectionPointDs = sparkSession.read.json(inflectionPointJson).as[JsonInfectionPoint] inflectionPointDs.createOrReplaceTempView("inflection_point") sparkSession.sql("select * from inflection_point").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " + "FROM inflection_point " + "where event_type = 'inflection'") leadLagDf.collect().foreach(println) } } case class JsonInfectionPoint(group:String, ts:Long, value:Long, event_type:String)
Example 158
Source File: SplidingWindows.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.timeseries import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SplidingWindows { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val leadLagJson = args(0) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val leadLag = sparkSession.read.json(leadLagJson).as[JsonLeadLag] leadLag.createOrReplaceTempView("leadlag") sparkSession.sql("select * from leadlag").collect().foreach(println) val leadLagDf = sparkSession.sql("SELECT " + "group, ts, " + "value as v_now, " + "AVG(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Min(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg, " + "Max(value) OVER (ORDER BY ts rows between 3 preceding and current row) as v_moving_avg " + "FROM leadlag") leadLagDf.collect().foreach(println) } }
Example 159
Source File: JsonNestedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} import scala.collection.mutable object JsonNestedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val isLocal = args(0).equalsIgnoreCase("l") val jsonPath = args(1) val outputTableName = args(2) val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") val jsonDf = sparkSession.read.json(jsonPath) val localJsonDf = jsonDf.collect() println("--Df") jsonDf.foreach(row => { println("row:" + row) }) println("--local") localJsonDf.foreach(row => { println("row:" + row) }) jsonDf.createOrReplaceTempView("json_table") println("--Tree Schema") jsonDf.schema.printTreeString() println("--") jsonDf.write.saveAsTable(outputTableName) sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println) println("--") sparkSession.stop() } def populatedFlattedHashMap(row:Row, schema:StructType, fields:Array[StructField], flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]], parentFieldName:String): Unit = { fields.foreach(field => { println("field:" + field.dataType) if (field.dataType.isInstanceOf[ArrayType]) { val elementType = field.dataType.asInstanceOf[ArrayType].elementType if (elementType.isInstanceOf[StructType]) { val childSchema = elementType.asInstanceOf[StructType] val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq) populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".") } } else { val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any]) fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name))) } }) } }
Example 160
Source File: NestedTableExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} import org.apache.spark.sql.{Row, SparkSession} object NestedTableExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 161
Source File: PopulateHiveTable.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} object PopulateHiveTable { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) populated1Df.repartition(2).write.saveAsTable("nested_populated") println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 162
Source File: CountingInAStreamExpUpdateStateByKey.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpUpdateStateByKey { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(word => (word, 1)) .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => { var value = state.getOrElse(0) values.foreach(i => { value += i }) Some(value) }) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
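The heart of the DStream example above is the state function passed to updateStateByKey: it receives the current batch's counts for a word plus the previous running total and returns the new total. That function is ordinary Scala and can be exercised on its own; a sketch with the same Seq[Int]/Option[Int] shape as the listing:

object UpdateStateSketch {
  // Same shape as the function handed to updateStateByKey above.
  def updateCount(newValues: Seq[Int], state: Option[Int]): Option[Int] =
    Some(state.getOrElse(0) + newValues.sum)

  def main(args: Array[String]): Unit = {
    println(updateCount(Seq(1, 1, 1), None))    // Some(3)  first batch
    println(updateCount(Seq(1), Some(3)))       // Some(4)  running total carried forward
    println(updateCount(Seq.empty, Some(4)))    // Some(4)  word absent from this batch
  }
}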
Example 163
Source File: CountingInAStreamExpBatchCounting.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpBatchCounting { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(line => line.toLowerCase.split(" ")) val wordCounts = words.map(word => (word, 1)) .reduceByKey((a,b) => a + b) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
Example 164
Source File: CountingInAStreamMapWithState.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} object CountingInAStreamMapWithState { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")). map(word => WordCountEvent(word, 1)) // Generate running word count val wordCounts = messageDs.groupByKey(tuple => tuple.word). mapGroupsWithState[WordCountInMemory, WordCountReturn](GroupStateTimeout.ProcessingTimeTimeout) { case (word: String, events: Iterator[WordCountEvent], state: GroupState[WordCountInMemory]) => var newCount = if (state.exists) state.get.countOfWord else 0 events.foreach(tuple => { newCount += tuple.countOfWord }) state.update(WordCountInMemory(newCount)) WordCountReturn(word, newCount) } // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("update") .format("console") .start() query.awaitTermination() } } case class WordCountEvent(word:String, countOfWord:Int) extends Serializable { } case class WordCountInMemory(countOfWord: Int) extends Serializable { } case class WordCountReturn(word:String, countOfWord:Int) extends Serializable { }
Example 165
Source File: CountingInAStreamExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.functions._ object CountingInAStreamExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")) // Generate running word count val wordCounts = messageDs.groupBy("value").count() // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("complete") .format("console") .start() query.awaitTermination() } }
Example 166
Source File: CountingInAStreamDatasetExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.functions._ object CountingInAStreamDatasetExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String].map(line => { MessageBuilder.build(line) }).as[Message] val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price")) val ticketOutput = tickerCount.writeStream .format("Console") .trigger(Trigger.ProcessingTime("5 seconds")) .option("checkpointLocation", checkpointFolder) .outputMode("complete") .format("console") .start() ticketOutput.awaitTermination() } }
Example 167
Source File: CountingInAStreamExpWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp import org.apache.spark.sql.functions._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} object CountingInAStreamExpWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[5]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[5]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .option("includeTimestamp", true) .load() val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => { MessageBuilder.build(line._1, line._2) }).filter(r => r != null).as[Message] val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp")) .withWatermark("eventTime", "30 seconds") .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker") .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window") val ticketOutput = tickerCount.writeStream .format("Console") .option("checkpointLocation", checkpointFolder) .outputMode("update") //.outputMode("complete") .format("console") .option("truncate", false) .option("numRows", 40) .start() ticketOutput.awaitTermination() } }
Example 168
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object ZombieExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)( (vertexId, zombieState, message) => { if (message > 0 && !zombieState.isZombie) { new ZombieStats(true, message) } else { zombieState } }, triplet => { if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) { Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l)) } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) { Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l)) } else { Iterator.empty } }, (a, b) => Math.min(a, b)) println("ZombieBite") zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",ZobmieStat:" + r._2) }) sparkSession.stop() } } case class ZombieStats (isZombie:Boolean, lengthOfLife:Long)
Example 169
Source File: TrianglesExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object TrianglesExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) println("TriangleCount") graph.triangleCount().vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",triangleCount:" + r._2) }) graph.pageRank(1.1, 1.1) sparkSession.stop() } }
Example 170
Source File: RegressionMetricsSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation import breeze.linalg.DenseVector import io.github.mandar2812.dynaml.graphics.charts.Highcharts._ import org.apache.log4j.{Priority, Logger} import org.apache.spark.Accumulator import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart} histogram(residuals, numBins = 20) title("Histogram of Regression Residuals") } } object RegressionMetricsSpark { def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long) : (Double, Double, Double, Double) = { val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean") val err:DenseVector[Double] = scoresAndLabels.map((sc) => { val diff = sc._1 - sc._2 mean += sc._2 val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1), 2) DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog) }).reduce((a,b) => a+b) val SS_res = err(1) val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value/size.toDouble) val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum() val rmse = math.sqrt(SS_res/size.toDouble) val mae = err(0)/size.toDouble val rsq = if(1/SS_tot != Double.NaN) 1 - (SS_res/SS_tot) else 0.0 val rmsle = err(2)/size.toDouble (mae, rmse, rsq, rmsle) } }
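computeKPIs above accumulates absolute error, squared error and squared log error in a single pass over (score, label) pairs, then derives MAE, RMSE, R-squared and RMSLE. The same quantities on a local collection, written out with the usual formulas (a plain-Scala sketch, not DynaML's API; it also takes the final square root for RMSLE, which the listing leaves out):

object RegressionKpiSketch {
  // scoresAndLabels: (predicted, actual) pairs.
  def kpis(scoresAndLabels: Seq[(Double, Double)]): (Double, Double, Double, Double) = {
    val n = scoresAndLabels.length.toDouble
    val meanLabel = scoresAndLabels.map(_._2).sum / n

    val mae   = scoresAndLabels.map { case (s, y) => math.abs(s - y) }.sum / n
    val ssRes = scoresAndLabels.map { case (s, y) => math.pow(s - y, 2) }.sum
    val ssTot = scoresAndLabels.map { case (_, y) => math.pow(y - meanLabel, 2) }.sum

    val rmse  = math.sqrt(ssRes / n)
    val rsq   = if (ssTot > 0.0) 1.0 - ssRes / ssTot else 0.0
    val rmsle = math.sqrt(scoresAndLabels.map { case (s, y) =>
      math.pow(math.log1p(math.abs(s)) - math.log1p(math.abs(y)), 2)
    }.sum / n)

    (mae, rmse, rsq, rmsle)
  }

  def main(args: Array[String]): Unit =
    println(kpis(Seq((1.1, 1.0), (1.9, 2.0), (3.2, 3.0))))
}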
Example 171
Source File: RejectionSamplingScheme.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability import breeze.stats.distributions.{Density, Rand} import io.github.mandar2812.dynaml.pipes._ import io.github.mandar2812.dynaml.probability.distributions.GenericDistribution import scala.util.Random import org.apache.log4j.Logger abstract class RejectionSamplingScheme[ ConditioningSet, Domain, Dist <: Density[ConditioningSet] with Rand[ConditioningSet], DistL <: Density[Domain] with Rand[Domain], JointDist <: Density[(ConditioningSet, Domain)] with Rand[(ConditioningSet, Domain)], Likelihood <: RandomVarWithDistr[Domain, DistL]]( p: RandomVarWithDistr[ConditioningSet, Dist], c: DataPipe[ConditioningSet, Likelihood]) extends RandomVarWithDistr[(ConditioningSet, Domain), JointDist] with BayesJointProbabilityScheme[ ConditioningSet, Domain, RandomVarWithDistr[ConditioningSet, Dist], Likelihood] { self => override val prior: RandomVarWithDistr[ConditioningSet, Dist] = p override val likelihood: DataPipe[ConditioningSet, Likelihood] = c var Max_Candidates: Int = 1000 var Max_Estimations: Int = 10000 override val sample = prior.sample > BifurcationPipe[ConditioningSet, ConditioningSet, Domain]( (c: ConditioningSet) => (c, likelihood(c).draw) ) override val posterior: DataPipe[Domain, RandomVariable[ConditioningSet]] = DataPipe((data: Domain) => { val sampl = this.prior.sample val q = this.prior.underlyingDist val M = (1 to Max_Estimations).map(i => { likelihood(sampl()).underlyingDist(data) }).sum/Max_Estimations.toDouble new RandomVarWithDistr[ConditioningSet, GenericDistribution[ConditioningSet]] { innerself => val logger = Logger.getLogger(this.getClass) override val sample: DataPipe[Unit, ConditioningSet] = DataPipe(() => { val iterations = 0 var accepted = false var accepted_sample: ConditioningSet = sampl() while(!accepted && iterations < Max_Candidates) { // generate a candidate val candidate = sampl() val a = underlyingDist(candidate)/(M*q(candidate)) if(Random.nextDouble() <= a) { logger.info("... Sample Accepted ...") accepted = true accepted_sample = candidate } } accepted_sample }) override val underlyingDist: GenericDistribution[ConditioningSet] = new GenericDistribution[ConditioningSet] { override def apply(x: ConditioningSet): Double = prior.underlyingDist(x)*likelihood(x).underlyingDist(data) override def draw() = innerself.sample() } } }) }
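The posterior pipe above is a rejection sampler: it estimates an envelope constant M by averaging the likelihood over prior draws, then repeatedly proposes a candidate from the prior and accepts it with probability target(candidate) / (M * proposal(candidate)) (note that the listing's iterations counter is a val that never increments, so in practice its loop only exits on acceptance). A self-contained sketch of plain rejection sampling; the target and proposal densities here are simple stand-ins, not the DynaML classes:

import scala.util.Random
import org.apache.log4j.Logger

object RejectionSamplingSketch {
  val logger = Logger.getLogger(getClass.getName)

  // Target density f(x) = 2x on [0, 1]; proposal q is uniform on [0, 1], so q(x) = 1.
  def target(x: Double): Double = 2.0 * x
  def proposal(): Double = Random.nextDouble()
  val M = 2.0                                   // envelope constant with f(x) <= M * q(x)

  @annotation.tailrec
  def draw(candidatesLeft: Int = 1000): Double = {
    val candidate = proposal()
    val acceptProb = target(candidate) / (M * 1.0)   // q(candidate) = 1 for the uniform proposal
    if (Random.nextDouble() <= acceptProb) candidate
    else if (candidatesLeft <= 1) { logger.warn("giving up, keeping the last proposal"); candidate }
    else draw(candidatesLeft - 1)
  }

  def main(args: Array[String]): Unit = {
    val samples = Seq.fill(10000)(draw())
    println(s"sample mean ~ ${samples.sum / samples.length} (expected 2/3 for f(x) = 2x)")
  }
}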
Example 172
Source File: MOGPRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp import breeze.linalg.{DenseMatrix, DenseVector} import io.github.mandar2812.dynaml.kernels.LocalScalarKernel import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2} import io.github.mandar2812.dynaml.probability.distributions.MatrixNormal import org.apache.log4j.Logger override def dataAsSeq(data: Stream[(I, DenseVector[Double])]): Seq[((I, Int), Double)] = data.map((patternAndLabel) => patternAndLabel._2.mapPairs((i, label) => ((patternAndLabel._1, i), label) ).toArray.toSeq).reduceLeft((s1, s2) => s1 ++ s2) } class KroneckerMOGPModel[I]( covFunc: LocalScalarKernel[I], noiseCovFunc: LocalScalarKernel[I], coRegCov: LocalScalarKernel[Int], data: Stream[(I, DenseVector[Double])], num: Int, numOutputs: Int, meanFunc: DataPipe[(I, Int), Double] = DataPipe((_: (I, Int)) => 0.0)) extends MOGPRegressionModel[I](covFunc:*coRegCov, noiseCovFunc:* coRegCov, data, num, numOutputs, meanFunc) { val (covFPipe, noiseCovPipe, coRegCovPipe) = (covFunc.asPipe, noiseCovFunc.asPipe, coRegCov.asPipe) override def energy(h: Map[String, Double], options: Map[String, String]): Double = { setState(h) val (features, targets) = data.unzip val covMatrix: DenseMatrix[Double] = covFunc .buildKernelMatrix(features, features.length) .getKernelMatrix() val noiseMatrix: DenseMatrix[Double] = noiseCovFunc .buildKernelMatrix(features, features.length) .getKernelMatrix() val colCovMatrix = coRegCov .buildKernelMatrix(0 until noutputs, noutputs) .getKernelMatrix() val meanMat: DenseMatrix[Double] = DenseMatrix.vertcat( features.map(instance => DenseVector.tabulate[Double](noutputs)(o => mean((instance, o))).asDenseMatrix):_* ) val mvn = MatrixNormal(meanMat, covMatrix+noiseMatrix, colCovMatrix) -mvn.logPdf(DenseMatrix.vertcat(targets.map(_.asDenseMatrix):_*)) } }
Example 173
Source File: GPBasisFuncRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp import breeze.linalg.{DenseMatrix, DenseVector, cholesky, trace, inv} import breeze.numerics.{log, sqrt} import io.github.mandar2812.dynaml.algebra._ import io.github.mandar2812.dynaml.analysis._ import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._ import io.github.mandar2812.dynaml.algebra.PartitionedMatrixSolvers._ import io.github.mandar2812.dynaml.kernels._ import io.github.mandar2812.dynaml.models.{ContinuousProcessModel, SecondOrderProcessModel} import io.github.mandar2812.dynaml.optimization.GloballyOptWithGrad import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2} import io.github.mandar2812.dynaml.probability.{MultGaussianPRV, MultGaussianRV} import org.apache.log4j.Logger import scala.reflect.ClassTag abstract class GPBasisFuncRegressionModel[T, I: ClassTag]( cov: LocalScalarKernel[I], n: LocalScalarKernel[I], data: T, num: Int, basisFunc: DataPipe[I, DenseVector[Double]], basis_param_prior: MultGaussianRV) extends AbstractGPRegressionModel[T, I]( cov, n, data, num) { val MultGaussianRV(b, covB) = basis_param_prior implicit val vf = VectorField(b.length) private lazy val lowB = cholesky(covB) private lazy val covBsolveb = lowB.t \ (lowB \ b) private lazy val h: PartitionedMatrix = PartitionedMatrix.horzcat(_blockSize)(trainingData.map(basisFunc(_)):_*) override val mean: DataPipe[I, Double] = basisFunc > DataPipe((h: DenseVector[Double]) => h.t * b) private val basisFeatureMap: DataPipe[I, DenseVector[Double]] = basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x) val feature_map_cov = CovarianceFunction(basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x)) override protected def getTrainKernelMatrix[U <: Seq[I]] = { SVMKernel.buildPartitionedKernelMatrix(trainingData, trainingData.length, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y) + noiseModel.evaluate(x, y)} ) } override protected def getCrossKernelMatrix[U <: Seq[I]](test: U) = SVMKernel.crossPartitonedKernelMatrix( trainingData, test, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)} ) override protected def getTestKernelMatrix[U <: Seq[I]](test: U) = SVMKernel.buildPartitionedKernelMatrix( test, test.length.toLong, _blockSize, _blockSize, (x: I, y: I) => {covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)} ) }
Example 174
Source File: QuasiNewtonOptimizer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.{DenseMatrix, DenseVector, inv} import io.github.mandar2812.dynaml.pipes.DataPipe import org.apache.log4j.Logger import spire.implicits._ override def optimize( nPoints: Long, ParamOutEdges: Stream[(DenseVector[Double], Double)], initialP: DenseVector[Double]): DenseVector[Double] = QuasiNewtonOptimizer.run( nPoints, this.regParam, this.numIterations, updater, gradient, this.stepSize, initialP, ParamOutEdges, DataPipe(identity[Stream[(DenseVector[Double], Double)]] _) ) } object QuasiNewtonOptimizer { private val logger = Logger.getLogger(this.getClass) def run[T]( nPoints: Long, regParam: Double, numIterations: Int, updater: HessianUpdater, gradient: Gradient, stepSize: Double, initial: DenseVector[Double], POutEdges: T, transform: DataPipe[T, Stream[(DenseVector[Double], Double)]], logging: Boolean = true, logging_rate: Int = 100): DenseVector[Double] = { var oldW: DenseVector[Double] = initial var newW = oldW val hessian = transform(POutEdges) .map(_._1) .map(x => DenseVector(x.toArray ++ Array(1.0))) .map(x => x*x.t) .reduce((x: DenseMatrix[Double], y: DenseMatrix[Double]) => x + y) var regInvHessian = inv(hessian + DenseMatrix.eye[Double](initial.length)*regParam) var oldCumGradient = DenseVector.zeros[Double](initial.length) println("Performing Quasi-Newton Optimization") cfor(1)(iter => iter < numIterations, iter => iter + 1)( iter => { val cumGradient: DenseVector[Double] = DenseVector.zeros(initial.length) var cumLoss: Double = 0.0 transform(POutEdges).foreach(ed => { val x = DenseVector(ed._1.toArray ++ Array(1.0)) val y = ed._2 cumLoss += gradient.compute(x, y, oldW, cumGradient) }) if(logging && iter % logging_rate == 0) RegularizedOptimizer.prettyPrint(iter, cumLoss/nPoints.toDouble) //Find the search direction p = inv(H)*grad(J) //perform update x_new = x + step*p val searchDirection = regInvHessian*cumGradient*(-1.0) newW = updater.compute(oldW, searchDirection, stepSize, iter, regParam)._1 regInvHessian = updater.hessianUpdate(regInvHessian, newW-oldW, cumGradient-oldCumGradient) oldW = newW oldCumGradient = cumGradient }) newW } }
Example 175
Source File: GradientDescentSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.DenseVector import org.apache.log4j.{Logger, Priority} import org.apache.spark.AccumulatorParam import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD override def optimize(nPoints: Long, ParamOutEdges: RDD[LabeledPoint], initialP: DenseVector[Double]) : DenseVector[Double] = GradientDescentSpark.runBatchSGD( nPoints, this.regParam, this.numIterations, this.updater, this.gradient, this.stepSize, initialP, ParamOutEdges, this.miniBatchFraction ) } object GradientDescentSpark { private val logger = Logger.getLogger(this.getClass) def runBatchSGD( nPoints: Long, regParam: Double, numIterations: Int, updater: Updater, gradient: Gradient, stepSize: Double, initial: DenseVector[Double], POutEdges: RDD[LabeledPoint], miniBatchFraction: Double): DenseVector[Double] = { var count = 1 var oldW: DenseVector[Double] = initial var newW = oldW val sc = POutEdges.context val gradb = sc.broadcast(gradient) logger.log(Priority.INFO, "Training model using SGD") while(count <= numIterations) { val cumGradient = sc.accumulator(DenseVector.zeros[Double](initial.length))(new VectorAccumulator()) val wb = sc.broadcast(oldW) POutEdges sample(withReplacement = false, fraction = miniBatchFraction) foreach ((ed) => { val features = DenseVector(ed.features.toArray) val label = ed.label val (g, _) = gradb.value.compute(features, label, wb.value) cumGradient += g }) newW = updater.compute(oldW, cumGradient.value / nPoints.toDouble, stepSize, count, regParam)._1 oldW = newW count += 1 } newW } } class VectorAccumulator extends AccumulatorParam[DenseVector[Double]] { override def addInPlace(r1: DenseVector[Double], r2: DenseVector[Double]): DenseVector[Double] = r1 + r2 override def zero(initialValue: DenseVector[Double]): DenseVector[Double] = DenseVector.zeros(initialValue.length) }
Example 176
Source File: GradBasedGlobalOptimizer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.{DenseVector, norm} import org.apache.log4j.Logger class GradBasedGlobalOptimizer[M <: GloballyOptWithGrad](model: M) extends GlobalOptimizer[M] { override val system: M = model override protected val logger: Logger = Logger.getLogger(this.getClass) override def optimize(initialConfig: Map[String, Double], options: Map[String, String] = Map("tolerance" -> "0.0001", "step" -> "0.005", "maxIterations" -> "50")) : (M, Map[String, Double]) = { logger.info("Starting Maximum Likelihood based optimization: ML-II") println( "-----------------------------------------------------"+ "-----------------------------------------------------") //Carry out gradient descent with step size alpha and //for a specified number of maximum iterations val tolerance = options("tolerance").toDouble val alpha = options("step").toDouble val maxit = options("maxIterations").toInt var count = 1 var gradNorm = 1.0 var working_solution = initialConfig logger.info("Starting state: \n"+GlobalOptimizer.prettyPrint(working_solution)) do { val gradient = system.gradEnergy(working_solution) println( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") print("\n") logger.info("Gradient at "+count+" iterations is: \n"+GlobalOptimizer.prettyPrint(gradient)) print("\n") gradNorm = norm(DenseVector(gradient.values.toArray), 2) working_solution = working_solution.zip(gradient).map((confAndGrad) => { val hyp = confAndGrad._1._1 val gr:Double = if(confAndGrad._2._2 == Double.PositiveInfinity){ 1.0 } else if(confAndGrad._2._2 == Double.NegativeInfinity){ -1.0 } else if(confAndGrad._2._2 == Double.NaN){ 1.0 } else { confAndGrad._2._2 } val newValue = math.abs(confAndGrad._1._2 - alpha*gr) (hyp, newValue) }) logger.info("Updated state : \n"+GlobalOptimizer.prettyPrint(working_solution)) print("\n") println( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") count += 1 } while (count < maxit && gradNorm >= tolerance) logger.info("Stopped ML-II at "+count+" iterations") logger.info("Final state : \n"+GlobalOptimizer.prettyPrint(working_solution)) //Persist the current configuration to the model memory if(options.contains("persist") && (options("persist") == "true" || options("persist") == "1")) system.persist(working_solution) (system, working_solution) } }
Example 177
Source File: SVMKernelMatrix.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.kernels import breeze.linalg.{DenseMatrix, DenseVector, eig, max, min} import org.apache.log4j.{Logger, Priority} override def eigenDecomposition(dimensions: Int = this.dimension.toInt): (DenseVector[Double], DenseMatrix[Double]) = { logger.log(Priority.INFO, "Eigenvalue decomposition of the kernel matrix using JBlas.") val decomp = eig(this.kernel) logger.log(Priority.INFO, "Eigenvalue stats: " +min(decomp.eigenvalues) +" =< lambda =< " +max(decomp.eigenvalues) ) (decomp.eigenvalues, decomp.eigenvectors) } }
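The SVMKernelMatrix snippet above logs through the log4j 1.x Priority API. Because Level extends Priority, the same call can be written with Level, which is the form the rest of these examples use. A minimal standalone sketch of the two equivalent call styles (the object name is illustrative, not part of DynaML):

import org.apache.log4j.{Level, Logger}

object PriorityVsLevelSketch {
  private val logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    // Same call shape as the example above, but passing a Level instead of a Priority:
    logger.log(Level.INFO, "Eigenvalue decomposition of the kernel matrix.")
    // The shorthand used by most examples in this collection:
    logger.info("Eigenvalue decomposition of the kernel matrix.")
  }
}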
Example 178
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum }
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    // 1. Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2. Load the training data
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))

    // 3. Set the training options and train the CNN model
    // opts: training options passed to CNNtrain
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4. Test the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"CNNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }
}
Example 179
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
Example 180
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
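MnistExample silences Spark's own chatter by targeting the "org" and "akka" logger hierarchies, while other examples in this collection raise the threshold on the root logger instead. A minimal sketch contrasting the two approaches (the object name is illustrative):

import org.apache.log4j.{BasicConfigurator, Level, Logger}

object QuietSparkLoggingSketch {
  def main(args: Array[String]): Unit = {
    BasicConfigurator.configure() // simple console appender so messages are visible

    // Option 1: raise the threshold globally via the root logger.
    Logger.getRootLogger.setLevel(Level.WARN)

    // Option 2: silence only the Spark ("org") and Akka ("akka") hierarchies.
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    // Application logging can still be enabled explicitly on its own logger.
    val log = Logger.getLogger(getClass)
    log.setLevel(Level.INFO)
    log.info("application message")
  }
}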
Example 181
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 182
Source File: MannWhitneyUTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import org.apache.commons.math3.stat.inference.MannWhitneyUTest import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkContext, SparkConf} object MannWhitneyUTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { testMannWhitneyU testMannWhitneyUTest } private def testMannWhitneyU(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyU(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyU(rdd1, rdd2) assert(result == result2) } private def testMannWhitneyUTest(): Unit ={ val sample1 = Array(1d, 3d, 5, 7) val sample2 = Array(2, 4, 6, 8d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val result = new MannWhitneyUTest() .mannWhitneyUTest(sample1, sample2) val result2 = org.apache.spark.mllib.stat.test.MannWhitneyUTest.mannWhitneyUTest(rdd1, rdd2) println(result) println(result2) assert(result == result2) } }
Example 183
Source File: TTestSuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} object TTestSuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneSampleTTest twoIndependentSampleTTest pairedTwoSampleTTest } def OneSampleTTest(): Unit ={ val observed = Array(100d, 200d, 300d, 400d) val mu = 2.5d assert(TestUtils.tTest(mu, observed, 0.05) == new OneSampleTTest().tTest(mu, sc.parallelize(observed), 0.05)) assert(TestUtils.tTest(mu, observed) == new OneSampleTTest().tTest(mu, sc.parallelize(observed))) } def twoIndependentSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 205d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.tTest(sample1, sample2, 0.05) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.tTest(sample1, sample2) == new TwoSampleIndependentTTest().tTest(rdd1, rdd2)) } def pairedTwoSampleTTest(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 202d, 300d, 400d) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) assert(TestUtils.pairedTTest(sample1, sample2, 0.05) == new PairTwoSampleTTest().tTest(rdd1, rdd2, 0.05)) assert(TestUtils.pairedTTest(sample1, sample2) == new PairTwoSampleTTest().tTest(rdd1, rdd2)) } }
Example 184
Source File: ANOVASuite.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package test import java.util import main.ANOVA.OneWayANOVA import org.apache.commons.math3.stat.inference.TestUtils import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.stat.OneSampleTTest import org.apache.spark.{SparkContext, SparkConf} object ANOVASuite { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) def main(args: Array[String]) { OneWayANOVA } def OneWayANOVA(): Unit ={ val sample1 = Array(100d, 200d, 300d, 400d) val sample2 = Array(101d, 200d, 300d, 400d) val sample3 = Array(102d, 200d, 300d, 400d) val data = new util.ArrayList[Array[Double]]() data.add(sample1) data.add(sample2) data.add(sample3) val rdd1 = sc.parallelize(sample1) val rdd2 = sc.parallelize(sample2) val rdd3 = sc.parallelize(sample3) val rddData = Seq(rdd1, rdd2, rdd3) assert(TestUtils.oneWayAnovaFValue(data) == new OneWayANOVA().anovaFValue(rddData)) assert(TestUtils.oneWayAnovaPValue(data) == new OneWayANOVA().anovaPValue(rddData)) } }
Example 185
Source File: ProxyPlugin.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.plugin

import java.io.File

import com.brsanthu.googleanalytics.GoogleAnalytics
import com.testerhome.appcrawler.URIElement
import com.testerhome.appcrawler.Plugin
import net.lightbody.bmp.BrowserMobProxyServer
import net.lightbody.bmp.proxy.CaptureType
import org.apache.log4j.{BasicConfigurator, Level, Logger}

import scala.util.Try

class ProxyPlugin extends Plugin {
  private var proxy: BrowserMobProxyServer = _
  val port = 7777

  // TODO: proxy support
  override def start(): Unit = {
    BasicConfigurator.configure()
    Logger.getRootLogger.setLevel(Level.INFO)
    Logger.getLogger("ProxyServer").setLevel(Level.WARN)

    proxy = new BrowserMobProxyServer()
    proxy.setHarCaptureTypes(CaptureType.getNonBinaryContentCaptureTypes)
    proxy.setTrustAllServers(true)
    proxy.start(port)
    //proxy.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes)
    //proxy.setHarCaptureTypes(CaptureType.getHeaderCaptureTypes)
    log.info(s"proxy server listen on ${port}")
    proxy.newHar("start")
  }

  override def beforeElementAction(element: URIElement): Unit = {
    log.info("clear har")
    proxy.endHar()
    // create a new har
    val harFileName = getCrawler().getBasePathName() + ".har"
    proxy.newHar(harFileName)
  }

  override def afterElementAction(element: URIElement): Unit = {
    log.info("save har")
    val harFileName = getCrawler().getBasePathName() + ".har"
    val file = new File(harFileName)
    try {
      log.info(proxy.getHar)
      log.info(proxy.getHar.getLog)
      log.info(proxy.getHar.getLog.getEntries.size())
      log.info(s"har entry size = ${proxy.getHar.getLog.getEntries.size()}")
      if (proxy.getHar.getLog.getEntries.size() > 0) {
        proxy.getHar.writeTo(file)
      }
    } catch {
      case e: Exception => {
        log.error("read har error")
        log.error(e.getCause)
        log.error(e.getMessage)
        e.getStackTrace.foreach(log.error)
      }
    }
  }

  override def stop(): Unit = {
    log.info("proxy stop")
    proxy.stop()
  }
}
Example 186
Source File: TestGA.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.ut import com.brsanthu.googleanalytics.{GoogleAnalytics, PageViewHit} import org.apache.log4j.{BasicConfigurator, Level, Logger} import org.scalatest.FunSuite class TestGA extends FunSuite{ test("google analyse"){ println("ga start") BasicConfigurator.configure() Logger.getRootLogger().setLevel(Level.WARN) val ga = new GoogleAnalytics("UA-74406102-1") 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/demo${x}", "test")) }) Thread.sleep(10000) 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem1${x}", "test")) }) Thread.sleep(10000) 1 to 10 foreach(x=>{ ga.postAsync(new PageViewHit(s"http://appcrawler.io/dem2${x}", "test")) }) //ga.post(new PageViewHit("http://appcrawler.io/test2", "test")) println("ga end") } }
Example 187
Source File: TestMacaca.scala From AppCrawler with Apache License 2.0 | 5 votes |
package com.testerhome.appcrawler.it import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.apache.log4j.Logger import com.alibaba.fastjson.JSONObject import macaca.client.MacacaClient class TestMacaca extends FunSuite with BeforeAndAfterAll{ val driver=new MacacaClient() override def beforeAll(): Unit = { val porps = new JSONObject() porps.put("autoAcceptAlerts", true) porps.put("browserName", "") porps.put("platformName", "android") porps.put("package", "com.gotokeep.keep") porps.put("activity", ".activity.SplashActivity") porps.put("reuse", 3) val desiredCapabilities = new JSONObject() desiredCapabilities.put("desiredCapabilities", porps) driver.initDriver(desiredCapabilities) } test("macaca android"){ println(driver.source()) } test("macaca chrome"){ val porps = new JSONObject() porps.put("autoAcceptAlerts", true) porps.put("browserName", "Chrome") porps.put("platformName", "desktop") // android or ios porps.put("javascriptEnabled", true) porps.put("platform", "ANY") val desiredCapabilities = new JSONObject() desiredCapabilities.put("desiredCapabilities", porps) driver.initDriver(desiredCapabilities) driver.get("http://www.baidu.com/") } }
Example 188
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 189
Source File: ZeroMQWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming.zeromq import scala.language.implicitConversions import scala.util.Random import org.apache.log4j.{Level, Logger} import org.zeromq.ZContext import org.zeromq.ZMQ import org.zeromq.ZMQException import org.zeromq.ZMsg import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.zeromq.ZeroMQUtils object ZeroMQWordCount { def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>") // scalastyle:on println System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath). Logger.getRootLogger.setLevel(Level.WARN) val Seq(url, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("ZeroMQWordCount") // Check Spark configuration for master URL, set it to local if not present. if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } // Create the context and set the batch size. val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = ZeroMQUtils.createTextStream( ssc, url, true, Seq(topic.getBytes) ) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 190
Source File: TwitterLocations.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import twitter4j.FilterQuery import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ object TwitterLocations { def main(args: Array[String]) { if (args.length < 4 || args.length % 4 != 0) { System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " + "<access token> <access token secret> " + "[<latitude-south-west> <longitude-south-west>" + " <latitude-north-east> <longitude-north-east> ...]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) // Get bounding boxes of locations for which to retrieve Tweets from command line val locationArgs = args.takeRight(args.length - 4) val boundingBoxes = if (locationArgs.length == 0) { System.out.println("No location bounding boxes specified, using defaults for New York City") val nycSouthWest = Array(-74.0, 40.0) val nycNorthEast = Array(-73.0, 41.0) Array(nycSouthWest, nycNorthEast) } else { locationArgs.map(_.toDouble).sliding(2, 2).toArray } val sparkConf = new SparkConf().setAppName("TwitterLocations") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val locationsQuery = new FilterQuery().locations(boundingBoxes : _*) // Print Tweets from the specified coordinates // This includes Tweets geo-tagged in the bounding box defined by the coordinates // As well as Tweets tagged in places inside of the bounding box TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery)) .map(tweet => { val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}") val place = Option(tweet.getPlace).map(_.getName) val location = latitude.getOrElse(place.getOrElse("(no location)")) val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') s"$location\t$text" }) .print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 191
Source File: TwitterAlgebirdHLL.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming.twitter

import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._

// scalastyle:off
object TwitterAlgebirdHLL {
  def main(args: Array[String]) {

    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
      sparkConf.setMaster("local[2]")
    }

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll.create(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 192
Source File: TwitterPopularTags.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf object TwitterPopularTags { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " + "<access token> <access token secret> [<filters>]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) val filters = args.takeRight(args.length - 4) // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) val sparkConf = new SparkConf().setAppName("TwitterPopularTags") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val stream = TwitterUtils.createStream(ssc, None, filters) val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) // Print popular hashtags topCounts60.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) topCounts10.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 193
Source File: TrainNewsClassWithDTDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object TrainNewsClassWithDTDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with DT Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the decision tree model
    val params = new ClassParams
    val dtClassifier = new DecisionTreeClassifier()
      .setMinInfoGain(params.minInfoGain)
      .setMaxDepth(params.maxDepth) // Spark currently supports a maximum depth of 30
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed labels back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(dtClassifier.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.DTModelPath)

    data.unpersist()
    spark.stop()
  }
}
Example 194
Source File: PredictNewsClassDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}

object PredictNewsClassDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")
    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams
    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("Unsupported model type!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // Load the model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    // === Model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map {
      case Row(prediction: Double, label: Double) => (prediction, label)
    }

    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= Evaluation results ==========")
    println(s"\nWeighted precision: $precision")
    println(s"Weighted recall: $recall")
    println(s"F1 score: $f1")

    // predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()
    spark.stop()
  }
}
Example 195
Source File: TrainNewsClassWithLRDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object TrainNewsClassWithLRDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with LR Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the logistic regression model
    val params = new ClassParams
    val logisticRegression = new LogisticRegression()
      .setTol(params.converTol)
      .setMaxIter(params.maxIteration)
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Map indexed labels back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(logisticRegression.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.LRModelPath)

    data.unpersist()
    spark.stop()
  }
}
Example 196
Source File: StarsAnalysisDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object StarsAnalysisDemo {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"

    // Load the data, keep the year and content fields, and clean up the content field
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }
        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop terms of length 1, and keep the part-of-speech tag of each term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map {
      case Row(year: String, terms: Seq[String]) => (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Count the stars mentioned most often in 2016 news
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))
        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()
    stars2016.take(100).foreach(println)
    spark.stop()
  }
}
Example 197
Source File: NLPPreprocessTest.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package nlp

import com.hankcs.hanlp.utility.Predefine
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.junit.Test

import scala.reflect.io.File

class NLPPreprocessTest {

  @Test
  def testSegmenter(): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Segment Demo")
      .getOrCreate()

    val text = Seq(
      (0, "这段文本是用来做分词测试的!This text is for test!"),
      (1, "江州市长江大桥参加长江大桥通车仪式"),
      (2, "他邀请了不少于10个明星,有:范冰冰、赵薇、周杰伦等,还有20几位商业大佬")
    )
    val sentenceData = spark.createDataFrame(text).toDF("id", "sentence")

    // Set the HanLP configuration file path; by default it is located on the classpath
    val path = this.getClass.getClassLoader.getResource("").getPath
    Predefine.HANLP_PROPERTIES_PATH = path + File.separator + "hanlp.properties"

    val segmenter = new Segmenter()
      .isDelEn(true)
      .isDelNum(true)
      .isAddNature(true)
      .setSegType("StandardSegment")
      .setMinTermLen(2)
      .setMinTermNum(3)
      .setInputCol("sentence")
      .setOutputCol("segmented")

    segmenter.transform(sentenceData).show(false)

    spark.stop()
  }
}
Example 198
Source File: Schema.scala From osmesa with Apache License 2.0 | 5 votes |
package osmesa.analytics.updater import java.sql.Timestamp import java.time.Instant import geotrellis.vectortile.Layer import org.apache.log4j.Logger import osmesa.analytics.updater.Implicits._ trait Schema { val layer: Layer val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)] val newFeatures: Seq[VTFeature] lazy val replacementFeatures: Seq[VTFeature] = Seq.empty[VTFeature] lazy val retainedFeatures: Seq[VTFeature] = Seq.empty[VTFeature] protected lazy val logger: Logger = Logger.getLogger(getClass) protected lazy val touchedFeatures: Map[String, Seq[VTFeature]] = Map.empty[String, Seq[VTFeature]] protected lazy val versionInfo: Map[String, (Int, Int, Timestamp)] = touchedFeatures .mapValues(_.last) .mapValues( f => ( f.data("__version").toInt, f.data("__minorVersion").toInt, Timestamp.from(Instant.ofEpochMilli(f.data("__updated"))) )) protected lazy val minorVersions: Map[String, Int] = features .mapValues { case (_, curr) => curr.data } .map { case (id, f) => versionInfo.get(id) match { case Some((prevVersion, _, _)) if prevVersion < f.version => (id, 0) case Some((prevVersion, prevMinorVersion, _)) if prevVersion == f.version => (id, prevMinorVersion + 1) case _ => (id, 0) } } } trait SchemaBuilder { val layerName: String def apply(layer: Layer, features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]): Schema }
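The Schema trait above obtains its logger lazily from getClass, so every concrete schema logs under its own class name rather than the trait's. A minimal standalone sketch of that pattern (LoggingSchema and WaySchema are illustrative names, not part of the osmesa codebase):

import org.apache.log4j.{BasicConfigurator, Logger}

trait LoggingSchema {
  // Resolved against the runtime class, so each subclass gets its own logger name.
  protected lazy val logger: Logger = Logger.getLogger(getClass)
}

class WaySchema extends LoggingSchema {
  def build(): Unit = logger.info("log lines are attributed to WaySchema, not LoggingSchema")
}

object LoggingSchemaSketch {
  def main(args: Array[String]): Unit = {
    BasicConfigurator.configure() // simple console appender for the sketch
    new WaySchema().build()
  }
}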
Example 199
Source File: printMatrix.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}
import breeze.linalg.{DenseMatrix => BDM, kron}

object printMatrix {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if (v > 200) 1.0 else 0)))(0)))

    val lines2 = sc.textFile("dataset/train.format", 8)
    val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble))
      .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0)))

    data2.take(10).foreach(record => {
      println("label: " + record._1)
      val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt))
      // Render the 28x28 digit as ASCII art: '.' for background (0) pixels, '*' for foreground (1) pixels.
      val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*')
      println(str)
    })
  }
}
Example 200
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }