scala.reflect.runtime.universe Scala Examples
The following examples show how to use scala.reflect.runtime.universe, the entry point to Scala's runtime-reflection API, as it is used in a range of open-source projects. Each example lists its source file, the project it comes from, and that project's license.
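Most of the examples below follow the same core pattern: obtain a runtime mirror from a class loader, resolve a singleton object by its fully qualified name with staticModule, and call reflectModule(...).instance to get the object itself, usually cast to a known interface. The minimal sketch below is not taken from any of the listed projects; it demonstrates the pattern against the standard library's scala.None object so it runs with no extra dependencies.

import scala.reflect.runtime.universe

object ReflectModuleSketch extends App {
  // A mirror backed by the class loader that can see the target object.
  val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)

  // Resolve the singleton by its fully qualified name (no trailing '$').
  val module = runtimeMirror.staticModule("scala.None")

  // Reflect the module symbol and fetch its one-and-only instance.
  val instance = runtimeMirror.reflectModule(module).instance

  println(instance == None) // true: reflection hands back the same singleton
}

In the projects below, the hard-coded name is replaced by a name built from configuration (for example s"controller.login.${providor}LoginController" in Example 1), and the instance is cast with asInstanceOf to the trait the caller expects.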
Example 1
Source File: LoginControllerFactory.scala From scuruto with MIT License
package controller

import skinny.SkinnyConfig

object LoginControllerFactory {
  private val DEFAULT_PROVIDOR = "App"

  val create: LoginController = {
    val providor = SkinnyConfig.stringConfigValue("login.providor").map { configValue =>
      configValue.capitalize
    } getOrElse DEFAULT_PROVIDOR

    import scala.reflect.runtime.universe
    val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)
    val module = runtimeMirror.staticModule(s"controller.login.${providor}LoginController")
    val obj = runtimeMirror.reflectModule(module)
    val controller = obj.instance
    controller.asInstanceOf[LoginController]
  }
}
Example 2
Source File: L8-35DataFrameExamplesRDD.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats

object CdrDataframeExamplesRDDApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float,
    smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: CdrDataframeExamplesRDDApp <appname> <batchInterval> <hostname> <port> <schemaPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val schemaJson = scala.io.Source.fromFile(schemaFile).mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema)
        val highOther = cdrs.except(highInternet)
        val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates()
        val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates()
        highOtherGrid.except(highInternetGrid).show()
        highInternetGrid.except(highOtherGrid).show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 3
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 4
Source File: L6-20CassandraConnector.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import com.datastax.spark.connector.SomeColumns import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming.toDStreamFunctions import com.datastax.spark.connector.toNamedColumnRef object CassandraConnectorSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.cassandra.connection.host", cassandraHost) .set("spark.cassandra.connection.port", cassandraPort) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) CassandraConnector(conf).withSessionDo { session => session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) } HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) .saveToCassandra(keyspace, tableName) ssc.start() ssc.awaitTermination() } }
Example 5
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.tuning.CrossValidator import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object MLCrossValidationApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val validator = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) val pGrid = new ParamGridBuilder() .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) .addGrid(regressor.numTrees, Array(10, 50, 100)) .build() validator.setEstimatorParamMaps(pGrid) validator.setNumFolds(5) val bestModel = validator.fit(train) val prediction = bestModel.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 6
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.ml.param.ParamMap object MLPipelineApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val pMap = ParamMap(normalizer.p -> 1.0) val model = pipeline.fit(train, pMap) val prediction = model.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 7
Source File: TestFlinkGenerator.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.generator

import com.amazon.milan.application.ApplicationConfiguration
import com.amazon.milan.application.sources.S3DataSource
import com.amazon.milan.dataformats.JsonDataInputFormat
import com.amazon.milan.compiler.flink.testing.{IntRecord, TestApplicationExecutor}
import com.amazon.milan.lang._
import com.amazon.milan.testing.applications._
import org.junit.Assert._
import org.junit.Test

import scala.reflect.runtime.universe
import scala.tools.reflect.ToolBox

@Test
class TestFlinkGenerator {
  private val generator = new FlinkGenerator(GeneratorConfig())

  @Test
  def test_FlinkGenerator_GenerateScala_WithListSourceAndMapOfOneRecord_GeneratesCodeThatCompilesAndOutputsMappedRecord(): Unit = {
    val input = Stream.of[IntRecord].withName("input")
    val output = input.map(r => IntRecord(r.i + 1)).withName("output")
    val graph = new StreamGraph(output)

    val config = new ApplicationConfiguration
    config.setListSource(input, IntRecord(1))

    val result = TestApplicationExecutor.executeApplication(graph, config, 10, output)

    val outputRecords = result.getRecords(output)
    assertEquals(List(IntRecord(2)), outputRecords)
  }

  @Test
  def test_FlinkGenerator_GenerateScala_WithS3DataSource_GeneratesCodeThatCompiles(): Unit = {
    val input = Stream.of[IntRecord].withName("input")
    val output = input.map(r => IntRecord(r.i + 1)).withName("output")
    val graph = new StreamGraph(output)

    val config = new ApplicationConfiguration
    config.setSource(input, new S3DataSource[IntRecord]("bucket", "prefix", new JsonDataInputFormat[IntRecord]()))

    val generatedCode = this.generator.generateScala(graph, config, "", "TestApp")

    this.eval(generatedCode)
  }

  private def eval(code: String): Any = {
    try {
      val tb = ToolBox(universe.runtimeMirror(this.getClass.getClassLoader)).mkToolBox()
      val tree = tb.parse(code)
      tb.eval(tree)
    } catch {
      case ex: Throwable =>
        Console.println(code)
        throw ex
    }
  }
}
Example 8
Source File: ApplicationFeature.scala From CMAK with Apache License 2.0
package features

import com.typesafe.config.Config
import grizzled.slf4j.Logging
import kafka.manager.features.KMFeature

import scala.util.{Success, Failure, Try}

sealed trait ApplicationFeature extends KMFeature

case object KMClusterManagerFeature extends ApplicationFeature
case object KMTopicManagerFeature extends ApplicationFeature
case object KMPreferredReplicaElectionFeature extends ApplicationFeature
case object KMScheduleLeaderElectionFeature extends ApplicationFeature
case object KMReassignPartitionsFeature extends ApplicationFeature
case object KMBootstrapClusterConfigFeature extends ApplicationFeature

object ApplicationFeature extends Logging {
  import scala.reflect.runtime.universe
  val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)

  def from(s: String) : Option[ApplicationFeature] = {
    Try {
      val clazz = s"features.$s"
      val module = runtimeMirror.staticModule(clazz)
      val obj = runtimeMirror.reflectModule(module)
      obj.instance match {
        case f: ApplicationFeature => f
        case _ =>
          throw new IllegalArgumentException(s"Unknown application feature $s")
      }
    } match {
      case Failure(t) =>
        error(s"Unknown application feature $s")
        None
      case Success(f) => Option(f)
    }
  }
}

case class ApplicationFeatures(features: Set[ApplicationFeature])

object ApplicationFeatures extends Logging {

  lazy val default : List[String] = List(
    KMClusterManagerFeature,
    KMTopicManagerFeature,
    KMPreferredReplicaElectionFeature,
    KMReassignPartitionsFeature).map(_.getClass.getSimpleName)

  def getApplicationFeatures(config: Config) : ApplicationFeatures = {
    import scala.collection.JavaConverters._
    val configFeatures: Option[List[String]] = Try(config.getStringList("application.features").asScala.toList).toOption

    if(configFeatures.isEmpty) {
      warn(s"application.features not found in conf file, using default values $default")
    }

    val f = configFeatures.getOrElse(default).map(ApplicationFeature.from).flatten
    ApplicationFeatures(f.toSet)
  }
}
Example 9
Source File: KMFeature.scala From CMAK with Apache License 2.0
package kafka.manager.features import grizzled.slf4j.Logging import kafka.manager.model.{Kafka_0_8_1_1, ClusterConfig} import scala.collection.mutable.ListBuffer import scala.util.{Success, Failure, Try} trait KMFeature sealed trait ClusterFeature extends KMFeature case object KMLogKafkaFeature extends ClusterFeature case object KMDeleteTopicFeature extends ClusterFeature case object KMJMXMetricsFeature extends ClusterFeature case object KMDisplaySizeFeature extends ClusterFeature case object KMPollConsumersFeature extends ClusterFeature object ClusterFeature extends Logging { import scala.reflect.runtime.universe val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader) def from(s: String) : Option[ClusterFeature] = { Try { val clazz = s"features.$s" val module = runtimeMirror.staticModule(clazz) val obj = runtimeMirror.reflectModule(module) obj.instance match { case f: ClusterFeature => f case _ => throw new IllegalArgumentException(s"Unknown application feature $s") } } match { case Failure(t) => error(s"Unknown application feature $s") None case Success(f) => Option(f) } } } case class ClusterFeatures(features: Set[ClusterFeature]) object ClusterFeatures { val default = ClusterFeatures(Set()) def from(clusterConfig: ClusterConfig) : ClusterFeatures = { val buffer = new ListBuffer[ClusterFeature] if(clusterConfig.logkafkaEnabled) buffer+=KMLogKafkaFeature if(clusterConfig.jmxEnabled) buffer+=KMJMXMetricsFeature if(clusterConfig.displaySizeEnabled) buffer+=KMDisplaySizeFeature if(clusterConfig.version != Kafka_0_8_1_1) buffer+=KMDeleteTopicFeature if(clusterConfig.pollConsumers) buffer+=KMPollConsumersFeature ClusterFeatures(buffer.toSet) } }
Example 10
Source File: HBaseCredentialProvider.scala From sparkoscope with Apache License 2.0
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 11
Source File: IntegrationsModule.scala From scuruto with MIT License
package module

import integration._
import skinny.SkinnyConfig

class IntegrationsModule extends scaldi.Module {
  private val DEFAULT_SERVICE = "Null"

  val service = SkinnyConfig.stringConfigValue("externalIntegration.service").map { configValue =>
    configValue.capitalize
  } getOrElse DEFAULT_SERVICE

  import scala.reflect.runtime.universe
  val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)
  val module = runtimeMirror.staticModule(s"integration.${service}Integration")
  val obj = runtimeMirror.reflectModule(module)
  val integration = obj.instance

  bind[ExternalServiceIntegration] to integration.asInstanceOf[ExternalServiceIntegration]
}
Example 12
Source File: UploadControllerFactory.scala From scuruto with MIT License
package controller

import skinny.SkinnyConfig

object UploadControllerFactory {
  private val DEFAULT_DESTINATION = "Local"

  val create: UploadController = {
    val destination = SkinnyConfig.stringConfigValue("upload.destination").map { configValue =>
      configValue.capitalize
    } getOrElse DEFAULT_DESTINATION

    import scala.reflect.runtime.universe
    val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)
    val module = runtimeMirror.staticModule(s"controller.upload.${destination}UploadController")
    val obj = runtimeMirror.reflectModule(module)
    val controller = obj.instance
    controller.asInstanceOf[UploadController]
  }
}
Example 13
Source File: L8-8Sql.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrSqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrSqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() sqlC.dropTempTable("cdrs") }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 14
Source File: ContractProxyModule.scala From fintrospect with Apache License 2.0
package io.fintrospect

import com.twitter.finagle.Service
import com.twitter.finagle.http.path.{Path, Root}
import com.twitter.finagle.http.{Request, Response}
import io.fintrospect.renderers.swagger2dot0.{ApiInfo, Swagger2dot0Json}

import scala.reflect.runtime.universe.TypeTag
import scala.reflect.runtime.{currentMirror, universe}

object ContractProxyModule {
  def apply[T <: Contract](name: String, service: Service[Request, Response], contract: T, rootPath: Path = Root, description: String = null)(implicit tag: TypeTag[T]): RouteModule[Request, Response] = {
    val descriptionOption = Option(description).getOrElse(s"Proxy services for $name API")
    val routes = universe.typeOf[T].members
      .filter(_.isModule)
      .map(_.asModule)
      .map(currentMirror.reflectModule(_).instance)
      .filter(_.isInstanceOf[ContractEndpoint])
      .map(_.asInstanceOf[ContractEndpoint].route)

    routes.foldLeft(RouteModule(rootPath, Swagger2dot0Json(ApiInfo(name, name, descriptionOption)))) {
      (spec, route) => spec.withRoute(route.bindToProxy(service))
    }
  }
}
Example 15
Source File: ScalaCleanCompilerPlugin.scala From ScalaClean with Apache License 2.0
package org.scalaclean.analysis

import org.scalaclean.analysis.plugin.{ExtensionPlugin, ExtensionPluginFactory, JunitPlugin, ModsPlugin}

import scala.tools.nsc.Global
import scala.tools.nsc.plugins.{Plugin, PluginComponent}

class ScalaCleanCompilerPlugin(override val global: Global) extends Plugin {

  override val name: String = "scalaclean-analysis-plugin"
  override val description: String = "ScalaClean analysis plugin"

  val component = new ScalaCompilerPluginComponent(global)

  //hardcoded for the moment
  component.extensions += ModsPlugin.create(component, "")
  component.extensions += JunitPlugin.create(component, "")

  override def processOptions(
      options: List[String],
      error: String => Unit): Unit = {
    import scala.reflect.runtime.universe
    val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)

    val realOptions = options.distinct
    component.options = realOptions
    for (option <- realOptions) {
      if (option == "debug:true") {
        component.debug = true
      } else if (option.startsWith("extension:")) {
        val end = {
          val end = option.indexOf(':', 10)
          if (end == -1) option.length else end
        }
        val fqn = option.substring(10, end)
        val module = runtimeMirror.staticModule(fqn)
        runtimeMirror.reflectModule(module).instance match {
          case valid: ExtensionPluginFactory => component.extensions += valid.create(component, option.substring(end))
          case null => throw new IllegalArgumentException("not a valid Extension FQN - expected the name of an object")
          case invalid => throw new IllegalArgumentException(s"not a valid Extension FQN - ${invalid.getClass.getName} is not a ${classOf[ExtensionDescriptor[_]].getName}")
        }
      } else if (option.startsWith("srcdirs:")) {
        component.sourceDirs = option.substring(8).split(java.io.File.pathSeparatorChar).toList
      } else error(s"Option not recognised: $option")
    }
  }

  override val optionsHelp: Option[String] = Some(
    //
    s"""-P:$name:debug:true Set debugging on the ScalaClean analysis plugin
       |-P:$name:srcdirs The path of sources, separated by ${java.io.File.pathSeparatorChar}
       |-P:$name:extension:<fqn> Add an extension dataset. FQN is the fully qualified name of the appropriate ExtensionDescriptor object
       |""".stripMargin)

  override val components: List[PluginComponent] = List(component)
}
Example 16
Source File: ChewerJob.scala From comet-data-pipeline with Apache License 2.0
package com.ebiznext.comet.job.ingest import com.ebiznext.comet.schema.handlers.StorageHandler import com.ebiznext.comet.schema.model.{Domain, Schema, Type} import com.ebiznext.comet.utils.{SparkJob, SparkJobResult} import org.apache.hadoop.fs.Path import scala.reflect.runtime.universe import scala.util.Try trait ChewerJob extends SparkJob { var domain: Domain var schema: Schema var types: List[Type] var path: List[Path] var storageHandler: StorageHandler def run( domain: Domain, schema: Schema, types: List[Type], path: List[Path], storageHandler: StorageHandler ): Try[SparkJobResult] = { this.domain = domain this.schema = schema this.types = types this.path = path this.storageHandler = storageHandler run() } } object ChewerJob { def run( objName: String, domain: Domain, schema: Schema, types: List[Type], path: List[Path], storageHandler: StorageHandler ): Try[SparkJobResult] = { val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader) val module = runtimeMirror.staticModule(objName) val obj: universe.ModuleMirror = runtimeMirror.reflectModule(module) val chewer = obj.instance.asInstanceOf[ChewerJob] chewer.run(domain, schema, types, path, storageHandler) } }
Example 17
Source File: MorpheusGraphTest.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.impl import org.apache.spark.sql.Row import org.opencypher.morpheus.api.io.MorpheusElementTable import org.opencypher.morpheus.api.value.MorpheusElement._ import org.opencypher.morpheus.impl.table.SparkTable.DataFrameTable import org.opencypher.morpheus.testing.MorpheusTestSuite import org.opencypher.morpheus.testing.fixture.{GraphConstructionFixture, RecordsVerificationFixture, TeamDataFixture} import org.opencypher.okapi.api.types._ import org.opencypher.okapi.relational.api.planning.RelationalRuntimeContext import org.opencypher.okapi.relational.api.table.RelationalCypherRecords import org.opencypher.okapi.relational.impl.operators.Start import org.opencypher.okapi.testing.Bag import scala.reflect.runtime.universe abstract class MorpheusGraphTest extends MorpheusTestSuite with GraphConstructionFixture with RecordsVerificationFixture with TeamDataFixture { object MorpheusGraphTest { implicit class RecordOps(records: RelationalCypherRecords[DataFrameTable]) { def planStart: Start[DataFrameTable] = { implicit val tableTypeTag: universe.TypeTag[DataFrameTable] = morpheus.tableTypeTag implicit val context: RelationalRuntimeContext[DataFrameTable] = morpheus.basicRuntimeContext() Start.fromEmptyGraph(records) } } } it("should return only nodes with that exact label (single label)") { val graph = initGraph(dataFixtureWithoutArrays) val nodes = graph.nodes("n", CTNode("Person"), exactLabelMatch = true) val cols = Seq( n, nHasLabelPerson, nHasPropertyLuckyNumber, nHasPropertyName ) verify(nodes, cols, Bag(Row(4L.encodeAsMorpheusId.toList, true, 8L, "Donald"))) } it("should return only nodes with that exact label (multiple labels)") { val graph = initGraph(dataFixtureWithoutArrays) val nodes = graph.nodes("n", CTNode("Person", "German"), exactLabelMatch = true) val cols = Seq( n, nHasLabelGerman, nHasLabelPerson, nHasPropertyLuckyNumber, nHasPropertyName ) val data = Bag( Row(2L.encodeAsMorpheusId.toList, true, true, 1337L, "Martin"), Row(3L.encodeAsMorpheusId.toList, true, true, 8L, "Max"), Row(0L.encodeAsMorpheusId.toList, true, true, 42L, "Stefan") ) verify(nodes, cols, data) } it("should support the same node label from multiple node tables") { // this creates additional :Person nodes val personsPart2 = morpheus.sparkSession.createDataFrame( Seq( (5L, false, "Soeren", 23L), (6L, false, "Hannes", 42L)) ).toDF("ID", "IS_SWEDE", "NAME", "NUM") val personTable2 = MorpheusElementTable.create(personTable.mapping, personsPart2) val graph = morpheus.graphs.create(personTable, personTable2) graph.nodes("n").size shouldBe 6 } it("should support the same relationship type from multiple relationship tables") { // this creates additional :KNOWS relationships val knowsParts2 = morpheus.sparkSession.createDataFrame( Seq( (1L, 7L, 2L, 2017L), (1L, 8L, 3L, 2016L)) ).toDF("SRC", "ID", "DST", "SINCE") val knowsTable2 = MorpheusElementTable.create(knowsTable.mapping, knowsParts2) val graph = morpheus.graphs.create(personTable, knowsTable, knowsTable2) graph.relationships("r").size shouldBe 8 } it("should return an empty result for non-present types") { val graph = morpheus.graphs.create(personTable, knowsTable) graph.nodes("n", CTNode("BAR")).size shouldBe 0 graph.relationships("r", CTRelationship("FOO")).size shouldBe 0 } }
Example 18
Source File: HBaseCredentialProvider.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 19
Source File: TypeInfo.scala From avro4s with Apache License 2.0
package com.sksamuel.avro4s import magnolia.TypeName import scala.util.Try case class TypeInfo(owner: String, short: String, typeArguments: Seq[TypeInfo], nameAnnotation: Option[String], namespaceAnnotation: Option[String], erased: Boolean) { val full: String = s"$owner.$short" } object TypeInfo { import scala.reflect.runtime.universe def apply(typeName: TypeName, annos: Seq[Any]): TypeInfo = { val annotationExtractors = new AnnotationExtractors(annos) TypeInfo( typeName.owner, typeName.short, typeName.typeArguments.map(TypeInfo.fromTypeName), annotationExtractors.name, annotationExtractors.namespace, annotationExtractors.erased ) } def fromTypeName(typeName: TypeName): TypeInfo = { // try to populate from the class name, but this may fail if the class is not top level // if it does fail then we default back to using what magnolia provides val maybeType: Option[universe.Type] = Try { val mirror = universe.runtimeMirror(Thread.currentThread().getContextClassLoader) val classsym = mirror.staticClass(typeName.full) classsym.toType }.toOption TypeInfo( owner = typeName.owner, short = typeName.short, typeArguments = typeName.typeArguments.map(fromTypeName), nameAnnotation = maybeType.flatMap(nameAnnotation), namespaceAnnotation = maybeType.flatMap(namespaceAnnotation), erased = maybeType.exists(erased) ) } def fromClass[A](klass: Class[A]): TypeInfo = { import scala.reflect.runtime.universe val mirror = universe.runtimeMirror(Thread.currentThread().getContextClassLoader) val sym = mirror.classSymbol(klass) val tpe = sym.toType TypeInfo.fromType(tpe) } private def nameAnnotation(tpe: universe.Type): Option[String] = { import scala.reflect.runtime.universe._ tpe.typeSymbol.typeSignature.typeSymbol.annotations.collectFirst { case a if a.tree.tpe =:= typeOf[AvroName] => val annoValue = a.tree.children.tail.head.asInstanceOf[Literal].value.value annoValue.toString } } private def namespaceAnnotation(tpe: universe.Type): Option[String] = { import scala.reflect.runtime.universe._ tpe.typeSymbol.typeSignature.typeSymbol.annotations.collectFirst { case a if a.tree.tpe =:= typeOf[AvroNamespace] => val annoValue = a.tree.children.tail.head.asInstanceOf[Literal].value.value annoValue.toString } } private def erased(tpe: universe.Type): Boolean = { import scala.reflect.runtime.universe._ tpe.typeSymbol.typeSignature.typeSymbol.annotations.exists { case a if a.tree.tpe =:= typeOf[AvroErasedName] => true case _ => false } } def fromType(tpe: universe.Type): TypeInfo = { TypeInfo( tpe.typeSymbol.owner.fullName, tpe.typeSymbol.name.decodedName.toString, tpe.typeArgs.map(fromType), nameAnnotation(tpe), namespaceAnnotation(tpe), erased(tpe) ) } }
Example 20
Source File: HBaseDelegationTokenProvider.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[security] class HBaseDelegationTokenProvider extends HadoopDelegationTokenProvider with Logging { override def serviceName: String = "hbase" override def obtainDelegationTokens( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def delegationTokensRequired( sparkConf: SparkConf, hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 21
Source File: Platform.scala From neo-sbt-scalafmt with Apache License 2.0
package com.lucidchart.sbt.scalafmt

import sbt.Keys._
import sbt._
import sbt.internal.inc.Analysis
import sbt.util.CacheImplicits._
import sbt.util.CacheStore

import scala.reflect.runtime.universe

object AnalysisPlatform {
  def counted(prefix: String, single: String, plural: String, count: Int) =
    Analysis.counted(prefix, single, plural, count)
}

object CachePlatform {
  private[this] val mirror = universe.runtimeMirror(getClass.getClassLoader)

  private[this] val fileHashModified = {
    val module = mirror.reflectModule(mirror.staticModule("sbt.util.FileHashModified"))
    mirror.reflect(module.instance).reflectMethod(module.symbol.info.decl(universe.TermName("apply")).asMethod)
  }

  def fileInfo(file: File, hash: List[Byte], lastModified: Long) =
    fileHashModified(file, hash, lastModified.asInstanceOf[AnyRef]).asInstanceOf[HashModifiedFileInfo]

  def readFileInfo(cache: File) = CacheStore(cache).read(Set.empty[HashModifiedFileInfo])

  def writeFileInfo(cache: File, value: Set[HashModifiedFileInfo]) = CacheStore(cache).write(value)
}

object CommandPlatform {
  val CommandStrings = sbt.internal.CommandStrings
}

object LibraryPlatform {
  def moduleInfo(useIvy: SettingKey[Boolean]) = scalaModuleInfo := {
    if (useIvy.value) {
      // otherwise scala-library conflicts
      scalaModuleInfo.value.map(_.withOverrideScalaVersion(false))
    } else {
      scalaModuleInfo.value
    }
  }
}
Example 22
Source File: SysProps.scala From scio with Apache License 2.0
package com.spotify.scio import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.reflect.ClassPath final case class SysProp(flag: String, description: String) { def value(default: => String): String = sys.props.getOrElse(flag, default) def value: String = sys.props(flag) def valueOption: Option[String] = sys.props.get(flag) def value_=(str: String): Unit = sys.props(flag) = str def show: String = s"-D$flag=<String>\n\t$description" } trait SysProps { def properties: List[SysProp] def show: String = { val props = properties.map(p => s" ${p.show}").mkString("\n") val name = this.getClass.getName.replace("$", "") s"$name:\n$props\n" } } object SysProps { import scala.jdk.CollectionConverters._ import scala.reflect.runtime.universe def properties: Iterable[SysProps] = { val classLoader = Thread.currentThread().getContextClassLoader val runtimeMirror = universe.runtimeMirror(classLoader) ClassPath .from(classLoader) .getAllClasses .asScala .filter(_.getName.endsWith("SysProps")) .flatMap { clsInfo => try { val cls = clsInfo.load() cls.getMethod("properties") val module = runtimeMirror.staticModule(cls.getName) val obj = runtimeMirror.reflectModule(module) Some(obj.instance.asInstanceOf[SysProps]) } catch { case _: Throwable => None } } } } @registerSysProps object CoreSysProps { val Project = SysProp("project", "") val Home = SysProp("java.home", "java home directory") val TmpDir = SysProp("java.io.tmpdir", "java temporary directory") val User = SysProp("user.name", "system username") val UserDir = SysProp("user.dir", "user dir") }
Example 23
Source File: WarpPropertyLike.scala From warp-core with MIT License
package com.workday.warp.common import org.pmw.tinylog.Logger import scala.reflect.runtime.universe import scala.reflect.runtime.universe.{Mirror, ModuleSymbol, MethodSymbolApi, Type} def values[T <: WarpPropertyLike](`class`: Class[T]): Seq[PropertyEntry] = { Logger.debug(s"getting property values for ${`class`.getCanonicalName}") val mirror: Mirror = universe.runtimeMirror(`class`.getClassLoader) // concrete type of the property holder class // TODO don't read this as a static module, this breaks nested config objects, eg those defined within a class or method. val module: ModuleSymbol = mirror.staticModule(`class`.getCanonicalName) // we reflected this as a module (singleton), so get the single instance, and obtain a mirror for that instance. val instanceMirror = mirror.reflect(mirror.reflectModule(module).instance) // the `MODULE$` field holds all the members we are really interested in. module.info.members.find(_.name == universe.TermName("MODULE$")) match { case Some(member) => member.info.members.toSeq.collect { // retain only public accessor methods with the correct return type. // recall that scala vals are private fields with generated accessor methods. case method: MethodSymbolApi if method.isPublic && method.isAccessor && method.returnType =:= entryType => instanceMirror.reflectMethod(method.asMethod)().asInstanceOf[PropertyEntry] } case None => throw new RuntimeException(s"it appears that ${`class`.getCanonicalName} is not a scala object (does not have a MODULE$$ field)") } } }
Example 24
Source File: LocalKMeansModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.clustering import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.clustering.KMeansModel import org.apache.spark.mllib.clustering.{KMeansModel => OldKMeansModel} import org.apache.spark.mllib.linalg.{Vector => MLlibVec} import scala.reflect.runtime.universe class LocalKMeansModel(override val sparkTransformer: KMeansModel) extends LocalTransformer[KMeansModel] { lazy val parent: OldKMeansModel = { val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader) val parentTerm = universe.typeOf[KMeansModel].decl(universe.TermName("parentModel")).asTerm mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldKMeansModel] } override def transform(localData: LocalData): LocalData = { import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val newColumn = LocalDataColumn( sparkTransformer.getPredictionCol, column.data.mapToMlLibVectors.map(x => parent.predict(x)) ) localData.withColumn(newColumn) case None => localData } } } object LocalKMeansModel extends SimpleModelLoader[KMeansModel] with TypedTransformerConverter[KMeansModel] { override def build(metadata: Metadata, data: LocalData): KMeansModel = { val mapRows = data.toMapList val centers = mapRows map { row => val vec = DataUtils.constructVector(row("clusterCenter").asInstanceOf[Map[String, Any]]) org.apache.spark.mllib.linalg.Vectors.fromML(vec) } val parentConstructor = classOf[OldKMeansModel].getDeclaredConstructor(classOf[Array[MLlibVec]]) parentConstructor.setAccessible(true) val mlk = parentConstructor.newInstance(centers.toArray) val constructor = classOf[KMeansModel].getDeclaredConstructor(classOf[String], classOf[OldKMeansModel]) constructor.setAccessible(true) var inst = constructor .newInstance(metadata.uid, mlk) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) inst = inst.set(inst.k, metadata.paramMap("k").asInstanceOf[Number].intValue()) inst = inst.set(inst.initMode, metadata.paramMap("initMode").asInstanceOf[String]) inst = inst.set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue()) inst = inst.set(inst.initSteps, metadata.paramMap("initSteps").asInstanceOf[Number].intValue()) inst = inst.set(inst.seed, metadata.paramMap("seed").toString.toLong) inst = inst.set(inst.tol, metadata.paramMap("tol").asInstanceOf[Double]) inst } override implicit def toLocal(transformer: KMeansModel) = new LocalKMeansModel(transformer) }
Example 25
Source File: ShapeConverter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.serializer.converters import com.intel.analytics.bigdl.tensor.TensorNumericMath import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, SerializeContext} import com.intel.analytics.bigdl.utils.{MultiShape, SingleShape, Shape => BigDLShape} import com.intel.analytics.bigdl.serialization.Bigdl import com.intel.analytics.bigdl.serialization.Bigdl.Shape.ShapeType import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, BigDLModule, DataType, Shape} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.reflect.runtime.universe object ShapeConverter extends DataConverter { override def getAttributeValue[T: ClassTag] (context: DeserializeContext, attribute: Bigdl.AttrValue) (implicit ev: TensorNumericMath.TensorNumeric[T]): AnyRef = { val shape = attribute.getShape toBigDLShape(shape) } private def toBigDLShape(shape : Shape): BigDLShape = { if (shape.getSsize == 0) { // null is mapped to empty shape on the serialization stage. return null } if (shape.getShapeType == ShapeType.SINGLE) { val shapeValues = shape.getShapeValueList.asScala.toList.map(_.intValue) SingleShape(shapeValues) } else if (shape.getShapeType == ShapeType.MULTI) { val shapes = shape.getShapeList.asScala.toList.map(toBigDLShape(_)) MultiShape(shapes) } else { throw new RuntimeException(s"${shape.getShapeType} not supported for now") } } def shapeToBigDL[T: ClassTag](context: DeserializeContext, model: BigDLModule, name: String) (implicit ev: TensorNumericMath.TensorNumeric[T]): BigDLShape = { val attrbute = AttrValue.newBuilder attrbute.setShape( name match { case "input" => model.getInputShape case "output" => model.getOutputShape }) ShapeConverter.getAttributeValue(context, attrbute.build).asInstanceOf[BigDLShape] } def shapeToProto[T: ClassTag](context: SerializeContext[T], shape: BigDLShape) (implicit ev: TensorNumericMath.TensorNumeric[T]): Shape = { val attribute = AttrValue.newBuilder ShapeConverter.setAttributeValue(context, attribute, shape, universe.typeOf[BigDLShape]) attribute.getShape } override def setAttributeValue[T: ClassTag] (context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any, valueType: universe.Type)(implicit ev: TensorNumericMath.TensorNumeric[T]): Unit = { attributeBuilder.setDataType(DataType.SHAPE) if (value != null) { val shape = value.asInstanceOf[BigDLShape] val shapeBuilder = Shape.newBuilder setShape(shape, shapeBuilder) attributeBuilder.setShape(shapeBuilder.build) } } private def setShape(bigdlShape : BigDLShape, shapeBuilder : Shape.Builder): Unit = { if (bigdlShape.isInstanceOf[SingleShape]) { shapeBuilder.setShapeType(ShapeType.SINGLE) val shapes = bigdlShape.toSingle shapeBuilder.setSsize(shapes.size) shapes.foreach(shape => { shapeBuilder.addShapeValue(shape) }) } else if (bigdlShape.isInstanceOf[MultiShape]) { shapeBuilder.setShapeType(ShapeType.MULTI) val shapes = bigdlShape.toMulti shapeBuilder.setSsize(shapes.size) shapes.foreach(shape => { val subShapeBuilder = Shape.newBuilder setShape(shape, subShapeBuilder) shapeBuilder.addShape(subShapeBuilder.build) }) } else { throw new RuntimeException(s"${bigdlShape} type not supported !") } } }
Example 26
Source File: DataFormatConverter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.serializer.converters import com.intel.analytics.bigdl.nn.abstractnn.DataFormat import com.intel.analytics.bigdl.nn.abstractnn.DataFormat.{NCHW, NHWC} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, SerializeContext} import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, DataType, InputDataFormat} import scala.reflect.ClassTag import scala.reflect.runtime.universe object DataFormatConverter extends DataConverter { override def getAttributeValue[T: ClassTag](context: DeserializeContext, attribute: AttrValue) (implicit ev: TensorNumeric[T]): AnyRef = { val dataFormat = attribute.getDataFormatValue dataFormat match { case InputDataFormat.NCHW => NCHW case InputDataFormat.NHWC => NHWC } } override def setAttributeValue[T: ClassTag] (context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any, valueType: universe.Type) (implicit ev: TensorNumeric[T]): Unit = { attributeBuilder.setDataType(DataType.DATA_FORMAT) if (value != null) { val dataFormat = value.asInstanceOf[DataFormat] val inputFormat = dataFormat match { case NCHW => InputDataFormat.NCHW case NHWC => InputDataFormat.NHWC } attributeBuilder.setDataFormatValue(inputFormat) } } }
Example 27
Source File: VariableFormatConverter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.serializer.converters import com.intel.analytics.bigdl.nn.VariableFormat import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, SerializeContext} import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, DataType, VarFormat} import scala.reflect.ClassTag import scala.reflect.runtime.universe object VariableFormatConverter extends DataConverter { override def getAttributeValue[T: ClassTag](context: DeserializeContext, attribute: AttrValue) (implicit ev: TensorNumeric[T]): AnyRef = { val format = attribute.getVariableFormatValue format match { case VarFormat.DEFAULT => VariableFormat.Default case VarFormat.ONE_D => VariableFormat.ONE_D case VarFormat.IN_OUT => VariableFormat.IN_OUT case VarFormat.OUT_IN => VariableFormat.OUT_IN case VarFormat.IN_OUT_KW_KH => VariableFormat.IN_OUT_KW_KH case VarFormat.OUT_IN_KW_KH => VariableFormat.OUT_IN_KW_KH case VarFormat.GP_OUT_IN_KW_KH => VariableFormat.GP_OUT_IN_KW_KH case VarFormat.GP_IN_OUT_KW_KH => VariableFormat.GP_IN_OUT_KW_KH case VarFormat.OUT_IN_KT_KH_KW => VariableFormat.OUT_IN_KT_KH_KW case VarFormat.EMPTY_FORMAT => null } } override def setAttributeValue[T: ClassTag]( context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any, valueType: universe.Type = null)(implicit ev: TensorNumeric[T]): Unit = { attributeBuilder.setDataType(DataType.VARIABLE_FORMAT) if (value != null) { val format = value.asInstanceOf[VariableFormat] val formatValue = format match { case VariableFormat.Default => VarFormat.DEFAULT case VariableFormat.ONE_D => VarFormat.ONE_D case VariableFormat.IN_OUT => VarFormat.IN_OUT case VariableFormat.OUT_IN => VarFormat.OUT_IN case VariableFormat.IN_OUT_KW_KH => VarFormat.IN_OUT_KW_KH case VariableFormat.OUT_IN_KW_KH => VarFormat.OUT_IN_KW_KH case VariableFormat.GP_OUT_IN_KW_KH => VarFormat.GP_OUT_IN_KW_KH case VariableFormat.GP_IN_OUT_KW_KH => VarFormat.GP_IN_OUT_KW_KH case VariableFormat.OUT_IN_KT_KH_KW => VarFormat.OUT_IN_KT_KH_KW } attributeBuilder.setVariableFormatValue(formatValue) } else { attributeBuilder.setVariableFormatValue(VarFormat.EMPTY_FORMAT) } } }
Example 28
Source File: ModuleConverter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.serializer.converters import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, ModuleData, ModuleSerializer, SerializeContext} import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, DataType} import scala.reflect.ClassTag import scala.reflect.runtime.universe object ModuleConverter extends DataConverter { override def getAttributeValue[T: ClassTag](context: DeserializeContext, attribute: AttrValue) (implicit ev: TensorNumeric[T]): AnyRef = { val serializedModule = attribute.getBigDLModuleValue if (serializedModule.getModuleType != null && serializedModule.getModuleType != "") { ModuleSerializer.load(DeserializeContext(serializedModule, context.storages, context.storageType)).module } else { null } } override def setAttributeValue[T: ClassTag](context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any, valueType: universe.Type = null)(implicit ev: TensorNumeric[T]): Unit = { attributeBuilder.setDataType(DataType.MODULE) if (value != null) { val module = value.asInstanceOf[AbstractModule[Activity, Activity, T]] val serializableModule = ModuleSerializer. serialize(SerializeContext(ModuleData(module, Seq[String](), Seq[String]()), context.storages, context.storageType)).bigDLModule attributeBuilder.setBigDLModuleValue(serializableModule) } } }
Example 29
Source File: InitMethodConverter.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.serializer.converters import com.intel.analytics.bigdl.nn._ import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, SerializeContext} import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, DataType, InitMethod, InitMethodType} import scala.reflect.ClassTag import scala.reflect.runtime.universe object InitMethodConverter extends DataConverter { override def getAttributeValue[T: ClassTag](context: DeserializeContext, attribute: AttrValue) (implicit ev: TensorNumeric[T]): AnyRef = { val initMemethod = attribute.getInitMethodValue val initType = initMemethod.getMethodType val methodData = initMemethod.getDataList initType match { case InitMethodType.RANDOM_UNIFORM => RandomUniform case InitMethodType.RANDOM_UNIFORM_PARAM => RandomUniform(methodData.get(0), methodData.get(1)) case InitMethodType.RANDOM_NORMAL => RandomNormal(methodData.get(0), methodData.get(1)) case InitMethodType.ZEROS => Zeros case InitMethodType.ONES => Ones case InitMethodType.CONST => ConstInitMethod(methodData.get(0)) case InitMethodType.XAVIER => Xavier case InitMethodType.BILINEARFILLER => BilinearFiller case InitMethodType.EMPTY_INITIALIZATION => null } } override def setAttributeValue[T: ClassTag]( context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any, valueType: universe.Type = null)(implicit ev: TensorNumeric[T]): Unit = { attributeBuilder.setDataType(DataType.INITMETHOD) val initMethodBuilder = InitMethod.newBuilder if (value != null) { val initMethod = value.asInstanceOf[InitializationMethod] initMethod match { case RandomUniform => initMethodBuilder.setMethodType(InitMethodType.RANDOM_UNIFORM) case ru: RandomUniform => initMethodBuilder.setMethodType(InitMethodType.RANDOM_UNIFORM_PARAM) initMethodBuilder.addData(ru.lower) initMethodBuilder.addData(ru.upper) case rm: RandomNormal => initMethodBuilder.setMethodType(InitMethodType.RANDOM_NORMAL) initMethodBuilder.addData(rm.mean) initMethodBuilder.addData(rm.stdv) case Zeros => initMethodBuilder.setMethodType(InitMethodType.ZEROS) case Ones => initMethodBuilder.setMethodType(InitMethodType.ONES) case const: ConstInitMethod => initMethodBuilder.setMethodType(InitMethodType.CONST) initMethodBuilder.addData(const.value) case Xavier => initMethodBuilder.setMethodType(InitMethodType.XAVIER) case BilinearFiller => initMethodBuilder.setMethodType(InitMethodType.BILINEARFILLER) } attributeBuilder.setInitMethodValue(initMethodBuilder.build) } else { initMethodBuilder.setMethodType(InitMethodType.EMPTY_INITIALIZATION) attributeBuilder.setInitMethodValue(initMethodBuilder.build) } } }
Example 30
Source File: RegularizerConverter.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.utils.serializer.converters

import com.intel.analytics.bigdl.optim.{L1L2Regularizer, L1Regularizer, L2Regularizer}
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, SerializeContext}
import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, DataType, RegularizerType, Regularizer => SerializeRegularizer}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe

object RegularizerConverter extends DataConverter {

  override def getAttributeValue[T: ClassTag](context: DeserializeContext, attribute: AttrValue)
    (implicit ev: TensorNumeric[T]): AnyRef = {
    val regularizer = attribute.getRegularizerValue
    val regularizerType = regularizer.getRegularizerType
    if (regularizer.getRegularDataCount == 0) {
      return null
    }
    regularizerType match {
      case RegularizerType.L1Regularizer =>
        val l1 = regularizer.getRegularDataList.get(0)
        L1Regularizer[T](l1)
      case RegularizerType.L2Regularizer =>
        val l2 = regularizer.getRegularDataList.get(1)
        L2Regularizer[T](l2)
      case RegularizerType.L1L2Regularizer =>
        val l1 = regularizer.getRegularDataList.get(0)
        val l2 = regularizer.getRegularDataList.get(1)
        L1L2Regularizer[T](l1, l2)
    }
  }

  override def setAttributeValue[T: ClassTag]
    (context: SerializeContext[T], attributeBuilder: AttrValue.Builder, value: Any,
     valueType: universe.Type = null)
    (implicit ev: TensorNumeric[T]): Unit = {
    attributeBuilder.setDataType(DataType.REGULARIZER)
    if (value != null) {
      val regularizerBuilder = SerializeRegularizer.newBuilder
      val regularizer = value.asInstanceOf[L1L2Regularizer[T]]
      val l1 = regularizer.l1
      val l2 = regularizer.l2
      regularizerBuilder.addRegularData(l1)
      regularizerBuilder.addRegularData(l2)
      val regularizerType = regularizer match {
        case l1: L1Regularizer[_] => RegularizerType.L1Regularizer
        case l2: L2Regularizer[_] => RegularizerType.L2Regularizer
        case l1l2: L1L2Regularizer[_] => RegularizerType.L1L2Regularizer
      }
      regularizerBuilder.setRegularizerType(regularizerType)
      attributeBuilder.setRegularizerValue(regularizerBuilder.build)
    }
  }
}
Example 31
Source File: SpatialDilatedConvolution.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.quantized

import com.intel.analytics.bigdl.nn.abstractnn.DataFormat
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.tensor.{FloatType, QuantizedTensor, Tensor}
import com.intel.analytics.bigdl.utils.serializer.converters.DataConverter
import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, ModuleData, SerializeContext}
import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, BigDLModule}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe

@SerialVersionUID(-8572055756810843156L)
private[bigdl] class SpatialDilatedConvolution[T: ClassTag](
  nInputPlane: Int, // The number of expected input planes in the image given into forward()
  nOutputPlane: Int, // The number of output planes the convolution layer will produce.
  kernelW: Int, // The kernel width of the convolution
  kernelH: Int, // The kernel height of the convolution
  strideW: Int = 1, // The step of the convolution in the width dimension.
  strideH: Int = 1, // The step of the convolution in the height dimension.
  padW: Int = 0, // The additional zeros added per width to the input planes.
  padH: Int = 0, // The additional zeros added per height to the input planes.
  val dilationW: Int = 1,
  val dilationH: Int = 1,
  format: DataFormat = DataFormat.NCHW
)(implicit ev: TensorNumeric[T]) extends SpatialConvolution[T](
  nInputPlane, nOutputPlane, kernelW, kernelH, strideW, strideH, padW, padH, format = format
) {

  override val dilationWidth: Int = dilationW
  override val dilationHeight: Int = dilationH

  override def toString(): String = {
    s"quantized.SpatialDilatedConvolution($nInputPlane -> $nOutputPlane, $kernelW x" +
      s" $kernelH, $strideW, $strideH, $padW, $padH, $dilationW, $dilationH)"
  }
}

object SpatialDilatedConvolution extends QuantSerializer {
  def apply[T: ClassTag](
    nInputPlane: Int,
    nOutputPlane: Int,
    kW: Int,
    kH: Int,
    dW: Int = 1,
    dH: Int = 1,
    padW: Int = 0,
    padH: Int = 0,
    dilationW: Int = 1,
    dilationH: Int = 1,
    initWeight: Tensor[T] = null,
    initBias: Tensor[T] = null,
    format: DataFormat = DataFormat.NCHW
  )(implicit ev: TensorNumeric[T]): SpatialDilatedConvolution[T] = {
    val conv = new SpatialDilatedConvolution[T](nInputPlane, nOutputPlane, kW, kH, dW, dH,
      padW, padH, dilationW, dilationH, format = format)
    conv.initWeightAndBias(initWeight, initBias)
  }

  override def serializeWeight[T: ClassTag](context: SerializeContext[T],
    modelBuilder: BigDLModule.Builder)(implicit ev: TensorNumeric[T]): Unit = {
    val module = context.moduleData.module
    val conv = module.asInstanceOf[SpatialDilatedConvolution[T]]
    val weightBuilder = AttrValue.newBuilder
    ev.getType() match {
      case FloatType =>
        DataConverter.setAttributeValue(context, weightBuilder, conv.weight,
          universe.typeOf[Array[Tensor[Float]]])
      case _ => throw new UnsupportedOperationException(s"Only support Float for quantized model")
    }
    modelBuilder.putAttr("weights", weightBuilder.build)
  }

  override def loadWeight[T: ClassTag](context: DeserializeContext,
    moduleData: ModuleData[T])(implicit ev: TensorNumeric[T]): Unit = {
    val conv = moduleData.module.asInstanceOf[SpatialDilatedConvolution[T]]
    val attrMap = context.bigdlModule.getAttrMap
    val weights = DataConverter.getAttributeValue(context, attrMap.get("weights"))
      .asInstanceOf[Array[Tensor[T]]]
    for (i <- 0 until conv.weight.length) {
      conv.weight(i).asInstanceOf[QuantizedTensor[T]].release()
      conv.weight(i).set(weights(i))
    }
  }
}
Example 32
Source File: RandomUniform.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.nn.ops

import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity}
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.tensor._
import com.intel.analytics.bigdl.utils.RandomGenerator
import com.intel.analytics.bigdl.utils.serializer.converters.DataConverter
import com.intel.analytics.bigdl.utils.serializer.{DeserializeContext, ModuleSerializable, SerializeContext}
import com.intel.analytics.bigdl.serialization.Bigdl.{AttrValue, BigDLModule}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe

private[bigdl] trait RandomNode

class RandomUniform[T: ClassTag, D: ClassTag](
  val minVal: Double,
  val maxVal: Double,
  val seed: Option[Int] = None
)(implicit ev: TensorNumeric[T], ev2: TensorNumeric[D])
  extends Operation[Tensor[Int], Tensor[D], T] with RandomNode {

  if (seed.isDefined) {
    RandomGenerator.RNG.setSeed(seed.get)
  }

  output = Activity.allocate[Tensor[D], D]()

  override def updateOutput(input: Tensor[Int]): Tensor[D] = {
    require(input.nDimension() == 1, "the shape should be a one-dimensional tensor.")
    val shape = input.storage().toArray
    output.resize(shape).rand(minVal, maxVal)
    output
  }

  override def getClassTagNumerics(): (Array[ClassTag[_]], Array[TensorNumeric[_]]) = {
    (Array[ClassTag[_]](scala.reflect.classTag[T], scala.reflect.classTag[D]),
      Array[TensorNumeric[_]](ev, ev2))
  }
}

object RandomUniform extends ModuleSerializable {
  def apply[T: ClassTag, D: ClassTag](
    minVal: Double,
    maxVal: Double,
    seed: Option[Int] = None)
    (implicit ev: TensorNumeric[T], ev2: TensorNumeric[D]): Operation[Activity, Activity, T] =
    ModuleToOperation[T](new RandomUniform[T, D](minVal, maxVal, seed))

  override def doSerializeModule[T: ClassTag](context: SerializeContext[T],
    bigDLModelBuilder: BigDLModule.Builder)(implicit ev: TensorNumeric[T]): Unit = {
    val randomUniform = context.moduleData.module.asInstanceOf[RandomUniform[T, _]]

    val minValBuilder = AttrValue.newBuilder
    DataConverter.setAttributeValue(context, minValBuilder, randomUniform.minVal,
      universe.typeOf[Double])
    bigDLModelBuilder.putAttr("minVal", minValBuilder.build)

    val maxValBuilder = AttrValue.newBuilder
    DataConverter.setAttributeValue(context, maxValBuilder, randomUniform.maxVal,
      universe.typeOf[Double])
    bigDLModelBuilder.putAttr("maxVal", maxValBuilder.build)

    if (randomUniform.seed.isDefined) {
      val seedBuilder = AttrValue.newBuilder
      DataConverter.setAttributeValue(context, seedBuilder, randomUniform.seed.get,
        universe.typeOf[Int])
      bigDLModelBuilder.putAttr("seed", seedBuilder.build)
    }
  }

  override def doLoadModule[T: ClassTag](context: DeserializeContext)
    (implicit ev: TensorNumeric[T]): AbstractModule[Activity, Activity, T] = {
    val attrMap = context.bigdlModule.getAttrMap
    val minVal = attrMap.get("minVal").getDoubleValue
    val maxVal = attrMap.get("maxVal").getDoubleValue
    var seed: Option[Int] = None
    if (attrMap.containsKey("seed")) {
      seed = Option[Int](attrMap.get("seed").getInt32Value)
    }
    RandomUniform(minVal, maxVal, seed)
  }
}
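Note: the BigDL converters above pass an explicit universe.Type (for example universe.typeOf[Double]) so the serializer knows which attribute encoding to use. The following is a minimal, standalone sketch of how such Type values are obtained and compared at runtime; the object name TypeTagDemo is illustrative and nothing here depends on BigDL.

import scala.reflect.runtime.universe

object TypeTagDemo {
  def main(args: Array[String]): Unit = {
    // typeOf requires a statically known type and yields a universe.Type value
    val doubleType: universe.Type = universe.typeOf[Double]
    val listType: universe.Type = universe.typeOf[List[Int]]

    // Types are compared structurally with =:= (equality) and <:< (subtyping)
    println(doubleType =:= universe.typeOf[Double]) // true
    println(listType <:< universe.typeOf[Seq[Int]]) // true

    // typeTag[T].tpe is an equivalent way to obtain the same Type
    println(universe.typeTag[Double].tpe =:= doubleType) // true
  }
}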
Example 33
Source File: SimpleScalaRiakDataframesExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.dataframes

import com.basho.riak.client.core.query.indexes.LongIntIndex
import com.basho.riak.client.core.query.Namespace
import com.basho.riak.spark._
import com.basho.riak.spark.util.RiakObjectConversionUtil
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.reflect.runtime.universe
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success}
import com.basho.riak.client.core.query.RiakObject
import com.basho.riak.client.api.RiakClient
import com.basho.riak.client.core.query.Location
import com.basho.riak.spark.rdd.RiakFunctions

object SimpleScalaRiakDataframesExample {
  private val bucketName = "users"

  case class UserData(user_id: String, name: String, age: Int, category: String)

  val testData = Seq(
    UserData("u1", "Ben", 23, "CategoryA"),
    UserData("u2", "Clair", 19, "CategoryB"),
    UserData("u3", "John", 21, null),
    UserData("u4", "Chris", 50, "Categoryc"),
    UserData("u5", "Mary", 15, "CategoryB"),
    UserData("u6", "George", 31, "CategoryC")
  )

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("Riak Spark Dataframes Example")
    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")

    val sc = new SparkContext(sparkConf)

    // Work with a clear bucket
    clearBucket(sparkConf)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // To enable toDF()
    import sqlContext.implicits._

    println(s"Saving data to Riak:\n$testData")

    // Save the test data to the Riak bucket
    sc.parallelize(testData).map { line =>
      val obj = RiakObjectConversionUtil.to(line)
      // RiakObjectConversionUtil.to() sets content type to text/plain if String is passed
      // Overwriting content type to application/json will allow automatic conversion to
      // User defined type when reading from Riak
      obj.setContentType("application/json")
      obj
    }.saveToRiak(bucketName)

    // Read from Riak with UDT to enable schema inference using reflection
    val df = sc.riakBucket[UserData](bucketName).queryAll.toDF

    println("Dataframe from Riak query:")
    df.show()

    df.registerTempTable("users")

    println("count by category")
    df.groupBy("category").count.show

    println("sort by num of letters")
    // Register user defined function
    sqlContext.udf.register("stringLength", (s: String) => s.length)
    sqlContext.sql("select user_id, name, stringLength(name) nameLength from users order by nameLength").show

    println("filter age >= 21")
    sqlContext.sql("select * from users where age >= 21").show
  }

  private def clearBucket(sparkConf: SparkConf): Unit = {
    val rf = RiakFunctions(sparkConf)
    rf.withRiakDo(session => {
      rf.resetAndEmptyBucketByName(bucketName)
    })
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 34
Source File: SparkDataframesTest.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.rdd

import scala.reflect.runtime.universe
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.junit.Assert._
import org.junit.{Before, Test}
import com.basho.riak.spark.toSparkContextFunctions
import org.junit.experimental.categories.Category

case class TestData(id: String, name: String, age: Int, category: String)

@Category(Array(classOf[RiakTSTests]))
class SparkDataframesTest extends AbstractRiakSparkTest {
  private val indexName = "creationNo"

  protected override val jsonData = Some(
    """[
      | {key: 'key1', value: {id: 'u1', name: 'Ben', age: 20, category: 'CategoryA'}},
      | {key: 'key2', value: {id: 'u2', name: 'Clair', age: 30, category: 'CategoryB'}},
      | {key: 'key3', value: {id: 'u3', name: 'John', age: 70}},
      | {key: 'key4', value: {id: 'u4', name: 'Chris', age: 10, category: 'CategoryC'}},
      | {key: 'key5', value: {id: 'u5', name: 'Mary', age: 40, category: 'CategoryB'}},
      | {key: 'key6', value: {id: 'u6', name: 'George', age: 50, category: 'CategoryC'}}
      |]""".stripMargin)

  protected override def initSparkConf() = super.initSparkConf().setAppName("Dataframes Test")

  var sqlContextHolder: SQLContext = _
  var df: DataFrame = _

  @Before
  def initializeDF(): Unit = {
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    sqlContextHolder = sqlContext
    df = sc.riakBucket[TestData](DEFAULT_NAMESPACE.getBucketNameAsString)
      .queryAll().toDF
    df.registerTempTable("test")
  }

  @Test
  def schemaTest(): Unit = {
    df.printSchema()
    val schema = df.schema.map(_.name).toList
    val fields = universe.typeOf[TestData].members.withFilter(!_.isMethod).map(_.name.toString.trim).toList
    assertEquals(schema.sorted, fields.sorted)
  }

  @Test
  def sqlQueryTest(): Unit = {
    val sqlResult = sqlContextHolder.sql("select * from test where category >= 'CategoryC'").toJSON.collect
    val expected =
      """ [
        | {id:'u4',name:'Chris',age:10,category:'CategoryC'},
        | {id:'u6',name:'George',age:50,category:'CategoryC'}
        | ]""".stripMargin
    assertEqualsUsingJSONIgnoreOrder(expected, stringify(sqlResult))
  }

  @Test
  def udfTest(): Unit = {
    sqlContextHolder.udf.register("stringLength", (s: String) => s.length)
    val udf = sqlContextHolder.sql("select name, stringLength(name) strLgth from test order by strLgth, name").toJSON.collect
    val expected =
      """ [
        | {name:'Ben',strLgth:3},
        | {name:'John',strLgth:4},
        | {name:'Mary',strLgth:4},
        | {name:'Chris',strLgth:5},
        | {name:'Clair',strLgth:5},
        | {name:'George',strLgth:6}
        | ]""".stripMargin
    assertEqualsUsingJSON(expected, stringify(udf))
  }

  @Test
  def grouppingTest(): Unit = {
    val groupped = df.groupBy("category").count.toJSON.collect
    val expected =
      """ [
        | {category:'CategoryA',count:1},
        | {category:'CategoryB',count:2},
        | {category:'CategoryC',count:2},
        | {count:1}
        | ]""".stripMargin
    assertEqualsUsingJSONIgnoreOrder(expected, stringify(groupped))
  }

  @Test
  def sqlVsFilterTest(): Unit = {
    val sql = sqlContextHolder.sql("select id, name from test where age >= 50").toJSON.collect
    val filtered = df.where(df("age") >= 50).select("id", "name").toJSON.collect
    assertEqualsUsingJSONIgnoreOrder(stringify(sql), stringify(filtered))
  }
}
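Note: schemaTest above uses runtime reflection to enumerate the constructor fields of a case class and compare them with the DataFrame schema. A small self-contained sketch of that idiom, with no Spark or Riak required; the object name CaseClassFields is illustrative.

import scala.reflect.runtime.universe

object CaseClassFields {
  case class TestData(id: String, name: String, age: Int, category: String)

  def main(args: Array[String]): Unit = {
    // Non-method term members of a case class type are its backing fields;
    // their symbol names carry a trailing space, hence the trim.
    val fields = universe.typeOf[TestData].members
      .withFilter(!_.isMethod)
      .map(_.name.toString.trim)
      .toList

    println(fields.sorted) // List(age, category, id, name)
  }
}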
Example 35
Source File: LocalLDAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.clustering

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils}
import org.apache.spark.ml.clustering.{LocalLDAModel => SparkLocalLDA}
import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.sql.SparkSession
import DataUtils._

import scala.reflect.runtime.universe

class LocalLDAModel(override val sparkTransformer: SparkLocalLDA)
  extends LocalTransformer[SparkLocalLDA] {

  lazy val oldModel: OldSparkLocalLDA = {
    val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader)
    val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm
    mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldSparkLocalLDA]
  }

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getFeaturesCol) match {
      case Some(column) =>
        val newData = column.data.mapToMlLibVectors.map(oldModel.topicDistribution(_).toList)
        localData.withColumn(
          LocalDataColumn(
            sparkTransformer.getTopicDistributionCol,
            newData
          )
        )
      case None => localData
    }
  }
}

object LocalLDAModel
  extends SimpleModelLoader[SparkLocalLDA]
  with TypedTransformerConverter[SparkLocalLDA] {

  override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = {
    val topics = DataUtils.constructMatrix(
      data.column("topicsMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    )
    val gammaShape = data.column("gammaShape").get.data.head.asInstanceOf[java.lang.Double]
    val topicConcentration =
      data.column("topicConcentration").get.data.head.asInstanceOf[java.lang.Double]
    val docConcentration = DataUtils.constructVector(
      data.column("docConcentration").get.data.head.asInstanceOf[Map[String, Any]]
    )
    val vocabSize = data.column("vocabSize").get.data.head.asInstanceOf[java.lang.Integer]

    val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor(
      classOf[Matrix],
      classOf[Vector],
      classOf[Double],
      classOf[Double]
    )
    val oldLDA = oldLdaCtor.newInstance(
      Matrices.fromML(topics),
      Vectors.fromML(docConcentration),
      topicConcentration,
      gammaShape
    )

    val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor(
      classOf[String],
      classOf[Int],
      classOf[OldSparkLocalLDA],
      classOf[SparkSession]
    )
    val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null)

    ParamUtils.set(lda, lda.optimizer, metadata)
    ParamUtils.set(lda, lda.keepLastCheckpoint, metadata)
    ParamUtils.set(lda, lda.seed, metadata)
    ParamUtils.set(lda, lda.featuresCol, metadata)
    ParamUtils.set(lda, lda.learningDecay, metadata)
    ParamUtils.set(lda, lda.checkpointInterval, metadata)
    ParamUtils.set(lda, lda.learningOffset, metadata)
    ParamUtils.set(lda, lda.maxIter, metadata)
    ParamUtils.set(lda, lda.k, metadata)
    lda
  }

  override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] =
    new LocalLDAModel(sparkTransformer)
}
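Note: the oldModel accessor above uses a runtime mirror to read a field that Spark does not expose publicly. A small self-contained sketch of the same field-reflection idiom on a plain class; Holder, secret, and FieldReflectionDemo are illustrative names, not part of any library.

import scala.reflect.runtime.universe

object FieldReflectionDemo {
  // A stand-in for a class whose internals we want to read reflectively
  class Holder { private val secret: Int = 42 }

  def main(args: Array[String]): Unit = {
    val instance = new Holder

    // A mirror rooted at the classloader that defined the instance
    val mirror = universe.runtimeMirror(instance.getClass.getClassLoader)

    // Locate the member symbol on the type, then reflect it on the instance
    val term = universe.typeOf[Holder].decl(universe.TermName("secret")).asTerm
    val value = mirror.reflect(instance).reflectField(term).get.asInstanceOf[Int]

    println(value) // 42
  }
}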
Example 36
Source File: HBaseCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security

import scala.reflect.runtime.universe
import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.Credentials
import org.apache.hadoop.security.token.{Token, TokenIdentifier}

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging

private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging {

  override def serviceName: String = "hbase"

  override def obtainCredentials(
      hadoopConf: Configuration,
      sparkConf: SparkConf,
      creds: Credentials): Option[Long] = {
    try {
      val mirror = universe.runtimeMirror(getClass.getClassLoader)
      val obtainToken = mirror.classLoader.
        loadClass("org.apache.hadoop.hbase.security.token.TokenUtil").
        getMethod("obtainToken", classOf[Configuration])

      logDebug("Attempting to fetch HBase security token.")
      val token = obtainToken.invoke(null, hbaseConf(hadoopConf))
        .asInstanceOf[Token[_ <: TokenIdentifier]]
      logInfo(s"Get token from HBase: ${token.toString}")
      creds.addToken(token.getService, token)
    } catch {
      case NonFatal(e) =>
        logDebug(s"Failed to get token from service $serviceName", e)
    }

    None
  }

  override def credentialsRequired(hadoopConf: Configuration): Boolean = {
    hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos"
  }

  private def hbaseConf(conf: Configuration): Configuration = {
    try {
      val mirror = universe.runtimeMirror(getClass.getClassLoader)
      val confCreate = mirror.classLoader.
        loadClass("org.apache.hadoop.hbase.HBaseConfiguration").
        getMethod("create", classOf[Configuration])
      confCreate.invoke(null, conf).asInstanceOf[Configuration]
    } catch {
      case NonFatal(e) =>
        logDebug("Fail to invoke HBaseConfiguration", e)
        conf
    }
  }
}
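Note: HBaseCredentialProvider uses the runtime mirror only to reach a classloader, then falls back to plain Java reflection so Spark can build without a compile-time HBase dependency. Below is a minimal sketch of the same reflective static-method call, run here against a JDK class so it works anywhere; ReflectiveStaticCall is an illustrative name, and the real provider targets the HBase classes shown above.

import scala.reflect.runtime.universe

object ReflectiveStaticCall {
  def main(args: Array[String]): Unit = {
    // The mirror supplies a classloader without hard-coding one
    val mirror = universe.runtimeMirror(getClass.getClassLoader)

    // Load a class by name and look up a static method, as the provider does for TokenUtil
    val getProperty = mirror.classLoader
      .loadClass("java.lang.System")
      .getMethod("getProperty", classOf[String])

    // Static methods are invoked with a null receiver
    val javaVersion = getProperty.invoke(null, "java.version").asInstanceOf[String]
    println(javaVersion)
  }
}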
Example 37
Source File: PumpedClass.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.utils

import org.apache.spark.ml.Transformer

import scala.reflect.runtime.universe

class PumpedClass(classz: Class[_]) {
  def companion: Any = {
    val companionClassName = classz.getName + "$"
    val companionClass = Class.forName(companionClassName)
    val moduleField = companionClass.getField("MODULE$")
    moduleField.get(null)
  }
}

object PumpedClass {
  def companionFromClassName(className: String): Any = {
    val runtimeMirror = universe.runtimeMirror(this.getClass.getClassLoader)
    val module = runtimeMirror.staticModule(className + "$")
    val obj = runtimeMirror.reflectModule(module)
    obj.instance
  }
}
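Note: companionFromClassName resolves a Scala object purely by name at runtime. A self-contained sketch of the same staticModule/reflectModule lookup against a standard-library object, so it can be run without the serving library on the classpath; ModuleLookupDemo is an illustrative name.

import scala.reflect.runtime.universe

object ModuleLookupDemo {
  def main(args: Array[String]): Unit = {
    val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)

    // staticModule resolves an object by its fully qualified Scala name
    val module = runtimeMirror.staticModule("scala.collection.immutable.Nil")

    // reflectModule exposes the singleton instance behind the object
    val instance = runtimeMirror.reflectModule(module).instance

    println(instance == Nil) // true
  }
}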
Example 38
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.desc
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CdrDataframeApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()
        cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 39
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.desc
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.native.Serialization.write
import org.json4s.DefaultFormats

object DataframeCreationApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: DataframeCreationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        //val cdrs = sqlC.createDataFrame(seqToCdr(rdd))
        //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect())
        //val cdrs = seqToCdr(rdd).toDF()
        val cdrsJson = seqToCdr(rdd).map(r => {
          implicit val formats = DefaultFormats
          write(r)
        })
        val cdrs = sqlC.read.json(cdrsJson)

        cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 40
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.JDouble
import org.json4s.JObject
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.compact
import org.json4s.native.JsonMethods.parse
import org.json4s.native.JsonMethods.render
import org.json4s.string2JsonInput

object CdrDataframeExamples3App {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString
    val gridGeo = (parse(gridFile) \ "features")
    val gridStr = gridGeo.children.map(r => {
      val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]]
        .flatten.flatten.map(r => JDouble(r))
      val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)),
        ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7)))
      compact(render(JObject(l)))
    })

    val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr))

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()
        cdrs.join(gridDF, $"squareId" === $"id").show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 41
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import java.nio.file.Paths
import org.apache.spark.SparkFiles

object CdrStreamingSparkRApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)
    import hiveC.implicits._

    ssc.sparkContext.addFile(rScriptPath)
    val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString)
    val master = hiveC.sparkContext.getConf.get("spark.master")

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD((rdd, time) => {
        val iTableName = tableName + time.milliseconds
        seqToCdr(rdd).toDF().write.saveAsTable(iTableName)
        hiveC.sparkContext.parallelize(Array(iTableName))
          .pipe("%s %s".format(rScriptName, master))
          .saveAsTextFile(Paths.get(logsPath, iTableName).toString)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 42
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr
import org.json4s.DefaultFormats

object CdrDataframeExamplesActionsApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)
    import hiveC.implicits._
    implicit val formats = DefaultFormats

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()
        val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count"))
        counts.show(5)
        counts.show()
        println("head(5): " + counts.head(5))
        println("take(5): " + counts.take(5))
        println("head(): " + counts.head())
        println("first(): " + counts.first())
        println("count(): " + counts.count())
        println("collect(): " + counts.collect())
        println("collectAsList(): " + counts.collectAsList())
        println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity",
          "callInActivity", "callOutActivity", "internetTrafficActivity").show())
        counts.write.format("parquet").save("/tmp/parquent" + rdd.id)
        counts.write.format("json").save("/tmp/json" + rdd.id)
        counts.write.parquet("/tmp/parquent2" + rdd.id)
        counts.write.json("/tmp/json2" + rdd.id)
        counts.write.saveAsTable("count_table")
        cdrs.groupBy("countryCode").count().orderBy(desc("count"))
          .write.mode(SaveMode.Append).save("/tmp/counts")
        val prop: java.util.Properties = new java.util.Properties()
        counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 43
Source File: L8-10-11UDF.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.io.Source
import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.jackson.JsonMethods.parse
import org.json4s.jvalue2extractable
import org.json4s.string2JsonInput

object CdrUDFApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrUDFApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    def getCountryCodeMapping() = {
      implicit val formats = org.json4s.DefaultFormats
      parse(Source.fromURL("http://country.io/phone.json").mkString)
        .extract[Map[String, String]].map(_.swap)
    }

    def getCountryNameMapping() = {
      implicit val formats = org.json4s.DefaultFormats
      parse(Source.fromURL("http://country.io/names.json").mkString)
        .extract[Map[String, String]]
    }

    def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = {
      mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound")
    }

    val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int)

    sqlC.udf.register("getCountryNamePartial", getCountryNamePartial)

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()
        cdrs.registerTempTable("cdrs")

        sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 44
Source File: L8-14-27DataFrameExamples.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CdrDataframeExamplesApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamplesApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        cdrs.select("squareId", "timeInterval", "countryCode").show()
        cdrs.select($"squareId", $"timeInterval", $"countryCode").show()
        cdrs.filter("squareId = 5").show()
        cdrs.drop("countryCode").show()
        cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show()
        cdrs.limit(5).show()
        cdrs.groupBy("squareId").count().show()
        cdrs.groupBy("countryCode").avg("internetTrafficActivity").show()
        cdrs.groupBy("countryCode").max("callOutActivity").show()
        cdrs.groupBy("countryCode").min("callOutActivity").show()
        cdrs.groupBy("squareId").sum("internetTrafficActivity").show()
        cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"),
          sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show()
        cdrs.groupBy("countryCode").sum("internetTrafficActivity")
          .orderBy(desc("SUM(internetTrafficActivity)")).show()
        cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"),
          sum("smsInActivity"), sum("internetTrafficActivity")).show()
        cdrs.rollup("squareId", "countryCode").count()
          .orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode())
        cdrs.cube("squareId", "countryCode").count()
          .orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode())
        cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show()
        cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show()
        cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show()
        cdrs.sample(true, 0.01).show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 45
Source File: L8-28DataFrameExamplesOps.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CdrDataframeExamples2App {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamples2App <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    var previousCdrs: Option[DataFrame] = None

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates()
        previousCdrs match {
          case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show()
          //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show()
          //case Some(prevCdrs) => cdrs.except(prevCdrs).show()
          case None => Unit
        }
        previousCdrs = Some(cdrs)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 46
Source File: T8-3DataFrameExamplesNA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.JDouble
import org.json4s.JObject
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.compact
import org.json4s.native.JsonMethods.parse
import org.json4s.native.JsonMethods.render
import org.json4s.string2JsonInput

object CdrDataframeExamplesNAApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamplesNAApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        cdrs.na.drop("any").show()
        cdrs.na.fill(0, Array("squareId")).show()
        cdrs.na.replace("squareId", Map(0 -> 1)).show()
        println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity"))
        println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity"))
        cdrs.stat.crosstab("squareId", "countryCode").show()
        cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show()
        cdrs.stat.crosstab("callOutActivity", "callInActivity").show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}