org.apache.spark.SparkConf Scala Examples
The following examples show how to use org.apache.spark.SparkConf.
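All of the examples below follow the same basic pattern: build a SparkConf, set the application name (and optionally the master URL and other properties), and hand it to a SparkContext, StreamingContext, or SparkSession. As a quick orientation, here is a minimal sketch of that pattern; the application name, master URL, and the spark.ui.enabled setting are placeholders rather than values taken from any of the examples.

import org.apache.spark.{SparkConf, SparkContext}

object SparkConfDemo {
  def main(args: Array[String]): Unit = {
    // Configure first, then build the context from the finished SparkConf.
    val conf = new SparkConf()
      .setAppName("sparkconf-demo")      // placeholder application name
      .setMaster("local[*]")             // placeholder master; usually supplied by spark-submit
      .set("spark.ui.enabled", "false")  // any Spark property can be set as a key/value pair

    val sc = new SparkContext(conf)
    try {
      println(sc.parallelize(1 to 10).sum())
    } finally {
      sc.stop()
    }
  }
}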
Example 1
Source File: DeltaQA.scala From spark-tools with Apache License 2.0 | 12 votes |
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff = left - right
    val error = diff * diff
    DeltaCommon(
      count = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff = diff,
      error = error,
      left = left,
      right = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {
    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id
            .copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id
            .copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })
    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] = sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))
  }
}
Example 2
Source File: Test1.scala From BigData-News with Apache License 2.0 | 12 votes |
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)

    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)

    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)

    // join the two pair RDDs by key and print "student id / name / score" for each match
    rdd1.join(rdd2).foreach(x => println("学号: " + x._1 + "名字:" + x._2._1 + " 分数:" + x._2._2))
  }
}
Example 3
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 4
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
          s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
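Only the lazily initialized SparkSession singleton is included in this excerpt. In the full SqlNetworkWordCount example the singleton is fetched inside foreachRDD so each micro-batch can be turned into a DataFrame and queried with SQL. The following is a minimal sketch of that wiring; the host, port, batch interval, and column name are placeholders, not values taken from the excerpt.

// Sketch: using SparkSessionSingleton from a streaming job (assumed setup).
val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(2))
val words = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
  .flatMap(_.split(" "))

words.foreachRDD { (rdd: RDD[String], time: Time) =>
  // Reuse one SparkSession per executor/driver process across batches.
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  import spark.implicits._
  val wordsDataFrame = rdd.toDF("word")
  wordsDataFrame.createOrReplaceTempView("words")
  val wordCounts = spark.sql("select word, count(*) as total from words group by word")
  println(s"========= $time =========")
  wordCounts.show()
}

ssc.start()
ssc.awaitTermination()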
Example 6
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security

import java.io.{ByteArrayInputStream, DataInputStream}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.security.Credentials

import org.apache.spark.{SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
  // Token renewal interval, this value will be set in the first call,
  // if None means no token renewer specified, so cannot get token renewal interval.
  private var tokenRenewalInterval: Option[Long] = null

  override val serviceName: String = "hdfs"

  override def obtainCredentials(
      hadoopConf: Configuration,
      sparkConf: SparkConf,
      creds: Credentials): Option[Long] = {
    // NameNode to access, used to get tokens from different FileSystems
    nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
      val dstFs = dst.getFileSystem(hadoopConf)
      logInfo("getting token for namenode: " + dst)
      dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
    }

    // Get the token renewal interval if it is not set. It will only be called once.
    if (tokenRenewalInterval == null) {
      tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
    }

    // Get the time of next renewal.
    tokenRenewalInterval.map { interval =>
      creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .map { t =>
          val identifier = new DelegationTokenIdentifier()
          identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
          identifier.getIssueDate + interval
        }.foldLeft(0L)(math.max)
    }
  }

  private def getTokenRenewalInterval(
      hadoopConf: Configuration,
      sparkConf: SparkConf): Option[Long] = {
    // We cannot use the tokens generated with renewer yarn. Trying to renew
    // those will fail with an access control issue. So create new tokens with the logged in
    // user as renewer.
    sparkConf.get(PRINCIPAL).map { renewer =>
      val creds = new Credentials()
      nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
        val dstFs = dst.getFileSystem(hadoopConf)
        dstFs.addDelegationTokens(renewer, creds)
      }
      val t = creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .head
      val newExpiration = t.renew(hadoopConf)
      val identifier = new DelegationTokenIdentifier()
      identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
      val interval = newExpiration - identifier.getIssueDate
      logInfo(s"Renewal Interval is $interval")
      interval
    }
  }

  private def getTokenRenewer(conf: Configuration): String = {
    val delegTokenRenewer = Master.getMasterPrincipal(conf)
    logDebug("delegation token renewer is: " + delegTokenRenewer)
    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
      logError(errorMessage)
      throw new SparkException(errorMessage)
    }

    delegTokenRenewer
  }

  private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
    sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
      sparkConf.get(STAGING_DIR).map(new Path(_))
        .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
  }
}
Example 7
Source File: RateController.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.streaming.scheduler

import java.io.ObjectInputStream
import java.util.concurrent.atomic.AtomicLong

import scala.concurrent.{ExecutionContext, Future}

import org.apache.spark.SparkConf
import org.apache.spark.streaming.scheduler.rate.RateEstimator
import org.apache.spark.util.{ThreadUtils, Utils}

  private def computeAndPublish(time: Long, elems: Long, workDelay: Long, waitDelay: Long): Unit =
    Future[Unit] {
      val newRate = rateEstimator.compute(time, elems, workDelay, waitDelay)
      newRate.foreach { s =>
        rateLimit.set(s.toLong)
        publish(getLatestRate())
      }
    }

  def getLatestRate(): Long = rateLimit.get()

  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) {
    val elements = batchCompleted.batchInfo.streamIdToInputInfo

    for {
      processingEnd <- batchCompleted.batchInfo.processingEndTime
      workDelay <- batchCompleted.batchInfo.processingDelay
      waitDelay <- batchCompleted.batchInfo.schedulingDelay
      elems <- elements.get(streamUID).map(_.numRecords)
    } computeAndPublish(processingEnd, elems, workDelay, waitDelay)
  }
}

object RateController {
  def isBackPressureEnabled(conf: SparkConf): Boolean =
    conf.getBoolean("spark.streaming.backpressure.enabled", false)
}
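The methods above are members of an abstract RateController streaming listener whose class declaration was dropped when this excerpt was extracted. As a rough sketch of the enclosing class in the upstream Spark source (details may differ between Spark versions), the skeleton looks approximately like this:

private[streaming] abstract class RateController(val streamUID: Int, rateEstimator: RateEstimator)
  extends StreamingListener with Serializable {

  // Subclasses decide how a newly computed rate gets pushed to the rate limiter.
  protected def publish(rate: Long): Unit

  @transient implicit private var executionContext: ExecutionContext = _
  @transient private var rateLimit: AtomicLong = _

  private def init(): Unit = {
    executionContext = ExecutionContext.fromExecutorService(
      ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update"))
    rateLimit = new AtomicLong(-1L)
  }

  init()

  // Re-initialize the transient fields after deserialization.
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    init()
  }

  // ... computeAndPublish, getLatestRate and onBatchCompleted from the excerpt above ...
}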
Example 8
Source File: TFIDF.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line). Each line is treated as one document;
    // zipWithIndex uses the line number as the document id.
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // feature number
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // line number as doc id; the tokens of each line are turned into a tf vector
    val idAndTFVector = documents.map {
      case (seq, num) =>
        val tf = hashingTF.transform(seq)
        (num + 1, tf)
    }
    idAndTFVector.cache()

    // build idf model
    val idf = new IDF().fit(idAndTFVector.values)

    // transform tf vector to tf-idf vector
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // broadcast tf-idf vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // cal doc cosineSimilarity
    val docSims = idAndTFIDFVector.flatMap {
      case (id1, idf1) =>
        // filter the same doc id
        val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
        val sv1 = idf1.asInstanceOf[SV]
        import breeze.linalg._
        val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
        idfs.map {
          case (id2, idf2) =>
            val sv2 = idf2.asInstanceOf[SV]
            val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
            val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
            (id1, id2, cosSim)
        }
    }
    docSims.foreach(println)

    sc.stop()
  }
}
Example 9
Source File: SqlUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 6 votes |
package com.cloudera.sa.spark.unittest.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable

class SqlUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var hiveContext: HiveContext = null

  override def beforeAll(): Unit = {
    val envMap = Map[String, String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    hiveContext = new HiveContext(sc)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Test table creation and summing of counts") {
    val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
      Row("tj", 11, "green"),
      Row("andrew", 9, "green")))

    hiveContext.sql("create table person (name string, age int, color string)")

    val emptyDataFrame = hiveContext.sql("select * from person limit 0")

    val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
    personDataFrame.registerTempTable("tempPerson")

    val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")

    val localAgeSum = ageSumDataFrame.take(10)

    assert(localAgeSum(0).get(0) == 62,
      "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
  }
}
Example 10
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")

  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 11
Source File: SparkPFASuiteBase.scala From aardpfark with Apache License 2.0 | 6 votes |
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }
}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] => a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector => a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }
}
Example 12
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 13
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import scala.Iterator

object TestJoins {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

    val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
    val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache

    inspectRDD(x)
    inspectRDD(y)

    println(">>> joining x with y")
    val joinRDD = x.join(y).cache
    joinRDD.collect().foreach(println)
    inspectRDD(joinRDD)

    println(">>> left outer join of x with y")
    val leftJoin = x.leftOuterJoin(y).cache
    leftJoin.collect().foreach(println)
    inspectRDD(leftJoin)

    println(">>> right outer join of x with y")
    val rightJoin = x.rightOuterJoin(y).cache
    rightJoin.collect().foreach(println)
    inspectRDD(rightJoin)
  }

  def inspectRDD[T](rdd: RDD[T]): Unit = {
    println(">>> Partition length...")
    rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)

    println(">>> Partition data...")
    rdd.foreachPartition(f => f.foreach(println))
  }
}
Example 14
Source File: TestAdditionInWindow.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.SparkConf

object TestAdditionInWindow {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TestAdditionJob"), Seconds(1))

    val msg = ssc.socketTextStream("localhost", 9999)

    msg
      .map(data => ("sum", data.toInt))
      .reduceByKey(_ + _)
      .window(Seconds(3), Seconds(2))
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 15
Source File: TestUpdateStateByKey.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.{StreamingContext, Duration}
import org.apache.spark.SparkConf

object TestUpdateStateByKey {
  val checkpointDir: String = "hdfs://localhost:9000/user/hduser/spark-chkpt"

  def main(args: Array[String]): Unit = {
    val ssc = StreamingContext.getOrCreate(checkpointDir, createFunc _)

    ssc.start()
    ssc.awaitTermination()
  }

  def updateFunc(values: Seq[Int], state: Option[Int]): Option[Int] = {
    Some(values.size + state.getOrElse(0))
  }

  def createFunc(): StreamingContext = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TestUpdateStateByKeyJob"), Duration(2000))

    ssc.checkpoint(checkpointDir)

    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(updateFunc _)
      .checkpoint(Duration(10000))
      .print()

    ssc
  }
}
Example 16
Source File: TestStreamingListener.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchStarted, StreamingListenerBatchCompleted }
import org.apache.spark.SparkConf

object TestStreamingListener {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TestStreamingListenerJob"), Seconds(5))

    ssc.addStreamingListener(new MyStreamingListener())

    ssc
      .socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .count()
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}

class MyStreamingListener extends StreamingListener {

  override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
    println(">>> Batch started...records in batch = " + batchStarted.batchInfo.numRecords)
  }

  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    println(">>> Batch completed...time taken (ms) = " + batchCompleted.batchInfo.totalDelay)
  }
}
Example 17
Source File: TestMapWithState.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, State, StateSpec }

  // Only the mapWithState mapping function is included in this excerpt;
  // the enclosing object and its StreamingContext setup are omitted.
  def mappingFunc(key: String, value: Option[Int], state: State[Int]): Option[(String, Int)] = {
    val sum = value.getOrElse(0) + state.getOption().getOrElse(0)

    // updating the state of non-idle keys...
    // To call State.update(...) we need to check State.isTimingOut() == false,
    // else there will be NoSuchElementException("Cannot update the state that is timing out")
    if (state.isTimingOut())
      println(key + " key is timing out...will be removed.")
    else
      state.update(sum)

    Some((key, sum))
  }
}
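The excerpt does not show how mappingFunc is hooked into a stream. A minimal sketch of the usual wiring follows; the socket source, batch interval, checkpoint directory, and timeout value are assumptions, not values taken from the original file.

// Sketch: attaching the mapping function to a pair DStream with mapWithState (assumed setup).
val ssc = new StreamingContext(new SparkConf().setAppName("TestMapWithStateJob"), Seconds(5))
ssc.checkpoint("/tmp/spark-chkpt") // mapWithState requires checkpointing to be enabled

val stateSpec = StateSpec.function(mappingFunc _).timeout(Seconds(30))

ssc.socketTextStream("localhost", 9999)
  .flatMap(_.split(" "))
  .map((_, 1))
  .mapWithState(stateSpec)
  .print()

ssc.start()
ssc.awaitTermination()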
Example 18
Source File: RedisStandaloneEnv.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.env

import com.redislabs.provider.redis.{RedisConfig, RedisEndpoint}
import org.apache.spark.SparkConf

trait RedisStandaloneEnv extends Env {

  override val conf: SparkConf = new SparkConf()
    .setMaster("local[*]").setAppName(getClass.getName)
    .set("spark.redis.host", redisHost)
    .set("spark.redis.port", s"$redisPort")
    .set("spark.redis.auth", redisAuth)
    .set("spark.streaming.stopGracefullyOnShutdown", "true")
    .set("spark.driver.bindAddress", "127.0.0.1")

  override val redisConfig: RedisConfig =
    new RedisConfig(RedisEndpoint(redisHost, redisPort, redisAuth))
}
Example 19
Source File: Env.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.env

import com.redislabs.provider.redis.RedisConfig
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.{SparkConf, SparkContext}

trait Env {

  val conf: SparkConf

  var spark: SparkSession = _
  var sc: SparkContext = _
  var ssc: StreamingContext = _

  val redisHost = "127.0.0.1"
  val redisPort = 6379
  val redisAuth = "passwd"
  val redisConfig: RedisConfig
}
Example 20
Source File: RedisStandaloneSSLEnv.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.env

import com.redislabs.provider.redis.{RedisConfig, RedisEndpoint}
import org.apache.spark.SparkConf

trait RedisStandaloneSSLEnv extends Env {

  override val redisPort = 6380

  override val conf: SparkConf = new SparkConf()
    .setMaster("local[*]").setAppName(getClass.getName)
    .set("spark.redis.host", redisHost)
    .set("spark.redis.port", s"$redisPort")
    .set("spark.redis.auth", redisAuth)
    .set("spark.redis.ssl", "true")
    .set("spark.streaming.stopGracefullyOnShutdown", "true")
    .set("spark.driver.bindAddress", "127.0.0.1")

  override val redisConfig: RedisConfig =
    new RedisConfig(RedisEndpoint(redisHost, redisPort, redisAuth, ssl = true))
}
Example 21
Source File: GraphXUtils.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkConf

import org.apache.spark.graphx.impl._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.util.collection.{OpenHashSet, BitSet}
import org.apache.spark.util.BoundedPriorityQueue

object GraphXUtils {

  def registerKryoClasses(conf: SparkConf) {
    conf.registerKryoClasses(Array(
      classOf[Edge[Object]],
      classOf[(VertexId, Object)],
      classOf[EdgePartition[Object, Object]],
      classOf[BitSet],
      classOf[VertexIdToIndexMap],
      classOf[VertexAttributeBlock[Object]],
      classOf[PartitionStrategy],
      classOf[BoundedPriorityQueue[Object]],
      classOf[EdgeDirection],
      classOf[GraphXPrimitiveKeyOpenHashMap[VertexId, Int]],
      classOf[OpenHashSet[Int]],
      classOf[OpenHashSet[Long]]))
  }
}
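A typical call site simply passes the job's SparkConf to this helper before the SparkContext is created, with Kryo enabled as the serializer. A short sketch (the app name and master are placeholders, and the SparkContext import is assumed):

// Sketch: registering GraphX's internal classes with Kryo on a SparkConf.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphXUtils

val conf = new SparkConf()
  .setAppName("graphx-app")
  .setMaster("local[*]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
GraphXUtils.registerKryoClasses(conf)

val sc = new SparkContext(conf)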
Example 22
Source File: CustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  // Only the receive() helper of the custom Receiver is included in this excerpt;
  // the enclosing Receiver subclass (with host, port, onStart and onStop) is omitted.
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      println("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      println("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      println("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
Example 23
Source File: TFLCustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TFLCustomReceiver {
  private val url = "https://api.tfl.gov.uk/Line/circle/Arrivals?stopPointId=940GZZLUERC&app_id=a73727f3&app_key=dc8150560a2422afae2b70cf291c4327"

  def main(args: Array[String]) {
    // Create the context with a 300 second batch interval
    val sparkConf = new SparkConf().setAppName("TFLCustomReceiver")
    val ssc = new StreamingContext(sparkConf, Seconds(300))

    val lines = ssc.receiverStream(new TFLCustomReceiver(url))
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

class TFLCustomReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Http Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  private def receive() {
    var userInput: String = null
    var httpClient: DefaultHttpClient = null
    var getRequest: HttpGet = null

    try {
      // Connect to host:port
      httpClient = new DefaultHttpClient();
      getRequest = new HttpGet(url);
      getRequest.addHeader("accept", "application/json");

      while (!isStopped) {
        val response = httpClient.execute(getRequest);
        if (response.getStatusLine().getStatusCode() != 200) {
          throw new RuntimeException("Failed : HTTP error code : " + response.getStatusLine().getStatusCode());
        }
        val reader = new BufferedReader(new InputStreamReader((response.getEntity().getContent())));
        userInput = reader.readLine()
        while (userInput != null) {
          store(userInput)
          // println(userInput)
          userInput = reader.readLine()
        }
        reader.close()
        Thread.sleep(60 * 1000)
      }
      httpClient.close()
      // Restart in an attempt to connect again when server is active again
      // restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        // restart if could not connect to server
        restart("Error connecting to " + url, e)
      case t: Throwable =>
        // restart if there is any other error
        restart("Error receiving data", t)
    }
  }
}
Example 24
Source File: TFLStreamingApp.scala From Learning-Spark-SQL with MIT License | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TFLStreamingApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("TFLStreaming")
    val ssc = new StreamingContext(conf, Seconds(300))
    val stream = ssc.receiverStream(new TFLArrivalPredictionsByLine())

    println("Before")
    stream.print()
    println("After")

    if (args.length > 2) {
      stream.saveAsTextFiles(args(2))
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 25
Source File: ModelSerialization.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.features.FeatureEngineering
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object ModelSerialization {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)

    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    samples.printSchema()
    samples.show(5, false)

    //model training
    println("Neural Network Ctr Prediction Model:")
    val innModel = new InnerProductNNCtrModel()
    innModel.train(samples)
    val transformedData = innModel.transform(samples)

    transformedData.show(1, false)

    //model serialization by mleap
    val mleapModelSerializer = new com.ggstar.serving.mleap.serialization.ModelSerializer()
    mleapModelSerializer.serializeModel(innModel._pipelineModel,
      "jar:file:/Users/zhwang/Workspace/CTRmodel/model/inn.model.mleap.zip", transformedData)

    //model serialization by JPMML
    val jpmmlModelSerializer = new com.ggstar.serving.jpmml.serialization.ModelSerializer()
    jpmmlModelSerializer.serializeModel(innModel._pipelineModel, "model/inn.model.jpmml.xml", transformedData)
  }
}
Example 26
Source File: ModelSelection.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.example

import com.ggstar.ctrmodel._
import com.ggstar.evaluation.Evaluator
import com.ggstar.features.FeatureEngineering
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}

object ModelSelection {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ctrModel")
      .set("spark.submit.deployMode", "client")

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val resourcesPath = this.getClass.getResource("/samples.snappy.orc")
    val rawSamples = spark.read.format("orc").option("compression", "snappy").load(resourcesPath.getPath)
    rawSamples.printSchema()
    rawSamples.show(10)

    //transform array to vector for following vectorAssembler
    val samples = FeatureEngineering.transferArray2Vector(rawSamples)

    //split samples into training samples and validation samples
    val Array(trainingSamples, validationSamples) = samples.randomSplit(Array(0.7, 0.3))

    val evaluator = new Evaluator
  }
}
Example 27
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }

    // set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // read the settings from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
    (implicit recOpts: RecommendLogOptions): Unit = {

    // generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))
  }
}
// scalastyle:on println
Example 28
Source File: gihyo_6_2_1_Sample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_2_1_Sample {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val wordCounts = run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val words = stream.flatMap(_.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    wordCounts.print
  }
}
Example 29
Source File: gihyo_6_3_Join.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Join {

  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost1 = args(0)
    val targetHostPort1 = args(1).toInt
    val targetHost2 = args(2)
    val targetHostPort2 = args(3).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1)
    val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2)
    run(lines1, lines2)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], joinStream: InputDStream[String]) {
    val lines1KV = stream.map(x => (x, "attribute1"))
    val lines2KV = joinStream.map(x => (x, Array("attribute2", "attribute3", "attribute4")))
    val linesKVW = lines1KV.join(lines2KV)
    linesKVW.print
  }
}
Example 30
Source File: gihyo_6_3_Reduce.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Reduce {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val windowLineCount = stream.reduce((x, y) => x + "," + y)
    windowLineCount.print
  }
}
Example 31
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByWindow {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.reduceByWindow((x, y) => x + y, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 32
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_KafkaStream {

  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)

    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // obtain the StreamingContext (from the checkpoint directory, or by creating a new one)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(brokerList: String,
      consumeTopic: String,
      checkpointDir: String,
      saveDir: String): () => StreamingContext = { () => {
      // NOTE: the body of this factory was garbled in extraction; the two statements
      // below appear to belong to the file's updateStateByKey state-update function,
      // which is referenced from run() further down.
      System.out.println(values)
      Some(running.getOrElse(0) + values.length)
    }
  }

  def run(stream: InputDStream[(String, String)],
      saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {
    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
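Because the factory body and the state-update function were lost in extraction, the following is a hypothetical reconstruction modeled on the sibling gihyo_6_3_* examples; the function names, Kafka parameters, and the state-update signature are assumptions, not the original file's contents.

// Hypothetical sketch of the lost factory body (not the original code).
def createStreamingContextSketch(brokerList: String,
    consumeTopic: String,
    checkpointDir: String,
    saveDir: String): () => StreamingContext = { () =>
  val conf = new SparkConf().setAppName("gihyoSample_Application")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(5))
  ssc.checkpoint(checkpointDir)
  val kafkaParams = Map("metadata.broker.list" -> brokerList)
  val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc, kafkaParams, Set(consumeTopic))
  run(kafkaStream, saveDir)
  ssc
}

// Hypothetical state-update function referenced by run(); the two surviving statements
// suggest it printed the batch values and added their count to the running total.
def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
  System.out.println(values)
  Some(running.getOrElse(0) + values.length)
}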
Example 33
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import org.atilika.kuromoji.Token
import twitter4j.Status

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object gihyo_6_3_TwitterStream {

  def main(args: Array[String]) {
    if (args.length != 7) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args

    System.setProperty("twitter4j.oauth.consumerKey", cKey)
    System.setProperty("twitter4j.oauth.consumerSecret", cSecret)
    System.setProperty("twitter4j.oauth.accessToken", aToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret)
    val f = createStreamingContext(cDir, tagDir, wordDir)
    val ssc = StreamingContext.getOrCreate(cDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(checkpointDir: String,
      tagDir: String,
      wordDir: String): () => StreamingContext = { () => {
      val conf = new SparkConf().setAppName("gihyoSample_Application")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      conf.registerKryoClasses(Array(classOf[UserDic]))

      val sc = new SparkContext(conf)
      val ssc = new StreamingContext(sc, Seconds(5))
      ssc.checkpoint(checkpointDir)
      val twitterStream = TwitterUtils.createStream(ssc, None)
      run(sc, twitterStream, tagDir, wordDir)
      ssc
    }
  }

  def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) {
    val tokenizer = sc.broadcast(UserDic.getInstance)
    val tweets = stream.map(tweet => tweet.getText())
    tweets.persist()

    val TweetText = tweets
      .flatMap(text => {
        val tokens = tokenizer.value.tokenize(text).toArray
        // keep general nouns ("名詞" + "一般") and custom nouns ("カスタム名詞"),
        // skipping single-character tokens and purely alphanumeric tokens
        tokens.filter(t => {
          val token = t.asInstanceOf[Token]
          ((token.getPartOfSpeech.indexOf("名詞") > -1 &&
            token.getPartOfSpeech.indexOf("一般") > -1) ||
            token.getPartOfSpeech.indexOf("カスタム名詞") > -1) &&
            token.getSurfaceForm.length > 1 &&
            !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$")
        }).map(t => t.asInstanceOf[Token].getSurfaceForm)
      })
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    val TweetTags = tweets
      .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#")))
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    TweetText.saveAsTextFiles(wordDir)
    TweetTags.saveAsTextFiles(tagDir)
  }
}
// scalastyle:on println
Example 34
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object gihyo_6_3_Union {

  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHosts = args(0)
    val consumerGroup = args(1)
    val targetTopics = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    val KafkaStreams = (1 to 5).map { i =>
      KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1))
    }
    run(ssc, KafkaStreams)

    ssc.start
    ssc.awaitTermination
  }

  def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) {
    val unionedStream = ssc.union(streams)
    unionedStream.print
  }
}
Example 35
Source File: gihyo_6_3_flatMap.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_flatMap {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val words = stream.flatMap(line => line.split(" "))
    words.print
  }
}
Example 36
Source File: gihyo_6_3_Repartition.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Repartition {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val repartitionData = stream.repartition(3)
    // scalastyle:off println
    repartitionData.foreachRDD(rdd => println(s"partition size: ${rdd.partitions.size.toString}"))
    // scalastyle:on println
    repartitionData.print
  }
}
Example 37
Source File: gihyo_6_3_Count.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Count {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val lineCount = stream.window(Seconds(windowLength), Seconds(slideInterval)).count
    lineCount.print
  }
}
Example 38
Source File: gihyo_6_3_Map.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Map {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val lineCount = stream.map(line => (line, 1))
    lineCount.print
  }
}
Example 39
Source File: gihyo_6_3_Cogroup.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Cogroup {

  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost1 = args(0)
    val targetHostPort1 = args(1).toInt
    val targetHost2 = args(2)
    val targetHostPort2 = args(3).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1)
    val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2)
    run(lines1, lines2)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], otherStream: InputDStream[String]) {
    val lines1KV = stream.map(x => (x, "attribute1"))
    val lines2KV = otherStream.map(x => (x, "attribute2"))
    val linesKVW = lines1KV.cogroup(lines2KV)
    linesKVW.print
  }
}
Example 40
Source File: gihyo_6_3_reduceByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKey {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val countKeyValue = stream.map(x => (x, 1)).reduceByKey((x, y) => x + y)
    countKeyValue.print
  }
}
Example 41
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow_efficient {

  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow(
        (a: Int, b: Int) => a + b,
        (a: Int, b: Int) => a - b,
        Seconds(windowLength),
        Seconds(slideInterval))
    userList.print
  }
}
Example 42
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = stream.map(x => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
    })
    userList.print
  }
}
Example 43
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 44
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByValueAndWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val f = createStreamingContext(targetHost, targetHostPort, checkpointDir) val ssc = StreamingContext.getOrCreate(checkpointDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext( targetHost: String, targetHostPort: Int, checkpointDir: String): () => StreamingContext = { () => { val conf = new SparkConf().setAppName("gihyoSample_Application") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) ssc.checkpoint(checkpointDir) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc } } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } } // scalastyle:on println
Example 45
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_updateStateByKey { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _) userList.print } def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = { Some(running.getOrElse(0) + values.size) } }
Example 46
Source File: gihyo_6_3_Filter.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Filter { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val overData = stream.filter(line => line.length > 5) overData.print } }
Example 47
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 48
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Window { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue() userList.print } }
Example 49
Source File: gihyo_6_3_countByValue.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValue {
  def main(args: Array[String]) {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start()
    ssc.awaitTermination()
  }

  def run(stream: InputDStream[String]) {
    // Count how many times each distinct line appears in every batch
    val countValue = stream.countByValue()
    countValue.print()
  }
}
Example 50
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.reduce((x, y) => x + y)}""") } } // scalastyle:on println
Example 51
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object StatsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("StatsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array.range(1, 11)) val stats = nums.stats() println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""count: ${stats.count}""") println(s"""mean: ${stats.mean}""") println(s"""stdev: ${stats.stdev}""") } } // scalastyle:on println
Example 52
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3) nums.reduce((x, y) => x + y) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""") } } // scalastyle:on println
Example 53
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object OrderExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("OrderExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1)) println(s"""nums: ${nums.collect().mkString(", ")}""") println(s"""top3: ${nums.top(3).mkString(", ")}""") println(s"""takeOredered3: ${nums.takeOrdered(3).mkString(", ")}""") } } // scalastyle:on println
Example 54
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  private[basic_action] def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    // Accumulate (sum, count) within each partition, then merge the partial results
    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""avg: $avg""")
  }
}
// scalastyle:on println
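For a quick sanity check of the aggregated result, the same average can be obtained from Spark's built-in numeric RDD helpers. This is a minimal sketch assuming the same SparkContext is available; it is not part of the book example.

import org.apache.spark.SparkContext

object AggregateCrossCheck {
  // Prints the mean computed by DoubleRDDFunctions; it should match avg above (5.5)
  def run(sc: SparkContext): Unit = {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    println(s"mean via nums.mean(): ${nums.mean()}")
  }
}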
Example 55
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CollectAsMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CollectAsMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1) ), 3 ) val fruitsAsMap = fruits.collectAsMap() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitsAsMap: $fruitsAsMap""") } } // scalastyle:on println
Example 56
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} object PersistExample { def main(args: Array[String]) { if (args.length != 1) { new IllegalArgumentException("Invalid arguments") System.exit(1) } Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("PersistExample") val sc = new SparkContext(conf) run(sc, args(0)) sc.stop() } def run(sc: SparkContext, inputFile: String) { val lines = sc.textFile(inputFile) lines.count() lines.collect() val persistedLines = sc.textFile(inputFile).persist() persistedLines.collect() persistedLines.count() persistedLines.unpersist() persistedLines.collect() } }
Example 57
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
Example 58
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object PartitionExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("Partition") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1) println(s"""nums:\n ${nums.collect().mkString(", ")}""") println() println("original:") nums.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar3 = nums.repartition(3) println("repartition to 3:") numsPar3.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() val numsPar2 = numsPar3.coalesce(2) println("coalesce to 2:") numsPar2.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } // scalastyle:on println
Example 59
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object WordCountExample { def main(args: Array[String]) { if (args.length != 1) { new IllegalArgumentException("Invalid arguments") System.exit(1) } Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("WordCountExample") val sc = new SparkContext(conf) run(sc, args(0)) sc.stop() } def run(sc: SparkContext, inputFile: String) { val stopWordCount = sc.accumulator(0L) val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on")) val lines = sc.textFile(inputFile) val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty) val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w => val result = !stopWords.value.contains(w._1) if (!result) stopWordCount += 1L result } val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false) println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""") println(s"""stopWordCounts: ${stopWordCount.value}""") } } // scalastyle:on println
Example 60
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object AggregateByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("AggregateByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))( seqOp = (partAcc, n) => partAcc += n, combOp = (acc1, acc2) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
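Both this example and CombineByKeyExample below rely on an Acc helper class that is defined elsewhere in the gihyo repository and is not shown on this page. The following is a minimal sketch of such an accumulator, inferred from its usage here; the original definition may differ.

package jp.gihyo.spark.ch03.pairrdd_transformation

// Running (sum, count) accumulator: += folds in one value, ++= merges two partial accumulators
case class Acc(var sum: Double, var count: Int) {
  def +=(n: Int): Acc = {
    sum += n
    count += 1
    this
  }

  def ++=(other: Acc): Acc = {
    sum += other.sum
    count += other.count
    this
  }
}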
Example 61
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapValuesExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapValuesExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1))) val plusOnes = fruits.mapValues(v => v + 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""") } } // scalastyle:on println
Example 62
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SortByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    // ascending = false sorts the keys in descending order
    val sortedByKeyDesc = fruits.sortByKey(ascending = false)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyDesc: ${sortedByKeyDesc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    // A custom Ordering in implicit scope changes how sortByKey compares keys
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()
    println()
    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 63
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CoGroupExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CoGroupExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val grouped = persons.map(_.swap).cogroup(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""grouped:\n${grouped.collect().mkString("\n")}""") } } // scalastyle:on println
Example 64
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object JoinExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("JoinExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val persons = sc.parallelize(Array( ("Adam", "San francisco"), ("Bob", "San francisco"), ("Taro", "Tokyo"), ("Charles", "New York") )) val cities = sc.parallelize(Array( ("Tokyo", "Japan"), ("San francisco", "America"), ("Beijing", "China") )) val leftJoined = persons.map(_.swap).join(cities) val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities) val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities) val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities) println(s"""persons: ${persons.collect().mkString(", ")}""") println(s"""cities: ${cities.collect().mkString(", ")}""") println() println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""") println() println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""") println() println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""") println() println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""") } } // scalastyle:on println
Example 65
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object GroupByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("GroupByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val grouped = fruits.groupByKey() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""grouped: ${grouped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 66
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ReduceByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ReduceByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.reduceByKey((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 67
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CombineByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CombineByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize( Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1))) val fruitCountAvgs = fruits.combineByKey( createCombiner = (v: Int) => Acc(v.toDouble, 1), mergeValue = (partAcc: Acc, n: Int) => partAcc += n, mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2 ).mapValues(acc => acc.sum / acc.count) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 68
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FoldByKeyExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FoldByKeyExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array( ("Apple", 1), ("Orange", 1), ("Peach", 1), ("Orange", 1), ("PineApple", 1), ("Orange", 1))) val fruitCounts = fruits.foldByKey(0)((x, y) => x + y) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""") } } // scalastyle:on println
Example 69
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapPartitionsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapPartitionsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val jsonLines = sc.parallelize(Array( """{"name": "Apple", "num": 1}""", """{"name": "Orange", "num": 4}""", """{"name": "Apple", "num": 2}""", """{"name": "Peach", "num": 1}""" )) val parsed = jsonLines.mapPartitions { lines => val mapper = new ObjectMapper() mapper.registerModule(DefaultScalaModule) lines.map { line => val f = mapper.readValue(line, classOf[Map[String, String]]) (f("name"), f("num")) } } println(s"""json:\n${jsonLines.collect().mkString("\n")}""") println() println(s"""parsed:\n${parsed.collect().mkString("\n")}""") } } // scalastyle:on println
Example 70
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FlatMapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FlatMapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow")) val words = lines.flatMap(line => line.split(" ")) println(s"""lines: ${lines.collect().mkString(", ")}""") println(s"""words: ${words.collect().mkString(", ")}""") } } // scalastyle:on println
Example 71
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object SetOperationsExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("SetOperationsExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange")) val union = fruits1.union(fruits2) val subtract = fruits1.subtract(fruits2) val intersection = fruits1.intersection(fruits2) val cartesian = fruits1.cartesian(fruits2) println(s"""fruits1: ${fruits1.collect().mkString(", ")}""") println(s"""fruits2: ${fruits2.collect().mkString(", ")}""") println(s"""union: ${union.collect().mkString(", ")}""") println(s"""subtract: ${subtract.collect().mkString(", ")}""") println(s"""intersection: ${intersection.collect().mkString(", ")}""") println(s"""cartesian: ${cartesian.collect().mkString(", ")}""") } } // scalastyle:on println
Example 72
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object MapExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("MapExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val lengths = fruits.map(fruit => fruit.length) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""lengths: ${lengths.collect().mkString(", ")}""") } } // scalastyle:on println
Example 73
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object ZipExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("ZipExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits1 = sc.parallelize( Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val fruits2 = sc.parallelize( Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ")) val zipped = fruits1.zip(fruits2) println(s"""fruits1: ${fruits1.collect().mkString(", ")}""") println(s"""fruits2: ${fruits2.collect().mkString(", ")}""") println(s"""zipped: ${zipped.collect().mkString(", ")}""") } } // scalastyle:on println
Example 74
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object DistinctExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("DistinctExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val uniques = fruits.distinct() println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""uniques: ${uniques.collect().mkString(", ")}""") } } // scalastyle:on println
Example 75
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object SampleExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("SampleExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val samples = fruits.sample(withReplacement = false, 0.5, 1) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""samples: ${samples.collect().mkString(", ")}""") } } // scalastyle:on println
Example 76
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object FilterExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("FilterExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val startWithPs = fruits.filter(fruit => fruit.startsWith("P")) println(s"""fruits: ${fruits.collect().mkString(", ")}""") println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""") } } // scalastyle:on println
Example 77
Source File: JdbcExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

// scalastyle:off println
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val url = args(0)
    val user = args(1)
    val pass = args(2)

    val conf = new SparkConf().setAppName("JdbcExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext, url, user, pass)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext,
      url: String, user: String, pass: String): Unit = {
    // Read the gihyo_spark.person table over JDBC into a DataFrame
    val prop = new Properties()
    prop.setProperty("user", user)
    prop.setProperty("password", pass)
    val df: DataFrame = sqlContext.read.jdbc(url, "gihyo_spark.person", prop)

    df.printSchema()
    println("# Rows: " + df.count())
  }
}
// scalastyle:on println
Example 78
Source File: DataFrameNaFunctionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05 // scalastyle:off println import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object DataFrameNaFunctionExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("BasicDataFrameExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) run(sc, sqlContext) sc.stop() } def run( sc: SparkContext, sqlContext: SQLContext): Unit = { import sqlContext.implicits._ val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( ("Bob", 16, 176.5), ("Alice", null, 164.3), ("", 60, null), ("UNKNOWN", 25, Double.NaN), ("Amy", null, null), (null, null, Double.NaN) ).toDF("name", "age", "height") // drop nullDF.na.drop("any").show() nullDF.na.drop("all").show() nullDF.na.drop(Array("age")).show() nullDF.na.drop(Seq("age", "height")).show() nullDF.na.drop("any", Array("name", "age")).show() nullDF.na.drop("all", Array("age", "height")).show() // fill nullDF.na.fill(0.0, Array("name", "height")).show() nullDF.na.fill(Map( "name" -> "UNKNOWN", "height" -> 0.0 )).show() // replace nullDF.na.replace("name", Map("" -> "UNKNOWN")).show() } } // scalastyle:on println
Example 79
Source File: DatasetExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
}
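This example targets the Spark 1.6-era SQLContext entry point for Datasets. On Spark 2.x and later the same steps go through SparkSession; the sketch below shows the rough equivalent under that assumption and is not part of the book example.

import org.apache.spark.sql.SparkSession

object DatasetExampleSpark2 {
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // SparkSession replaces SQLContext as the entry point in Spark 2.x
    val spark = SparkSession.builder()
      .appName("DatasetExampleSpark2")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val ds = Seq(Person(1, "Bob", 23), Person(2, "Tom", 23), Person(3, "John", 22)).toDS()
    ds.filter(_.age == 23).show()
    ds.groupBy("age").count().show()

    spark.stop()
  }
}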
Example 80
Source File: TestStreamingContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterEach, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import jp.gihyo.spark.ch06.UserDic private[spark] trait TestStreamingContext extends BeforeAndAfterEach { self: Suite => @transient var ssc: StreamingContext = _ @transient var sc: SparkContext = _ val master = "local[2]" val appN = "StreamingUnitTest" val bd = Seconds(1) override def beforeEach() { super.beforeEach() val conf = new SparkConf().setMaster(master) .setAppName(appN) .set("spark.streaming.clock", "org.apache.spark.util.ManualClock") .registerKryoClasses(Array(classOf[UserDic])) ssc = new StreamingContext(conf, bd) sc = ssc.sparkContext } override def afterEach() { try { if (ssc != null) { // stop with sc ssc.stop(true) } ssc = null; } finally { super.afterEach() } } }
Example 81
Source File: TestSparkContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext private[spark] trait TestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("SparkUnitTest") .set("spark.sql.shuffle.partitions", "2") sc = new SparkContext(conf) SQLContext.clearActive() sqlContext = new SQLContext(sc) SQLContext.setActive(sqlContext) } override def afterAll() { try { sqlContext = null SQLContext.clearActive() if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } }
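A minimal sketch of how a test suite might mix in the TestSparkContext trait above; the suite name and assertion are illustrative, not taken from the repository.

package jp.gihyo.spark

import org.scalatest.FunSuite

class WordLengthSuite extends FunSuite with TestSparkContext {
  test("computes word lengths with the shared SparkContext") {
    // sc is initialized by the trait's beforeAll and stopped in afterAll
    val lengths = sc.parallelize(Seq("Apple", "Orange")).map(_.length).collect()
    assert(lengths.sameElements(Array(5, 6)))
  }
}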
Example 82
Source File: TestMain.scala From hbrdd with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkContext, SparkConf} object TestMain { private val master = "Core1" private val port = "7077" private val appName = "hbase-rdd_spark" private val data = "hdfs://Master1:8020/test/spark/hbase/testhb" def main(args: Array[String]) { val sparkConf = new SparkConf() .setMaster(s"spark://$master:$port") .setAppName(appName).setJars(List("/home/lele/coding/hbrdd/out/artifacts/hbrdd_jar/hbrdd.jar")) val sc = new SparkContext(sparkConf) val ret = sc.textFile(data).map({ line => val Array(k, col1, col2, _) = line split "\t" val content = Map("col1" -> col1, "col2" -> col2) k -> content }) println(ret.count()) sc.stop() } }
Example 83
Source File: TestUtils.scala From odsc-east-realish-predictions with Apache License 2.0 | 5 votes |
package com.twilio.open.odsc.realish import com.holdenkarau.spark.testing.{LocalSparkContext, SparkContextProvider} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} object TestUtils { } @SerialVersionUID(1L) case class UserPersonality(uuid: String, name: String, tags: Array[String]) extends Serializable @SerialVersionUID(1L) case class Author(uuid: String, name: String, age: Int) extends Serializable @SerialVersionUID(1L) case class LibraryBook(uuid: String, name: String, author: Author) extends Serializable case class MockKafkaDataFrame(key: Array[Byte], value: Array[Byte]) trait SharedSparkSql extends BeforeAndAfterAll with SparkContextProvider { self: Suite => @transient var _sparkSql: SparkSession = _ @transient private var _sc: SparkContext = _ override def sc: SparkContext = _sc def conf: SparkConf def sparkSql: SparkSession = _sparkSql override def beforeAll() { _sparkSql = SparkSession.builder().config(conf).getOrCreate() _sc = _sparkSql.sparkContext setup(_sc) super.beforeAll() } override def afterAll() { try { _sparkSql.close() _sparkSql = null LocalSparkContext.stop(_sc) _sc = null } finally { super.afterAll() } } }
Example 85
Source File: DatasetLoaderApp.scala From spark_recommender with Apache License 2.0 | 5 votes |
package es.alvsanand.spark_recommender import es.alvsanand.spark_recommender.parser.{DatasetDownloader, DatasetIngestion} import es.alvsanand.spark_recommender.utils.{ESConfig, Logging, MongoConfig} import org.apache.spark.SparkConf import scopt.OptionParser object DatasetLoaderApp extends App with Logging { override def main(args: Array[String]) { val defaultParams = scala.collection.mutable.Map[String, Any]() defaultParams += "spark.cores" -> "local[*]" defaultParams += "spark.option" -> scala.collection.mutable.Map[String, String]() defaultParams += "mongo.uri" -> "mongodb://127.0.0.1:27017/spark_recommender" defaultParams += "mongo.db" -> "spark_recommender" defaultParams += "es.httpHosts" -> "127.0.0.1:9200" defaultParams += "es.transportHosts" -> "127.0.0.1:9300" defaultParams += "es.index" -> "spark_recommender" defaultParams += "dataset.tmp.dir" -> "%s/.spark_recommender".format(sys.env("HOME")) val parser = new OptionParser[scala.collection.mutable.Map[String, Any]]("ScaleDataset") { head("Spark Recommender Example") opt[String]("spark.cores") .text("Number of cores in the Spark cluster") .action((x, c) => { c += "spark.cores" -> x }) opt[Map[String,String]]("spark.option") .text("Spark Config Option") .valueName("spark.property1=value1,spark.property2=value2,...") .action { (x, c) => { c("spark.option").asInstanceOf[scala.collection.mutable.Map[String, Any]] ++= x.toSeq c } } opt[String]("mongo.uri") .text("Mongo URI including the DB") .action((x, c) => { c += "mongo.uri" -> x }) opt[String]("mongo.db") .text("Mongo Database") .action((x, c) => { c += "mongo.db" -> x }) opt[String]("es.httpHosts") .text("ElasicSearch HTTP Hosts") .action((x, c) => { c += "es.httpHosts" -> x }) opt[String]("es.transportHosts") .text("ElasicSearch Transport Hosts") .action((x, c) => { c += "es.transportHosts" -> x }) opt[String]("es.index") .text("ElasicSearch index") .action((x, c) => { c += "es.index" -> x }) opt[String]("dataset.tmp.dir") .text("Temporal directory to store the products dataset") .action((x, c) => { c += "dataset.tmp.dir" -> x }) opt[String]("dataset.file") .text("Ingest only one dataset file") .action((x, c) => { c += "dataset.file" -> x }) help("help") text("prints this usage text") } parser.parse(args, defaultParams).map { params => run(params.toMap) } getOrElse { System.exit(1) } } private def run(params: Map[String, Any]): Unit = { implicit val conf = new SparkConf().setAppName("RecommenderTrainerApp").setMaster(params("spark.cores").asInstanceOf[String]) params("spark.option").asInstanceOf[scala.collection.mutable.Map[String, Any]].foreach { case (key: String, value: String) => conf.set(key, value) } implicit val mongoConf = new MongoConfig(params("mongo.uri").asInstanceOf[String], params("mongo.db").asInstanceOf[String]) implicit val esConf = new ESConfig(params("es.httpHosts").asInstanceOf[String], params("es.transportHosts").asInstanceOf[String], params("es.index").asInstanceOf[String]) try { DatasetDownloader.download(params("dataset.tmp.dir").asInstanceOf[String]) DatasetIngestion.storeData(DatasetDownloader.getFinalDstName(params("dataset.tmp.dir").asInstanceOf[String]), Option(params.getOrElse("dataset.file", null).asInstanceOf[String])) } catch { case e: Exception => logger.error("Error executing DatasetLoaderApp", e) sys.exit(1) } sys.exit(0) } }
Example 86
Source File: RecommenderTrainerApp.scala From spark_recommender with Apache License 2.0 | 5 votes |
package es.alvsanand.spark_recommender import es.alvsanand.spark_recommender.trainer.ALSTrainer import es.alvsanand.spark_recommender.utils.{Logging, MongoConfig} import org.apache.spark.SparkConf import scopt.OptionParser object RecommenderTrainerApp extends App with Logging { override def main(args: Array[String]) { val defaultParams = scala.collection.mutable.Map[String, Any]() defaultParams += "spark.cores" -> "local[*]" defaultParams += "spark.option" -> scala.collection.mutable.Map[String, String]() defaultParams += "mongo.uri" -> "mongodb://127.0.0.1:27017/spark_recommender" defaultParams += "mongo.db" -> "spark_recommender" defaultParams += "maxRecommendations" -> ALSTrainer.MAX_RECOMMENDATIONS.toString val parser = new OptionParser[scala.collection.mutable.Map[String, Any]]("RecommenderTrainerApp") { head("Recommendation System Trainer") opt[String]("spark.cores") .text("Number of cores in the Spark cluster") .action((x, c) => { c += "spark.cores" -> x }) opt[Map[String,String]]("spark.option") .text("Spark Config Option") .valueName("spark.property1=value1,spark.property2=value2,...") .action { (x, c) => { c("spark.option").asInstanceOf[scala.collection.mutable.Map[String, Any]] ++= x.toSeq c } } opt[String]("mongo.uri") .text("Mongo Hosts") .action((x, c) => { c += "mongo.uri" -> x }) opt[String]("mongo.db") .text("Mongo Database") .action((x, c) => { c += "mongo.db" -> x }) opt[String]("maxRecommendations") .text("Maximum number of recommendations") .action((x, c) => { c += "maxRecommendations" -> x }) help("help") text("prints this usage text") } parser.parse(args, defaultParams).map { params => run(params.toMap) } getOrElse { System.exit(1) } } private def run(params: Map[String, Any]): Unit = { implicit val conf = new SparkConf().setAppName("RecommenderTrainerApp").setMaster(params("spark.cores").asInstanceOf[String]) params("spark.option").asInstanceOf[scala.collection.mutable.Map[String, Any]].foreach { case (key: String, value: String) => conf.set(key, value) } implicit val mongoConf = new MongoConfig(params("mongo.uri").asInstanceOf[String], params("mongo.db").asInstanceOf[String]) val maxRecommendations = params("maxRecommendations").asInstanceOf[String].toInt try { ALSTrainer.calculateRecs(maxRecommendations) } catch { case e: Exception => logger.error("Error executing RecommenderTrainerApp", e) sys.exit(1) } sys.exit(0) } }
Example 87
Source File: HyperLogLog.scala From spark-hyperloglog with Apache License 2.0 | 5 votes |
package com.mozilla.spark.sql.hyperloglog.test import com.mozilla.spark.sql.hyperloglog.aggregates._ import com.mozilla.spark.sql.hyperloglog.functions._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{FlatSpec, Matchers} class HyperLogLogTest extends FlatSpec with Matchers{ "Algebird's HyperLogLog" can "be used from Spark" in { val sparkConf = new SparkConf().setAppName("HyperLogLog") sparkConf.setMaster(sparkConf.get("spark.master", "local[1]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val hllMerge = new HyperLogLogMerge sqlContext.udf.register("hll_merge", hllMerge) sqlContext.udf.register("hll_create", hllCreate _) sqlContext.udf.register("hll_cardinality", hllCardinality _) val frame = sc.parallelize(List("a", "b", "c", "c"), 4).toDF("id") val count = frame .select(expr("hll_create(id, 12) as hll")) .groupBy() .agg(expr("hll_cardinality(hll_merge(hll)) as count")) .collect() count(0)(0) should be (3) } }
Example 88
Source File: SparkManager.scala From darwin with Apache License 2.0 | 5 votes |
package it.agilelab.darwin.app.spark import com.typesafe.config.Config import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ trait SparkManager { val sparkManagerLogger: Logger = LoggerFactory.getLogger("SparkManager") protected def defaultParallelism(implicit sparkSession: SparkSession, config: Config): Int = { sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_EXECUTOR_INSTANCES) match { case Some(instances) => sparkSession.conf.getOption(SparkConfigurationKeys.SPARK_CORES).getOrElse("1").toInt * instances.toInt case None => sparkManagerLogger.info("Spark is configured with dynamic allocation, default parallelism will be gathered from app " + "conf: " + "next.process.parallelism") if (config.hasPath(SparkConfigurationKeys.PARALLELISM)) { config.getInt(SparkConfigurationKeys.PARALLELISM) } else { sparkManagerLogger.info("next.process.parallelism was not set fallback to sparkSession.defaultParallelism") sparkSession.sparkContext.defaultParallelism } } } }
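A minimal sketch of how the defaultParallelism helper above might be used to size a repartition. The driver object is hypothetical and assumes an implicit SparkSession and a Typesafe Config are in scope; it is not part of the darwin project.

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark.sql.SparkSession

import it.agilelab.darwin.app.spark.SparkManager

object SparkManagerUsage extends SparkManager {
  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession = SparkSession.builder()
      .appName("SparkManagerUsage")
      .master("local[*]")
      .getOrCreate()
    implicit val config: Config = ConfigFactory.load()

    // Repartition an example DataFrame to the parallelism derived by the trait
    val df = spark.range(0, 1000).toDF("id").repartition(defaultParallelism)
    println(s"partitions = ${df.rdd.getNumPartitions}")

    spark.stop()
  }
}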
Example 89
Source File: LinearPixels.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.images.cifar import breeze.linalg.DenseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.CifarLoader import keystoneml.nodes.images.{GrayScaler, ImageExtractor, ImageVectorizer, LabelExtractor} import keystoneml.nodes.learning.LinearMapEstimator import keystoneml.nodes.util.{Cacher, ClassLabelIndicatorsFromIntLabels, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.utils.Image import keystoneml.workflow.Pipeline object LinearPixels extends Logging { val appName = "LinearPixels" case class LinearPixelsConfig(trainLocation: String = "", testLocation: String = "") def run(sc: SparkContext, config: LinearPixelsConfig): Pipeline[Image, Int] = { val numClasses = 10 // Load and cache the training data. val trainData = CifarLoader(sc, config.trainLocation).cache() val trainImages = ImageExtractor(trainData) val labelExtractor = LabelExtractor andThen ClassLabelIndicatorsFromIntLabels(numClasses) andThen new Cacher[DenseVector[Double]] val trainLabels = labelExtractor(trainData) // A featurizer maps input images into vectors. For this pipeline, we'll also convert the image to grayscale. // We then estimate our model by calling a linear solver on our data. val predictionPipeline = GrayScaler andThen ImageVectorizer andThen (new LinearMapEstimator, trainImages, trainLabels) andThen MaxClassifier // Calculate training error. val evaluator = new MulticlassClassifierEvaluator(numClasses) val trainEval = evaluator.evaluate(predictionPipeline(trainImages), LabelExtractor(trainData)) // Do testing. val testData = CifarLoader(sc, config.testLocation) val testImages = ImageExtractor(testData) val testLabels = labelExtractor(testData) val testEval = evaluator.evaluate(predictionPipeline(testImages), LabelExtractor(testData)) logInfo(s"Training accuracy: \n${trainEval.totalAccuracy}") logInfo(s"Test accuracy: \n${testEval.totalAccuracy}") predictionPipeline } def parse(args: Array[String]): LinearPixelsConfig = new OptionParser[LinearPixelsConfig](appName) { head(appName, "0.1") help("help") text("prints this usage text") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } }.parse(args, LinearPixelsConfig()).get def main(args: Array[String]) = { val appConfig = parse(args) val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) run(sc, appConfig) sc.stop() } }
Example 90
Source File: AmazonReviewsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text import breeze.linalg.SparseVector import keystoneml.evaluation.BinaryClassifierEvaluator import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData} import keystoneml.nodes.learning.LogisticRegressionEstimator import keystoneml.nodes.nlp._ import keystoneml.nodes.stats.TermFrequency import keystoneml.nodes.util.CommonSparseFeatures import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.workflow.Pipeline object AmazonReviewsPipeline extends Logging { val appName = "AmazonReviewsPipeline" def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = { val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache()) val training = trainData.data val labels = trainData.labels // Build the classifier estimator val predictor = Trim andThen LowerCase() andThen Tokenizer() andThen NGramsFeaturizer(1 to conf.nGrams) andThen TermFrequency(x => 1) andThen (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters), training, labels) // Evaluate the classifier val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache()) val testLabels = testData.labels val testResults = predictor(testData.data) val eval = BinaryClassifierEvaluator.evaluate(testResults.get.map(_ > 0), testLabels.map(_ > 0)) logInfo("\n" + eval.summary()) predictor } case class AmazonReviewsConfig( trainLocation: String = "", testLocation: String = "", threshold: Double = 3.5, nGrams: Int = 2, commonFeatures: Int = 100000, numIters: Int = 20, numParts: Int = 512) def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) { head(appName, "0.1") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } opt[Double]("threshold") action { (x,c) => c.copy(threshold=x)} opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) } opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) } opt[Int]("numIters") action { (x,c) => c.copy(numParts=x) } opt[Int]("numParts") action { (x,c) => c.copy(numParts=x) } }.parse(args, AmazonReviewsConfig()).get def main(args: Array[String]) = { val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val spark = SparkSession.builder.config(conf).getOrCreate() val appConfig = parse(args) run(spark, appConfig) spark.stop() } }
Example 91
Source File: NewsgroupsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text import breeze.linalg.SparseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.NewsgroupsDataLoader import keystoneml.nodes.learning.NaiveBayesEstimator import keystoneml.nodes.nlp._ import keystoneml.nodes.stats.TermFrequency import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.workflow.Pipeline object NewsgroupsPipeline extends Logging { val appName = "NewsgroupsPipeline" def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = { val trainData = NewsgroupsDataLoader(sc, conf.trainLocation) val numClasses = NewsgroupsDataLoader.classes.length // Build the classifier estimator logInfo("Training classifier") val predictor = Trim andThen LowerCase() andThen Tokenizer() andThen NGramsFeaturizer(1 to conf.nGrams) andThen TermFrequency(x => 1) andThen (CommonSparseFeatures[Seq[String]](conf.commonFeatures), trainData.data) andThen (NaiveBayesEstimator[SparseVector[Double]](numClasses), trainData.data, trainData.labels) andThen MaxClassifier // Evaluate the classifier logInfo("Evaluating classifier") val testData = NewsgroupsDataLoader(sc, conf.testLocation) val testLabels = testData.labels val testResults = predictor(testData.data) val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels) logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes)) predictor } case class NewsgroupsConfig( trainLocation: String = "", testLocation: String = "", nGrams: Int = 2, commonFeatures: Int = 100000) def parse(args: Array[String]): NewsgroupsConfig = new OptionParser[NewsgroupsConfig](appName) { head(appName, "0.1") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) } opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) } }.parse(args, NewsgroupsConfig()).get def main(args: Array[String]) = { val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) val appConfig = parse(args) run(sc, appConfig) sc.stop() } }
Example 92
Source File: HiSpeedRead.scala From spark-db2 with Apache License 2.0 | 5 votes |
import com.ibm.spark.ibmdataserver.Constants import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object HiSpeedRead { def main(args: Array[String]) { val DB2_CONNECTION_URL = "jdbc:db2://localhost:50700/sample:traceFile=C:\\1.txt;" val conf = new SparkConf().setMaster("local[2]").setAppName("read test") val sparkContext = new SparkContext(conf) val sqlContext = new SQLContext(sparkContext) Class.forName("com.ibm.db2.jcc.DB2Driver") val jdbcRdr = sqlContext.read.format("com.ibm.spark.ibmdataserver") .option("url", DB2_CONNECTION_URL) // .option(Constants.TABLE, tableName) .option("user", "pallavipr") .option("password", "9manjari") .option("dbtable", "employee") .load() jdbcRdr.show() } }
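The vendor connector above is addressed through its own data source name; for comparison, the same table can usually also be read with Spark's built-in JDBC source. The sketch below is not part of the original file: the URL, driver, credentials, and table name simply mirror the placeholders used above.

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object JdbcReadSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("jdbc read sketch")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Placeholder connection details -- substitute your own.
    val df = sqlContext.read.format("jdbc")
      .option("url", "jdbc:db2://localhost:50700/sample")
      .option("driver", "com.ibm.db2.jcc.DB2Driver")
      .option("user", "user")
      .option("password", "password")
      .option("dbtable", "employee")
      .load()

    df.show()
    sc.stop()
  }
}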
Example 93
Source File: HiveExternalCatalogSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType class HiveExternalCatalogSuite extends ExternalCatalogSuite { private val externalCatalog: HiveExternalCatalog = { val catalog = new HiveExternalCatalog(new SparkConf, new Configuration) catalog.client.reset() catalog } protected override val utils: CatalogTestUtils = new CatalogTestUtils { override val tableInputFormat: String = "org.apache.hadoop.mapred.SequenceFileInputFormat" override val tableOutputFormat: String = "org.apache.hadoop.mapred.SequenceFileOutputFormat" override def newEmptyCatalog(): ExternalCatalog = externalCatalog override val defaultProvider: String = "hive" } protected override def resetState(): Unit = { externalCatalog.client.reset() } import utils._ test("SPARK-18647: do not put provider in table properties for Hive serde table") { val catalog = newBasicCatalog() val hiveTable = CatalogTable( identifier = TableIdentifier("hive_tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = storageFormat, schema = new StructType().add("col1", "int").add("col2", "string"), provider = Some("hive")) catalog.createTable(hiveTable, ignoreIfExists = false) val rawTable = externalCatalog.client.getTable("db1", "hive_tbl") assert(!rawTable.properties.contains(HiveExternalCatalog.DATASOURCE_PROVIDER)) assert(DDLUtils.isHiveTable(externalCatalog.getTable("db1", "hive_tbl"))) } Seq("parquet", "hive").foreach { format => test(s"Partition columns should be put at the end of table schema for the format $format") { val catalog = newBasicCatalog() val newSchema = new StructType() .add("col1", "int") .add("col2", "string") .add("partCol1", "int") .add("partCol2", "string") val table = CatalogTable( identifier = TableIdentifier("tbl", Some("db1")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType() .add("col1", "int") .add("partCol1", "int") .add("partCol2", "string") .add("col2", "string"), provider = Some(format), partitionColumnNames = Seq("partCol1", "partCol2")) catalog.createTable(table, ignoreIfExists = false) val restoredTable = externalCatalog.getTable("db1", "tbl") assert(restoredTable.schema == newSchema) } } test("SPARK-22306: alter table schema should not erase the bucketing metadata at hive side") { val catalog = newBasicCatalog() externalCatalog.client.runSqlHive( """ |CREATE TABLE db1.t(a string, b string) |CLUSTERED BY (a, b) SORTED BY (a, b) INTO 10 BUCKETS |STORED AS PARQUET """.stripMargin) val newSchema = new StructType().add("a", "string").add("b", "string").add("c", "string") catalog.alterTableDataSchema("db1", "t", newSchema) assert(catalog.getTable("db1", "t").schema == newSchema) val bucketString = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t") .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } test("SPARK-23001: NullPointerException when running desc database") { val catalog = newBasicCatalog() catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) assert(catalog.getDatabase("dbWithNullDesc").description == "") } }
Example 94
Source File: ConcurrentHiveSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.sql.hive.test.TestHiveContext class ConcurrentHiveSuite extends SparkFunSuite with BeforeAndAfterAll { ignore("multiple instances not supported") { test("Multiple Hive Instances") { (1 to 10).map { i => val conf = new SparkConf() conf.set("spark.ui.enabled", "false") val ts = new TestHiveContext(new SparkContext("local", s"TestSQLContext$i", conf)) ts.sparkSession.sql("SHOW TABLES").collect() ts.sparkSession.sql("SELECT * FROM src").collect() ts.sparkSession.sql("SHOW TABLES").collect() } } } }
Example 95
Source File: HiveUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.net.URL import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("newTemporaryConfiguration overwrites listener configurations") { Seq(true, false).foreach { useInMemoryDerby => val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "") } } test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") { sys.props.put("spark.hadoop.foo", "bar") Seq(true, false) foreach { useInMemoryDerby => val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(!hiveConf.contains("spark.hadoop.foo")) assert(hiveConf("foo") === "bar") } } test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) try { Thread.currentThread().setContextClassLoader(loader) HiveUtils.newClientForMetadata( conf, SparkHadoopUtil.newConfiguration(conf), HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) } finally { Thread.currentThread().setContextClassLoader(contextClassLoader) } } test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") } }
Example 96
Source File: HiveClientBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.VersionInfo import org.apache.spark.SparkConf import org.apache.spark.util.Utils private[client] object HiveClientBuilder { // In order to speed up test execution during development or in Jenkins, you can specify the path // of an existing Ivy cache: private val ivyPath: Option[String] = { sys.env.get("SPARK_VERSIONS_SUITE_IVY_PATH").orElse( Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } private def buildConf(extraConf: Map[String, String]) = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() extraConf ++ Map( "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$metastorePath;create=true", "hive.metastore.warehouse.dir" -> warehousePath.toString) } // for testing only def buildClient( version: String, hadoopConf: Configuration, extraConf: Map[String, String] = Map.empty, sharesHadoopClasses: Boolean = true): HiveClient = { IsolatedClientLoader.forVersion( hiveMetastoreVersion = version, hadoopVersion = VersionInfo.getVersion, sparkConf = new SparkConf(), hadoopConf = hadoopConf, config = buildConf(extraConf), ivyPath = ivyPath, sharesHadoopClasses = sharesHadoopClasses).createClient() } }
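Because HiveClientBuilder is private[client], any caller has to live in the same package. A minimal sketch of how a test might use it follows; it is not taken from the original file, and the "1.2.1" metastore version is an assumption that must match a version supported by this Spark branch's IsolatedClientLoader.

package org.apache.spark.sql.hive.client

import org.apache.hadoop.conf.Configuration

object HiveClientBuilderSketch {
  def main(args: Array[String]): Unit = {
    // Builds an isolated HiveClient against a temporary Derby metastore (see buildConf above).
    val client: HiveClient = HiveClientBuilder.buildClient(
      version = "1.2.1",                 // assumed metastore version, not from the source
      hadoopConf = new Configuration())
    println(s"created client for Hive ${client.version}")
  }
}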
Example 97
Source File: HiveContextCompatibilitySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.scalatest.BeforeAndAfterEach import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach { override protected val enableAutoThreadAudit = false private var sc: SparkContext = null private var hc: HiveContext = null override def beforeAll(): Unit = { super.beforeAll() sc = SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("test")) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true).foreach { case (k, v) => sc.hadoopConfiguration.set(k, v) } hc = new HiveContext(sc) } override def afterEach(): Unit = { try { hc.sharedState.cacheManager.clearCache() hc.sessionState.catalog.reset() } finally { super.afterEach() } } override def afterAll(): Unit = { try { sc = null hc = null } finally { super.afterAll() } } test("basic operations") { val _hc = hc import _hc.implicits._ val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x") val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c") .select($"a", $"b") .filter($"a" > 10 && $"b" > 6 && $"c") val df3 = df1.join(df2, "a") val res = df3.collect() val expected = Seq((18, 18, 8)).toDF("a", "x", "b").collect() assert(res.toSeq == expected.toSeq) df3.createOrReplaceTempView("mai_table") val df4 = hc.table("mai_table") val res2 = df4.collect() assert(res2.toSeq == expected.toSeq) } test("basic DDLs") { val _hc = hc import _hc.implicits._ val databases = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases.toSeq == Seq("default")) hc.sql("CREATE DATABASE mee_db") hc.sql("USE mee_db") val databases2 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases2.toSet == Set("default", "mee_db")) val df = (1 to 10).map { i => ("bob" + i.toString, i) }.toDF("name", "age") df.createOrReplaceTempView("mee_table") hc.sql("CREATE TABLE moo_table (name string, age int)") hc.sql("INSERT INTO moo_table SELECT * FROM mee_table") assert( hc.sql("SELECT * FROM moo_table order by name").collect().toSeq == df.collect().toSeq.sortBy(_.getString(0))) val tables = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables.toSet == Set("moo_table", "mee_table")) hc.sql("DROP TABLE moo_table") hc.sql("DROP TABLE mee_table") val tables2 = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables2.isEmpty) hc.sql("USE default") hc.sql("DROP DATABASE mee_db CASCADE") val databases3 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases3.toSeq == Seq("default")) } }
Example 98
Source File: SparkSQLEnv.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.util.Utils def stop() { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() sparkContext = null sqlContext = null } } }
Example 99
Source File: HiveCliSessionStateSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.cli.CliSessionState import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.HiveUtils class HiveCliSessionStateSuite extends SparkFunSuite { def withSessionClear(f: () => Unit): Unit = { try f finally SessionState.detachSession() } test("CliSessionState will be reused") { withSessionClear { () => val hiveConf = new HiveConf(classOf[SessionState]) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hiveConf.set(key, value) } val sessionState: SessionState = new CliSessionState(hiveConf) SessionState.start(sessionState) val s1 = SessionState.get val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) val s2 = HiveUtils.newClientForMetadata(sparkConf, hadoopConf).getState assert(s1 === s2) assert(s2.isInstanceOf[CliSessionState]) } } test("SessionState will not be reused") { withSessionClear { () => val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hadoopConf.set(key, value) } val hiveClient = HiveUtils.newClientForMetadata(sparkConf, hadoopConf) val s1 = hiveClient.getState val s2 = hiveClient.newSession().getState assert(s1 !== s2) } } }
Example 100
Source File: DataSourceManagerFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils object DataSourceManagerFactory { def create( datasourceType: String, conf: SparkConf, hadoopConf: Configuration): DataSourceManager = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceManager], loader) var cls: Class[_] = null // As we use ServiceLoader to support creating any user provided DataSourceManager here, // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister must be packaged properly // in user's jar, and the implementation of DataSourceManager must have a public parameterless // constructor. For scala language, def this() = this(null...) just work. try { cls = serviceLoader.asScala .filter(_.shortName().equals(datasourceType)) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException(s"error when instantiate datasource ${datasourceType}") } } catch { case _: Exception => throw new SparkException( s"""Can't find corresponding DataSourceManager for ${datasourceType} type, |please check |1. META-INF/services/org.apache.spark.sql.sources.DataSourceRegister is packaged |2. your implementation of DataSourceManager's shortname is ${datasourceType} |3. your implementation of DataSourceManager must have a public parameterless | constructor. For scala language, def this() = this(null, null, ...) just work. """.stripMargin) } try { val constructor = cls.getConstructor(classOf[SparkConf], classOf[Configuration]) val newHadoopConf = new Configuration(hadoopConf) constructor.newInstance(conf, newHadoopConf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => try { cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => cls.getConstructor().newInstance().asInstanceOf[DataSourceManager] } } } }
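The ServiceLoader lookup above only finds implementations that are registered on the classpath. The sketch below shows the shape of that registration; it is entirely illustrative, the class and short name are hypothetical, and DataSourceManager's abstract members are not shown in this example, so the commented class body is indicative rather than compilable.

// Registration file (one fully-qualified class name per line):
//   META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
//   com.example.MyDataSourceManager                  <- hypothetical implementation
//
// Indicative shape of the implementation:
// class MyDataSourceManager(conf: SparkConf, hadoopConf: Configuration) extends DataSourceManager {
//   def this() = this(null, null)                    // public parameterless constructor, as required above
//   override def shortName(): String = "mydb"        // matched against the datasourceType argument
//   // ... remaining DataSourceManager members are not shown in this example ...
// }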
Example 101
Source File: XSQLTestSparkSession.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.TestSparkSession import org.apache.spark.sql.xsql.XSQLSessionStateBuilder class XSQLTestSparkSession(sc: SparkContext) extends TestSparkSession(sc) { self => def this(sparkConf: SparkConf) { this( new SparkContext( "local[2]", "test-sql-context", sparkConf.set("spark.sql.testkey", "true").set(CATALOG_IMPLEMENTATION, "xsql"))) } @transient override lazy val sessionState: SessionState = { new XSQLSessionStateBuilder(this, None).build() } }
Example 102
Source File: SQLHistoryServerPlugin.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.ui import org.apache.spark.SparkConf import org.apache.spark.scheduler.SparkListener import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore} import org.apache.spark.ui.SparkUI class SQLHistoryServerPlugin extends AppHistoryServerPlugin { override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = { Seq(new SQLAppStatusListener(conf, store, live = false)) } override def setupUI(ui: SparkUI): Unit = { val sqlStatusStore = new SQLAppStatusStore(ui.store.store) if (sqlStatusStore.executionsCount() > 0) { new SQLTab(sqlStatusStore, ui) } } }
Example 103
Source File: DataSourceWriteBenchmark.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Benchmark trait DataSourceWriteBenchmark { val conf = new SparkConf() .setAppName("DataSourceWriteBenchmark") .setIfMissing("spark.master", "local[1]") .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") val spark = SparkSession.builder.config(conf).getOrCreate() val tempTable = "temp" val numRows = 1024 * 1024 * 15 def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) } def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => spark.sql(s"DROP TABLE IF EXISTS $name") } } } def writeNumeric(table: String, format: String, benchmark: Benchmark, dataType: String): Unit = { spark.sql(s"create table $table(id $dataType) using $format") benchmark.addCase(s"Output Single $dataType Column") { _ => spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS $dataType) AS c1 FROM $tempTable") } } def writeIntString(table: String, format: String, benchmark: Benchmark): Unit = { spark.sql(s"CREATE TABLE $table(c1 INT, c2 STRING) USING $format") benchmark.addCase("Output Int and String Column") { _ => spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS " + s"c1, CAST(id AS STRING) AS c2 FROM $tempTable") } } def writePartition(table: String, format: String, benchmark: Benchmark): Unit = { spark.sql(s"CREATE TABLE $table(p INT, id INT) USING $format PARTITIONED BY (p)") benchmark.addCase("Output Partitions") { _ => spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS id," + s" CAST(id % 2 AS INT) AS p FROM $tempTable") } } def writeBucket(table: String, format: String, benchmark: Benchmark): Unit = { spark.sql(s"CREATE TABLE $table(c1 INT, c2 INT) USING $format CLUSTERED BY (c2) INTO 2 BUCKETS") benchmark.addCase("Output Buckets") { _ => spark.sql(s"INSERT OVERWRITE TABLE $table SELECT CAST(id AS INT) AS " + s"c1, CAST(id AS INT) AS c2 FROM $tempTable") } } def runBenchmark(format: String): Unit = { val tableInt = "tableInt" val tableDouble = "tableDouble" val tableIntString = "tableIntString" val tablePartition = "tablePartition" val tableBucket = "tableBucket" withTempTable(tempTable) { spark.range(numRows).createOrReplaceTempView(tempTable) withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) { val benchmark = new Benchmark(s"$format writer benchmark", numRows) writeNumeric(tableInt, format, benchmark, "Int") writeNumeric(tableDouble, format, benchmark, "Double") writeIntString(tableIntString, format, benchmark) writePartition(tablePartition, format, benchmark) writeBucket(tableBucket, format, benchmark) benchmark.run() } } } }
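One way to drive this trait is to mix it into a runnable object and pick a format; the object name and the "parquet" choice below are illustrative and not part of the source file.

package org.apache.spark.sql.execution.benchmark

// Sketch: reuses the trait above from the same package.
object ParquetWriteBenchmark extends DataSourceWriteBenchmark {
  def main(args: Array[String]): Unit = {
    runBenchmark("parquet")   // any format with a write path ("orc", "json", "csv") should also work here
    spark.stop()
  }
}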
Example 104
Source File: SaveIntoDataSourceCommandSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkConf import org.apache.spark.sql.SaveMode import org.apache.spark.sql.test.SharedSQLContext class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" val PASS = "mypassword" val DRIVER = "mydriver" val dataSource = DataSource( sparkSession = spark, className = "jdbc", partitionColumns = Nil, options = Map("password" -> PASS, "url" -> URL, "driver" -> DRIVER)) val logicalPlanString = dataSource .planForWriting(SaveMode.ErrorIfExists, spark.range(1).logicalPlan) .treeString(true) assert(!logicalPlanString.contains(URL)) assert(!logicalPlanString.contains(PASS)) assert(logicalPlanString.contains(DRIVER)) } }
Example 105
Source File: DataSourceScanExecRedactionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.redaction.string.regex", "file:/[\\w_]+") test("treeString is redacted") { withTempDir { dir => val basePath = dir.getCanonicalPath spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) val df = spark.read.parquet(basePath) val rootPath = df.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get .asInstanceOf[FileSourceScanExec].relation.location.rootPaths.head assert(rootPath.toString.contains(dir.toURI.getPath.stripSuffix("/"))) assert(!df.queryExecution.sparkPlan.treeString(verbose = true).contains(rootPath.getName)) assert(!df.queryExecution.executedPlan.treeString(verbose = true).contains(rootPath.getName)) assert(!df.queryExecution.toString.contains(rootPath.getName)) assert(!df.queryExecution.simpleString.contains(rootPath.getName)) val replacement = "*********" assert(df.queryExecution.sparkPlan.treeString(verbose = true).contains(replacement)) assert(df.queryExecution.executedPlan.treeString(verbose = true).contains(replacement)) assert(df.queryExecution.toString.contains(replacement)) assert(df.queryExecution.simpleString.contains(replacement)) } } private def isIncluded(queryExecution: QueryExecution, msg: String): Boolean = { queryExecution.toString.contains(msg) || queryExecution.simpleString.contains(msg) || queryExecution.stringWithStats.contains(msg) } test("explain is redacted using SQLConf") { withTempDir { dir => val basePath = dir.getCanonicalPath spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) val df = spark.read.parquet(basePath) val replacement = "*********" // Respect SparkConf and replace file:/ assert(isIncluded(df.queryExecution, replacement)) assert(isIncluded(df.queryExecution, "FileScan")) assert(!isIncluded(df.queryExecution, "file:/")) withSQLConf(SQLConf.SQL_STRING_REDACTION_PATTERN.key -> "(?i)FileScan") { // Respect SQLConf and replace FileScan assert(isIncluded(df.queryExecution, replacement)) assert(!isIncluded(df.queryExecution, "FileScan")) assert(isIncluded(df.queryExecution, "file:/")) } } } }
Example 106
Source File: SerializationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.test.SharedSQLContext class SerializationSuite extends SparkFunSuite with SharedSQLContext { test("[SPARK-5235] SQLContext should be serializable") { val spark = SparkSession.builder.getOrCreate() new JavaSerializer(new SparkConf()).newInstance().serialize(spark.sqlContext) } test("[SPARK-26409] SQLConf should be serializable") { val spark = SparkSession.builder.getOrCreate() new JavaSerializer(new SparkConf()).newInstance().serialize(spark.sessionState.conf) } }
Example 107
Source File: SharedSparkSession.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import scala.concurrent.duration._ import org.scalatest.{BeforeAndAfterEach, Suite} import org.scalatest.concurrent.Eventually import org.apache.spark.{DebugFilesystem, SparkConf} import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.SQLConf protected override def afterAll(): Unit = { try { super.afterAll() } finally { try { if (_spark != null) { try { _spark.sessionState.catalog.reset() } finally { _spark.stop() _spark = null } } } finally { SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } } protected override def beforeEach(): Unit = { super.beforeEach() DebugFilesystem.clearOpenStreams() } protected override def afterEach(): Unit = { super.afterEach() // Clear all persistent datasets after each test spark.sharedState.cacheManager.clearCache() // files can be closed from other threads, so wait a bit // normally this doesn't take more than 1s eventually(timeout(10.seconds), interval(2.seconds)) { DebugFilesystem.assertNoOpenStreams() } } }
Example 108
Source File: TestSQLContext.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.{SessionState, SessionStateBuilder, SQLConf, WithTestConf} val overrideConfs: Map[String, String] = Map( // Fewer shuffle partitions to speed up testing. SQLConf.SHUFFLE_PARTITIONS.key -> "5") } private[sql] class TestSQLSessionStateBuilder( session: SparkSession, state: Option[SessionState]) extends SessionStateBuilder(session, state) with WithTestConf { override def overrideConfs: Map[String, String] = TestSQLContext.overrideConfs override def newBuilder: NewBuilder = new TestSQLSessionStateBuilder(_, _) }
Example 109
Source File: AggregateHashMapSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.scalatest.BeforeAndAfter import org.apache.spark.SparkConf class SingleLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") // adding some checking after each test is run, assuring that the configs are not changed // in test code after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "false", "configuration parameter changed in test body") } } class TwoLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") // adding some checking after each test is run, assuring that the configs are not changed // in test code after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "true", "configuration parameter changed in test body") } } class TwoLevelAggregateHashMapWithVectorizedMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") .set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") // adding some checking after each test is run, assuring that the configs are not changed // in test code after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "true", "configuration parameter changed in test body") assert(sparkConf.get("spark.sql.codegen.aggregate.map.vectorized.enable") == "true", "configuration parameter changed in test body") } }
Example 110
Source File: MonitorFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.alarm.Alarm import org.apache.spark.util.Utils import org.apache.spark.util.kvstore.KVStore object MonitorFactory { def create( monitorName: String, alarms: Seq[Alarm], appStore: KVStore, conf: SparkConf): Monitor = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[Monitor], loader) val MonitorClass = serviceLoader.asScala .filter(_.item.equals(MonitorItem.withName(monitorName))) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException("error when instantiate spark.xsql.monitor.items") } MonitorClass.newInstance().bind(alarms).bind(appStore).bind(conf) } }
Example 111
Source File: Monitor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.alarm.{Alarm, AlertMessage} import org.apache.spark.alarm.AlertType.AlertType import org.apache.spark.internal.config.ConfigBuilder import org.apache.spark.monitor.MonitorItem.MonitorItem import org.apache.spark.scheduler.SparkListenerEvent import org.apache.spark.status.AppStatusStore import org.apache.spark.util.kvstore.KVStore trait Monitor { val alertType: Seq[AlertType] val item: MonitorItem val alarms: ArrayBuffer[Alarm] = ArrayBuffer() var kvStore: KVStore = null var appStore: AppStatusStore = null var conf: SparkConf = null def watchOut(event: SparkListenerEvent): Option[AlertMessage] def bind(alarm: Alarm): Monitor = { alarms.append(alarm) this } def bind(alarms: Seq[Alarm]): Monitor = { this.alarms.appendAll(alarms) this } def bind(kvStore: KVStore): Monitor = { this.kvStore = kvStore this.appStore = new AppStatusStore(kvStore) this } def bind(conf: SparkConf): Monitor = { this.conf = conf this } def onEvent(event: SparkListenerEvent): Unit = { val message = watchOut(event) if (message.isDefined) { alarms.foreach(_.alarm(message.get)) } } } object Monitor { val commonClasses = Seq( "org.apache.spark.sql.xsql.shell.SparkXSQLShell", "org.apache.spark.repl.Main", "org.apache.spark.sql.hive.xitong.shell.SparkHiveShell", "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver") val dateFormats = Seq("yyyy-MM-dd", "yyyy/MM/dd", "yyyyMMdd") val PREFIX = "spark.monitor" private[spark] val MONITOR_ITEMS = ConfigBuilder("spark.monitor.items") .internal() .doc("choose monitors to open, split with `,`") .stringConf .transform(_.toUpperCase) .toSequence .checkValue( _.toSet.subsetOf(MonitorItem.values.map(_.toString)), s"must be one of ${MonitorItem.values.map(_.toString)}") .createWithDefault(Seq.empty) } object MonitorItem extends Enumeration { type MonitorItem = Value val SQL_CHANGE_NOTIFIER = Value val APP_FINISH_NOTIFIER, EXECUTOR_NUM_NOTIFIER, DATASKEW_NOTIFIER, EXECUTOR_MEMORY_ADVISER = Value val SPARK_APPLICATION_SUMMARY, APP_IDLE_WARNER = Value }
Example 112
Source File: HierarchyBuilderSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.SparkConf import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.Row import org.apache.spark.sql.types._ import org.apache.spark.sql.types.Node import org.scalatest.FunSuite class HierarchyBuilderSuite extends FunSuite { val N = 5 val rowFunctions = HierarchyRowFunctions(Seq.fill(N)(StringType)) test("HierarchyRowFunctions.rowGet") { for (i <- 0 to 5) { val row = Row((0 to 5).map(_.toString): _*) assertResult(i.toString)(rowFunctions.rowGet(i)(row)) } } test("HierarchyRowFunctions.rowInit") { for (i <- 0 to 5) { val row = Row((0 to 5).map(_.toString): _*) val result = rowFunctions.rowInit(rowFunctions.rowGet(i), StringType)(row, None) val expected = Row(row.toSeq :+ Node(List(i.toString), StringType): _*) assertResult(expected)(result) } } // scalastyle:off magic.number test("HierarchyRowFunctions.rowInitWithOrder") { for (i <- 0 to 5) { val row = Row((0 to 5).map(_.toString): _*) val result = rowFunctions.rowInit(rowFunctions.rowGet(i), StringType)(row, Some(42L)) val expected = Row(row.toSeq :+ Node(List(i.toString),StringType, ordPath = List(42L)): _*) assertResult(expected)(result) } } // scalastyle:on magic.number test("HierarchyRowFunctions.rowModify") { for (i <- 0 to 5) { val rightRow = Row(0 to 5: _*) val leftRow = Row("foo", 0, "bar", Node(List(0),StringType)) val result = rowFunctions.rowModify( rowFunctions.rowGet(i),StringType )(leftRow, rightRow) val expected = Row((0 to 5) :+ Node(List(0, i), StringType): _*) assertResult(expected)(result) } } // scalastyle:off magic.number test("HierarchyRowFunctions.rowModifyAndOrder") { for (i <- 0 to 5) { val rightRow = Row(0 to 5: _*) val leftRow = Row("foo", 0, "bar", Node(List(0),StringType)) val result = rowFunctions.rowModifyAndOrder( rowFunctions.rowGet(i), StringType )(leftRow, rightRow, Some(42L)) val expected = Row((0 to 5) :+ Node(List(0, i), StringType, ordPath = List(42L)): _*) assertResult(expected)(result) } } // scalastyle:on magic.number test("HierarchyBuilder closure is serializable") { val closureSerializer = new JavaSerializer(new SparkConf(loadDefaults = false)).newInstance() val serialized = closureSerializer.serialize(() => HierarchyJoinBuilder(null, null, null, null, null, null)) } test("HierarchyRowFunctions closure is serializable") { val closureSerializer = new JavaSerializer(new SparkConf(loadDefaults = false)).newInstance() val serialized = closureSerializer.serialize(() => HierarchyRowJoinBuilder(null, null, null, null)) } }
Example 113
Source File: WithSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import com.sap.spark.util.TestUtils._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} trait WithSparkContext extends BeforeAndAfterAll { self: Suite => override def beforeAll(): Unit = { try { super.beforeAll() setUpSparkContext() } catch { case ex: Throwable => tearDownSparkContext() throw ex } } override def afterAll(): Unit = { try { super.afterAll() } finally { tearDownSparkContext() } } conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") conf.set("spark.shuffle.spill", "false") conf.set("spark.shuffle.compress", "false") conf.set("spark.ui.enabled", "false") conf.set("spark.ui.showConsoleProgress", "false") } def sc: SparkContext protected def setUpSparkContext(): Unit protected def tearDownSparkContext(): Unit }
Example 114
Source File: GlobalSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} } } object GlobalSparkContext { @transient private var _sc: SparkContext = _ def init(sparkMaster: String, sparkConf: SparkConf): Unit = { if (_sc == null) { this.synchronized { if (_sc == null) { _sc = new SparkContext(sparkMaster, "test", sparkConf) } } } } def reset(): Unit = { if (_sc != null) { _sc.cancelAllJobs() } } def close(): Unit = { if (_sc != null) { _sc.stop() _sc = null } } }
Example 115
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 116
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 117
Source File: LocalSparkContext.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package test.util import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.BeforeAndAfterEach import org.scalatest._ trait LocalSparkContext extends BeforeAndAfterEach { self: Suite => @transient private var _sc: SparkContext = _ val _sparkConf = new SparkConf(false) .set("spark.ui.showConsoleProgress", "false") def sc: SparkContext = _sc override def beforeEach() { _sc = new SparkContext("local[4]", "test", _sparkConf) super.beforeEach() } override def afterEach() { resetSparkContext() super.afterEach() } def resetSparkContext(): Unit = { LocalSparkContext.stop(_sc) _sc = null } } object LocalSparkContext { def stop(sc: SparkContext) { if (sc != null) { sc.stop() } // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") } def withSpark[T](sc: SparkContext)(f: SparkContext => T): T = { try { f(sc) } finally { stop(sc) } } }
Example 118
Source File: ClientConf.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi import org.apache.spark.SparkConf import scala.concurrent.duration._ def fromSparkConf(conf: SparkConf): ClientConf = { val token = conf.get("spark.powerbi.token.uri", TOKEN_URI_DEFAULT) val resource = conf.get("spark.powerbi.token.resource", TOKEN_RESOURCE_DEFAULT) val api = conf.get("spark.powerbi.uri", API_URI_DEFAULT) val username = sys.env.getOrElse(POWERBI_USERNAME, conf.get("spark.powerbi.username")) val password = sys.env.getOrElse(POWERBI_PASSWORD, conf.get("spark.powerbi.password")) val clientid = sys.env.getOrElse(POWERBI_CLIENTID, conf.get("spark.powerbi.clientid")) val timeout = Duration(conf.get("spark.powerbi.timeout", "30").toInt, SECONDS) val maxPartitions = conf.get("spark.powerbi.max_partitions", MAX_PARTITIONS.toString).toInt val batchSize = conf.get("spark.powerbi.batch_size", BATCH_SIZE.toString).toInt ClientConf(token, resource, api, username, password, clientid, timeout, maxPartitions, batchSize) } }
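ClientConf.fromSparkConf reads everything from spark.powerbi.* keys, falling back to the POWERBI_* environment variables for the credentials, so wiring it up looks roughly like the spark-shell style sketch below; the credential values are placeholders, not real settings.

import org.apache.spark.SparkConf
import com.granturing.spark.powerbi.ClientConf

val sparkConf = new SparkConf()
  .set("spark.powerbi.username", "user@example.com")                      // placeholder
  .set("spark.powerbi.password", "secret")                                // placeholder
  .set("spark.powerbi.clientid", "00000000-0000-0000-0000-000000000000")  // placeholder
  .set("spark.powerbi.batch_size", "500")

val clientConf = ClientConf.fromSparkConf(sparkConf)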
Example 119
Source File: ClientSuite.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterAll, Matchers, FunSuite} import scala.concurrent.Await class ClientSuite extends FunSuite with Matchers with BeforeAndAfterAll { val clientConf = ClientConf.fromSparkConf(new SparkConf()) val client = new Client(clientConf) val dataset = "PowerBI Spark Test" var datasetId: String = _ val group = sys.env.get("POWERBI_GROUP") var groupId: Option[String] = None val table = "People" val tableSchema = Table( table, Seq( Column("name", "string"), Column("age", "Int64"), Column("birthday", "Datetime"), Column("timestamp", "Datetime") )) override def beforeAll = { groupId = group match { case Some(grp) => { val grpOpt = Await.result(client.getGroups, clientConf.timeout).filter(g => grp.equals(g.name)).map(_.id).headOption grpOpt match { case Some(g) => Some(g) case None => sys.error(s"group $grp not found") } } case None => None } } test("client can list groups") { val groups = Await.result(client.getGroups, clientConf.timeout) groups should not be null } test("client can list datasets") { val ds = Await.result(client.getDatasets(groupId), clientConf.timeout) ds should not be null } }
Example 120
Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http import org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.TimestampType import org.apache.spark.SparkConf import org.apache.commons.io.IOUtils import org.apache.spark.serializer.KryoSerializer import java.io.InputStream import com.esotericsoftware.kryo.io.Input import java.io.ByteArrayOutputStream class WrongArgumentException(name: String, value: Any) extends RuntimeException(s"wrong argument: $name=$value") { } class MissingRequiredArgumentException(map: Map[String, String], paramName: String) extends RuntimeException(s"missing required argument: $paramName, all parameters=$map") { } class InvalidSerializerNameException(serializerName: String) extends RuntimeException(s"invalid serializer name: $serializerName") { } object SchemaUtils { def buildSchema(schema: StructType, includesTimestamp: Boolean, timestampColumnName: String = "_TIMESTAMP_"): StructType = { if (!includesTimestamp) schema; else StructType(schema.fields.toSeq :+ StructField(timestampColumnName, TimestampType, false)); } } object Params { def deserialize(bytes: Array[Byte]): Any = { val kryo = kryoSerializer.newKryo(); val input = new Input(); input.setBuffer(bytes); kryo.readClassAndObject(input); } }
Example 121
Source File: SerializerFactory.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http import java.nio.ByteBuffer import org.apache.spark.serializer.SerializerInstance import org.apache.spark.serializer.DeserializationStream import org.apache.spark.serializer.SerializationStream import java.io.OutputStream import java.io.InputStream import scala.reflect.ClassTag import com.fasterxml.jackson.databind.ObjectMapper import org.apache.spark.SparkConf import org.apache.spark.serializer.JavaSerializer import org.apache.spark.serializer.KryoSerializer object SerializerFactory { val DEFAULT = new SerializerFactory { override def getSerializerInstance(serializerName: String): SerializerInstance = { serializerName.toLowerCase() match { case "kryo" ⇒ new KryoSerializer(new SparkConf()).newInstance(); case "java" ⇒ new JavaSerializer(new SparkConf()).newInstance(); case _ ⇒ throw new InvalidSerializerNameException(serializerName); } } } } trait SerializerFactory { def getSerializerInstance(serializerName: String): SerializerInstance; }
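A short, spark-shell style sketch of how the factory above is typically used: obtain a SerializerInstance for "kryo" and round-trip a value through it (serialize/deserialize are standard Spark SerializerInstance methods).

import org.apache.spark.sql.execution.streaming.http.SerializerFactory

val serializer = SerializerFactory.DEFAULT.getSerializerInstance("kryo")
val bytes = serializer.serialize(Map("a" -> 1, "b" -> 2))        // returns a java.nio.ByteBuffer
val restored = serializer.deserialize[Map[String, Int]](bytes)
assert(restored("b") == 2)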
Example 122
Source File: UtilsTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import java.sql.Date import org.apache.spark.SparkConf import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.SparkSession import org.junit.Assert import org.junit.Test import java.io.ByteArrayOutputStream import java.io.InputStream import org.apache.commons.io.IOUtils import com.esotericsoftware.kryo.io.Input import org.apache.spark.sql.execution.streaming.http.KryoSerializerUtils class UtilsTest { @Test def testKryoSerDe() { val d1 = new Date(30000); val bytes = KryoSerializerUtils.serialize(d1); val d2 = KryoSerializerUtils.deserialize(bytes); Assert.assertEquals(d1, d2); val d3 = Map('x' -> Array("aaa", "bbb"), 'y' -> Array("ccc", "ddd")); println(d3); val bytes2 = KryoSerializerUtils.serialize(d3); val d4 = KryoSerializerUtils.deserialize(bytes2).asInstanceOf[Map[String, Any]]; println(d4); } @Test def testEncoderSchema() { val spark = SparkSession.builder.master("local[4]") .getOrCreate(); val sqlContext = spark.sqlContext; import sqlContext.implicits._ import org.apache.spark.sql.catalyst.encoders.encoderFor val schema1 = encoderFor[String].schema; val schema2 = encoderFor[(String)].schema; val schema3 = encoderFor[((String))].schema; Assert.assertEquals(schema1, schema2); Assert.assertEquals(schema1, schema3); } @Test def testDateInTuple() { val spark = SparkSession.builder.master("local[4]") .getOrCreate(); val sqlContext = spark.sqlContext; import sqlContext.implicits._ val d1 = new Date(30000); val ds = sqlContext.createDataset(Seq[(Int, Date)]((1, d1))); val d2 = ds.collect()(0)._2; //NOTE: d1!=d2, maybe a bug println(d1.equals(d2)); } }
Example 123
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.SparkConf import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.http.HttpStreamClient import org.junit.Assert import org.junit.Test import org.apache.spark.sql.types.LongType import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.types.BooleanType import org.apache.spark.sql.types.FloatType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.ByteType import org.apache.spark.sql.execution.streaming.http.HttpStreamServer import org.apache.spark.sql.execution.streaming.http.StreamPrinter import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException class HttpStreamServerClientTest { val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte), Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte), Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte)); val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world")); @Test def testHttpStreamIO() { //starts a http server val kryoSerializer = new KryoSerializer(new SparkConf()); val server = HttpStreamServer.start("/xxxx", 8080); val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]") .getOrCreate(); spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/"); val sqlContext = spark.sqlContext; import spark.implicits._ //add a local message buffer to server, with 2 topics registered server.withBuffer() .addListener(new StreamPrinter()) .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1") .createTopic[String]("topic-2"); val client = HttpStreamClient.connect("http://localhost:8080/xxxx"); //tests schema of topics val schema1 = client.fetchSchema("topic-1"); Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType), schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]); val schema2 = client.fetchSchema("topic-2"); Assert.assertArrayEquals(Array[Object](StringType), schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]); //prepare to consume messages val sid1 = client.subscribe("topic-1")._1; val sid2 = client.subscribe("topic-2")._1; //produces some data client.sendRows("topic-1", 1, ROWS1); val sid4 = client.subscribe("topic-1")._1; val sid5 = client.subscribe("topic-2")._1; client.sendRows("topic-2", 1, ROWS2); //consumes data val fetched = client.fetchStream(sid1).map(_.originalRow); Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]); //it is empty now Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]); client.unsubscribe(sid4); try { client.fetchStream(sid4); //exception should be thrown, because 
subscriber id is invalidated Assert.assertTrue(false); } catch { case e: Throwable ⇒ e.printStackTrace(); Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass); } server.stop(); } }
Example 124
Source File: HttpStreamDemo.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.http.HttpStreamServer import org.apache.spark.SparkConf import org.apache.spark.sql.execution.streaming.http.StreamPrinter import org.apache.spark.sql.execution.streaming.http.HttpStreamSourceProvider import org.apache.spark.sql.execution.streaming.http.HttpStreamSinkProvider object HttpStreamDemo { def printUsage() { println("USAGE:"); val name = this.getClass.getSimpleName; println(s"\t$name start-server-on 8080 /xxxx"); println(s"\t$name write-into http://localhost:8080/xxxx"); println(s"\t$name read-from http://localhost:8080/xxxx"); } def main(args: Array[String]) { if (args.length == 0) { printUsage(); } else { args(0) match { case "write-into" ⇒ runAsSink(args(1)); case "start-server-on" ⇒ runAsReceiver(args(2), args(1).toInt); case "read-from" ⇒ runAsSource(args(1)); case s: String ⇒ printUsage(); } } } def runAsSink(httpServletURL: String) { val spark = SparkSession.builder.appName("StructuredNetworkWordCount").master("local[4]") .getOrCreate(); println(s"reading from tcp://localhost:9999"); println(s"writing into $httpServletURL"); val sqlContext = spark.sqlContext; //tcp->HttpStreamSink val lines = spark.readStream. format("socket"). option("host", "localhost"). option("port", 9999). load(); spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/"); val query = lines.writeStream .format(classOf[HttpStreamSinkProvider].getName) .option("httpServletUrl", httpServletURL) .option("topic", "topic-1") .start(); query.awaitTermination(); } def runAsReceiver(servletPath: String, httpPort: Int) { val spark = SparkSession.builder.appName("StructuredNetworkWordCount").master("local[4]") .getOrCreate(); import spark.implicits._ //starts a http server with a buffer HttpStreamServer.start(servletPath, httpPort) .withBuffer() .addListener(new StreamPrinter()) .createTopic[String]("topic-1"); } def runAsSource(httpServletURL: String) { val spark = SparkSession.builder.appName("StructuredNetworkWordCount").master("local[4]") .getOrCreate(); spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/"); //HttpStreamSource->map->console //HttpStreamSource as a source stream val lines = spark.readStream.format(classOf[HttpStreamSourceProvider].getName) .option("httpServletUrl", httpServletURL) .option("topic", "topic-1").load(); import spark.implicits._ val words = lines.as[String].flatMap(_.split(" ")); val wordCounts = words.groupBy("value").count(); val query = wordCounts.writeStream. outputMode("complete"). format("console"). start(); query.awaitTermination(); } }
Example 125
Source File: Conf.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkConf import org.hammerlab.paths.Path object Conf { val propsLineRegex = """(\S+)\s+(.*)""".r def apply(loadDefaults: Boolean = true): SparkConf = { val envSparkPropertiesFiles = Option(System.getenv("SPARK_PROPERTIES_FILES")) .toList .flatMap(_.split(",")) .filterNot(_.isEmpty) val sparkProperties = envSparkPropertiesFiles .flatMap { path ⇒ Path(path) .lines .filter(_.trim.nonEmpty) .map { case propsLineRegex(key, value) ⇒ key → value case line ⇒ throw new IllegalArgumentException( s"Invalid property line in $path: '$line'" ) } } val sparkConf = new SparkConf() for { (k, v) ← sparkProperties } { sparkConf.set(k, v) } sparkConf } }
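Conf() merges whitespace-separated key/value lines from every file listed in the SPARK_PROPERTIES_FILES environment variable into a fresh SparkConf. A sketch of the expected file format and call site follows; the file path is illustrative.

// /tmp/spark.props (key and value separated by whitespace, matching propsLineRegex above):
//   spark.master          local[4]
//   spark.executor.memory 2g
//
// With `export SPARK_PROPERTIES_FILES=/tmp/spark.props` set before launching:
import org.hammerlab.spark.Conf

val conf = Conf()
println(conf.get("spark.executor.memory", "<unset>"))   // prints "2g" when the file above is picked up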
Example 126
Source File: SparkConfBase.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkConf import scala.collection.mutable trait SparkConfBase { private val _sparkConfs = mutable.Map[String, String]() protected def sparkConfs: Map[String, String] = _sparkConfs.toMap protected def makeSparkConf: SparkConf = { val sparkConf = new SparkConf() for { (k, v) ← _sparkConfs } { sparkConf.setIfMissing(k, v) } sparkConf } protected def sparkConf(confs: (String, String)*): Unit = for { (k, v) ← confs } { _sparkConfs(k) = v } }
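The trait is meant to be mixed in so that defaults registered through sparkConf(...) are applied with setIfMissing, i.e. they never override values supplied at submit time. A minimal sketch (names illustrative):

import org.apache.spark.SparkContext
import org.hammerlab.spark.SparkConfBase

object SparkConfBaseSketch extends SparkConfBase {
  sparkConf(
    "spark.master"   -> "local[*]",
    "spark.app.name" -> "sparkconfbase-sketch"
  )

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(makeSparkConf)   // the defaults above are applied via setIfMissing
    println(sc.parallelize(1 to 10).sum())
    sc.stop()
  }
}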
Example 127
Source File: Context.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.{ SparkConf, SparkContext } import org.hammerlab.hadoop.Configuration case class Context(@transient sc: SparkContext) extends Configuration(sc.hadoopConfiguration) object Context { implicit def makeContext(sc: SparkContext): Context = Context(sc) implicit def deriveContext(implicit sc: SparkContext): Context = Context(sc) implicit def umakeContext(context: Context): SparkContext = context.sc def apply()(implicit conf: SparkConf): Context = Context( new SparkContext( conf ) ) }
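Context bundles a SparkContext with its Hadoop Configuration and can be built from an implicit SparkConf through the companion apply(); a short sketch, with the master and app name chosen only for illustration:

import org.apache.spark.SparkConf
import org.hammerlab.spark.Context

object ContextSketch {
  def main(args: Array[String]): Unit = {
    implicit val sparkConf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("context-sketch")

    val ctx = Context()                        // builds a new SparkContext from the implicit conf
    println(ctx.sc.parallelize(1 to 3).count())
    ctx.sc.stop()
  }
}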
Example 129
Source File: Sessionize.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter06 import java.io._ import java.time.ZoneOffset import java.time.LocalDateTime import java.time.format.DateTimeFormatter import org.apache.spark.{SparkConf,SparkContext} import org.apache.spark.storage.StorageLevel object Sessionize extends App { val sc = new SparkContext("local[8]", "Sessionize", new SparkConf()) val checkoutPattern = ".*>checkout.*".r.pattern // a basic page view structure case class PageView(ts: String, path: String) extends Serializable with Ordered[PageView] { override def toString: String = { s"($ts #$path)" } def compare(other: PageView) = ts compare other.ts } // represent a session case class Session[A <: PageView](id: String, visits: Seq[A]) extends Serializable { override def toString: String = { val vsts = visits.mkString("[", ",", "]") s"($id -> $vsts)" } } def toEpochSeconds(str: String) = { LocalDateTime.parse(str, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")).toEpochSecond(ZoneOffset.UTC) } val sessions = sc.textFile("data/clickstream") .map(line => {val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20)))}) .groupByKey.map(x => { new Session(x._1, x._2.toSeq.sorted) } ) .cache // sessions.take(100).foreach(println) def findAllCheckoutSessions(s: Session[PageView]) = { s.visits.tails.filter { _ match { case PageView(ts1, "mycompanycom>homepage") :: PageView(ts2, page) :: tail if (page != "mycompanycom>homepage" ) => true; case _ => false } } .foldLeft(Seq[Session[PageView]]()) { case (r, x) => { x.find(y => checkoutPattern.matcher(y.path).matches) match { case Some(checkout) if (toEpochSeconds(checkout.ts) > toEpochSeconds(x.head.ts) + 60) => r.:+(new Session(s.id, x.slice(0, x.indexOf(checkout)))) case _ => r } } } } val prodLandingSessions = sessions.flatMap(findAllCheckoutSessions) prodLandingSessions.collect.foreach(println) sc.stop() }
Example 130
Source File: FlumeWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.flume._

object FlumeWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/flume_check")
    val hostPort = args(0).split(":")
    System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]")
    val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY)
    val words = lines
      .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 131
Source File: KafkaWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._

object KafkaWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/kafka_check")
    System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example")
    val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY)
    val words = lines
      .flatMap(_._2.toLowerCase.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 132
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 133
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
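The combineByKey call above is the standard (sum, count) accumulator trick for a per-key mean: create (v, 1), add each value into the pair, merge pairs, then divide. The same pattern on a plain pair RDD, outside the streaming context, for clarity (AverageByKeySketch is an illustrative name, not from the book).

// Minimal sketch of per-key averaging with combineByKey on a regular RDD.
import org.apache.spark.{SparkConf, SparkContext}

object AverageByKeySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("avg-by-key"))
    val ratings = sc.parallelize(Seq("free" -> 4, "free" -> 2, "paid" -> 5))
    val means = ratings
      .combineByKey(
        (v: Int) => (v, 1),                                            // create (sum, count) accumulator
        (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),         // fold a value into the accumulator
        (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2))  // merge accumulators across partitions
      .mapValues { case (sum, count) => sum / count.toFloat }
    means.collect().foreach(println)   // e.g. (free,3.0), (paid,5.0)
    sc.stop()
  }
}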
Example 134
Source File: L5-7MultipleSocketStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearMultiApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: TripByYearMultiApp <appname> <hostname> <base_port> <num_of_sockets>") System.exit(1) } val Seq(appName, hostname, basePort, nSockets) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) val uniStream = ssc.union(streams) uniStream .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 135
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.mqtt.MQTTUtils object YearlyDistributionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>") System.exit(1) } val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => rec.split(",")) .map(rec => (rec(1).split(" ")(0), 1)) .updateStateByKey(statefulCount) .map(pair => (pair._2, pair._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("YearlyDistribution") ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
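updateStateByKey keeps a running total per key across batches, which is why the checkpoint directory is set above. The statefulCount function itself is pure, so its behaviour can be checked without any cluster; the sketch below (StatefulCountCheck, a made-up name) walks the three interesting cases.

// Exercising the state-update function from the example in isolation:
// counts from the current batch are added to whatever state was carried over.
object StatefulCountCheck {
  val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0))

  def main(args: Array[String]): Unit = {
    println(statefulCount(Seq(1, 1, 1), None))      // Some(3)  -- first batch for this key
    println(statefulCount(Seq(2, 5), Some(3)))      // Some(10) -- later batch, prior state 3
    println(statefulCount(Seq(), Some(10)))         // Some(10) -- key absent from this batch
  }
}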
Example 136
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 137
Source File: L5-6SocketStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: TripByYearApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 138
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 139
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 140
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 141
Source File: L5-18Http.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HttpApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: HttpApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) .flatMap(rec => (parse(rec) \ "stationBeanList").children) .filter(rec => { implicit val formats = DefaultFormats (rec \ "statusKey").extract[Integer] != 1 }) .map(rec => rec.filterField { case JField("id", _) => true case JField("stationName", _) => true case JField("statusValue", _) => true case _ => false }) .map(rec => { implicit val formats = DefaultFormats (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) }) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 142
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 143
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 144
Source File: L7-4UI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object SocialSearchApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: SocialSearchApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "/tmp/historical") val countSearch = new AtomicLong(0) val countSocial = new AtomicLong(0) val ssc = new StreamingContext(conf, Seconds(1)) val titleStream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .filter(_(3) match { case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => (rec(3), rec(4))) .cache() val searchStream = titleStream.filter(_._1 match { case "other-google" | "other-bing" | "other-yahoo" => true case _ => false }) .map(rec => rec._2) val socialStream = titleStream.filter(_._1 match { case "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => rec._2) val exclusiveSearch = searchStream.transformWith(socialStream, (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) .foreachRDD(rdd => { countSearch.addAndGet(rdd.count()) println("Exclusive count search engines: " + countSearch) }) val exclusiveSocial = socialStream.transformWith(searchStream, (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) .foreachRDD(rdd => { countSocial.addAndGet(rdd.count()) println("Exclusive count social media: " + countSocial) }) ssc.start() ssc.awaitTermination() } }
Example 145
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 146
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerAppKryo { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[ProtonFlux])) val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val projected = voyager1.map(rec => { val attrs = rec.split("\\s+") new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28)) }) val filtered = projected.filter(pflux => pflux.isSolarStorm) val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false)) yearlyBreakdown.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 147
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
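Examples 147 onward all share the seqToCdr helper, which first pads empty tab-separated fields with "0" and then parses the row positionally into the Cdr case class. The field-cleaning step in isolation, on a made-up record (CdrFieldCleaning is an illustrative name):

// The empty-field handling inside seqToCdr, shown on one raw record.
object CdrFieldCleaning {
  def main(args: Array[String]): Unit = {
    val raw = "5\t1404435600000\t39\t\t0.5\t\t0.2\t1.7".split("\\t", -1)  // note the empty columns
    val cleaned = raw.map {
      case x if x.isEmpty() => "0"   // missing activity counters become "0"
      case x                => x
    }
    println(cleaned.mkString(","))   // 5,1404435600000,39,0,0.5,0,0.2,1.7
    // cleaned can then be parsed positionally into the Cdr case class, as seqToCdr does.
  }
}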
Example 148
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 149
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 150
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 151
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.desc import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr import org.json4s.DefaultFormats object CdrDataframeExamplesActionsApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) counts.show(5) counts.show() println("head(5): " + counts.head(5)) println("take(5): " + counts.take(5)) println("head(): " + counts.head()) println("first(5): " + counts.first()) println("count(): " + counts.count()) println("collect(): " + counts.collect()) println("collectAsList(): " + counts.collectAsList()) println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) counts.write.format("parquet").save("/tmp/parquent" + rdd.id) counts.write.format("json").save("/tmp/json" + rdd.id) counts.write.parquet("/tmp/parquent2" + rdd.id) counts.write.json("/tmp/json2" + rdd.id) counts.write.saveAsTable("count_table") cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") val prop: java.util.Properties = new java.util.Properties() counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 152
Source File: L8-10-11UDF.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.io.Source import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.jackson.JsonMethods.parse import org.json4s.jvalue2extractable import org.json4s.string2JsonInput object CdrUDFApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrUDFApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ def getCountryCodeMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) } def getCountryNameMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] } def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") } val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
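The example above registers a partially applied Scala function as a SQL UDF and then calls it from a query. A smaller sketch of the same Spark 1.x registration pattern, with the country lookup baked in rather than fetched from country.io (UdfSketch and its tiny mapping are illustrative only):

// Minimal sketch of SQLContext UDF registration, mirroring the pattern above.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object UdfSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("udf-sketch"))
    val sqlC = new SQLContext(sc)
    import sqlC.implicits._

    val codeToName = Map(1 -> "United States", 39 -> "Italy")   // stand-in for the country.io data
    sqlC.udf.register("countryName", (code: Int) => codeToName.getOrElse(code, "NotFound"))

    sc.parallelize(Seq((100, 39), (101, 1), (102, 7))).toDF("squareId", "countryCode")
      .registerTempTable("cdrs")
    sqlC.sql("SELECT squareId, countryName(countryCode) AS countryName FROM cdrs").show()
    sc.stop()
  }
}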
Example 153
Source File: L8-4DataFrameCreationSchema.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataframeCreationApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeApp2 <appname> <batchInterval> <hostname> <port> <schemaPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) val schemaJson = scala.io.Source.fromFile(schemaFile).mkString val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } }
Example 154
Source File: L8-14-27DataFrameExamples.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamplesApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.select("squareId", "timeInterval", "countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").show() cdrs.filter("squareId = 5").show() cdrs.drop("countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() cdrs.limit(5).show() cdrs.groupBy("squareId").count().show() cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() cdrs.groupBy("countryCode").max("callOutActivity").show() cdrs.groupBy("countryCode").min("callOutActivity").show() cdrs.groupBy("squareId").sum("internetTrafficActivity").show() cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() cdrs.sample(true, 0.01).show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 155
Source File: L8-28DataFrameExamplesOps.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamples2App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamples2App <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ var previousCdrs: Option[DataFrame] = None val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() previousCdrs match { case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() //case Some(prevCdrs) => cdrs.except(prevCdrs).show() case None => Unit } previousCdrs = Some(cdrs) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 156
Source File: T8-3DataFrameExamplesNA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamplesNAApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesNAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.na.drop("any").show() cdrs.na.fill(0, Array("squareId")).show() cdrs.na.replace("squareId", Map(0 -> 1)).show() println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) cdrs.stat.crosstab("squareId", "countryCode").show() cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() cdrs.stat.crosstab("callOutActivity", "callInActivity").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 157
Source File: L8-8Sql.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrSqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrSqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() sqlC.dropTempTable("cdrs") }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 158
Source File: L8-35DataFrameExamplesRDD.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats

object CdrDataframeExamplesRDDApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: CdrDataframeExamplesRDDApp <appname> <batchInterval> <hostname> <port> <schemaPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val schemaJson = scala.io.Source.fromFile(schemaFile).mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema)
        val highOther = cdrs.except(highInternet)
        val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates()
        val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates()
        highOtherGrid.except(highInternetGrid).show()
        highInternetGrid.except(highOtherGrid).show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 159
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 160
Source File: L6-6PerRecord.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppB { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreach { rec => { val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) client.disconnect() client.close() } } } ssc.start() ssc.awaitTermination() } }
Example 161
Source File: L6-12StaticPool.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppF { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } class MqttSinkLazy(brokerUrl: String) extends Serializable { lazy val client = { val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } client } } object MqttSinkLazy { val brokerUrl = "tcp://localhost:1883" val client = new MqttSinkLazy(brokerUrl) def apply(brokerUrl: String): MqttSinkLazy = { client } }
Example 162
Source File: L6-8Static.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppD { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => MqttSink().publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } object MqttSink { val brokerUrl = "tcp://localhost:1883" val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } def apply(): MqttClient = { client } }
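The three MQTT sinks above (Examples 160-162) trade connection cost against sharing: a client per record, a lazily initialised broadcast client, and a static per-JVM client. A client per partition is a common middle ground; the sketch below illustrates it with the same Paho API, and the object and method names are invented here.

import java.nio.charset.StandardCharsets
import org.apache.spark.rdd.RDD
import org.eclipse.paho.client.mqttv3.{MqttClient, MqttMessage}
import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence

// Illustrative sketch only: one MQTT client per partition instead of per record or per JVM.
object MqttPerPartitionSink {
  def publish(rdd: RDD[String], brokerUrl: String, topic: String): Unit =
    rdd.foreachPartition { partition =>
      val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence())
      client.connect()
      try {
        partition.foreach { msg =>
          client.publish(topic, new MqttMessage(msg.getBytes(StandardCharsets.UTF_8)))
        }
      } finally {
        client.disconnect()
        client.close()
      }
    }
}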
Example 163
Source File: L6-18Cassandra.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import java.nio.ByteBuffer import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.thrift.ColumnOrSuperColumn import org.apache.cassandra.thrift.Column import org.apache.cassandra.utils.ByteBufferUtil import org.apache.cassandra.thrift.Mutation import java.util.Arrays object CassandraSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val jobConf = new Configuration() ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") rdd.map(rec => { val c = new Column() c.setName(ByteBufferUtil.bytes(columnName)) c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) c.setTimestamp(System.currentTimeMillis) val m = new Mutation() m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) m.column_or_supercolumn.setColumn(c) (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 164
Source File: L6-20CassandraConnector.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import com.datastax.spark.connector.SomeColumns import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming.toDStreamFunctions import com.datastax.spark.connector.toNamedColumnRef object CassandraConnectorSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.cassandra.connection.host", cassandraHost) .set("spark.cassandra.connection.port", cassandraPort) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) CassandraConnector(conf).withSessionDo { session => session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) } HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) .saveToCassandra(keyspace, tableName) ssc.start() ssc.awaitTermination() } }
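Once the streaming job above has written rows, the same connector can read them back as an RDD. The sketch below is illustrative only; the keyspace, table name and host are placeholders for the values used above.

import org.apache.spark.{SparkConf, SparkContext}
import com.datastax.spark.connector._ // adds cassandraTable to SparkContext

// Illustrative sketch only: read back the rows written by the streaming job above.
object CassandraReadBack {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("CassandraReadBack")
      .set("spark.cassandra.connection.host", "localhost") // placeholder host
    val sc = new SparkContext(conf)
    val rows = sc.cassandraTable("my_keyspace", "my_table") // substitute the keyspace/table used above
    rows.take(10).foreach(println)
    sc.stop()
  }
}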
Example 165
Source File: DirectKafkaWordCount.scala From spark-secure-kafka-app with Apache License 2.0 | 5 votes |
package com.cloudera.spark.examples import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.SparkConf import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.kafka010.{ConsumerStrategies, LocationStrategies, KafkaUtils} import org.apache.spark.streaming._ object DirectKafkaWordCount { def main(args: Array[String]) { if (args.length != 3) { System.err.println(s""" |Usage: DirectKafkaWordCount <brokers> <topics> <ssl> | <brokers> is a list of one or more Kafka brokers | <topics> is a list of one or more kafka topics to consume from | <ssl> true if using SSL, false otherwise. | """.stripMargin) System.exit(1) } val Array(brokers, topics, ssl) = args // Create context with 2 second batch interval val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) val isUsingSsl = ssl.toBoolean // Create direct kafka stream with brokers and topics val topicsSet = topics.split(",").toSet val commonParams = Map[String, Object]( "bootstrap.servers" -> brokers, "security.protocol" -> (if (isUsingSsl) "SASL_SSL" else "SASL_PLAINTEXT"), "sasl.kerberos.service.name" -> "kafka", "auto.offset.reset" -> "earliest", "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer", "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer", "group.id" -> "default", "enable.auto.commit" -> (false: java.lang.Boolean) ) val additionalSslParams = if (isUsingSsl) { Map( "ssl.truststore.location" -> "/etc/cdep-ssl-conf/CA_STANDARD/truststore.jks", "ssl.truststore.password" -> "cloudera" ) } else { Map.empty } val kafkaParams = commonParams ++ additionalSslParams val messages: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String]( ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams) ) // Get the lines, split them into words, count the words and print val lines = messages.map(_.value()) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) wordCounts.print() // Start the computation ssc.start() ssc.awaitTermination() } }
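Note that the stream above sets enable.auto.commit to false but never commits offsets itself, so the consumer group restarts from auto.offset.reset on every run. A common follow-up, sketched below as an assumption rather than as part of this example, is to commit the processed offsets back to Kafka after each batch using the kafka-0-10 integration classes; the object and method names are invented.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

// Illustrative sketch only: commit the offsets of each micro-batch once it has been handled.
object OffsetCommitSketch {
  def commitAfterEachBatch(stream: InputDStream[ConsumerRecord[String, String]]): Unit =
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // ... process the batch here (e.g. the word count above) ...
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
}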
Example 166
Source File: SparkLensTest.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence import org.apache.spark.SparkConf import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StringType import io.univalence.SparkLens._ import org.scalatest.FunSuite case class Toto(name: String, age: Int) case class Tata(toto: Toto) class SparkLensTest extends FunSuite { val conf: SparkConf = new SparkConf() conf.setAppName("yo") conf.setMaster("local[*]") implicit val ss: SparkSession = SparkSession.builder.config(conf).getOrCreate import ss.implicits._ test("testLensRegExp change string") { assert(lensRegExp(ss.createDataFrame(Seq(Toto("a", 1))))({ case ("name", StringType) => true case _ => false }, { case (a: String, d) => a.toUpperCase }).as[Toto].first() == Toto("A", 1)) } test("change Int") { assert(lensRegExp(ss.createDataFrame(Seq(Tata(Toto("a", 1)))))({ case ("toto/age", _) => true case _ => false }, { case (a: Int, d) => a + 1 }).as[Tata].first() == Tata(Toto("a", 2))) } ignore("null to nil") { val df: DataFrame = ss.read.parquet("/home/phong/daily_gpp_20180705") val yoho: DataFrame = lensRegExp(df)({ case (_, ArrayType(_, _)) => true case _ => false }, (a, b) => if (a == null) Nil else a) } }
Example 167
Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType} import org.apache.spark.{SparkContext, SparkConf} import scala.collection.mutable import scala.util.Random object ConfigurableDataGeneratorMain { def main(args: Array[String]): Unit = { if (args.length == 0) { println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>") return } val outputPath = args(0) val numberOfColumns = args(1).toInt val numberOfRecords = args(2).toInt val numberOfPartitions = args(3).toInt val runLocal = (args.length == 5 && args(4).equals("L")) var sc: SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "test", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain") sc = new SparkContext(sparkConfig) } val sqlContext = new org.apache.spark.sql.SQLContext(sc) //Part A val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions) //Part B val megaDataRDD = rowRDD.flatMap( r => { val random = new Random() val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator dataRange.map[Row]( x => { val values = new mutable.ArrayBuffer[Any] for (i <- 0 until numberOfColumns) { if (i % 2 == 0) { values.+=(random.nextInt(100).toLong) } else { values.+=(random.nextInt(100).toString) } } new GenericRow(values.toArray) }) }) //Part C val schema = StructType( (0 until numberOfColumns).map( i => { if (i % 2 == 0) { StructField("longColumn_" + i, LongType, true) } else { StructField("stringColumn_" + i, StringType, true) } }) ) val df = sqlContext.createDataFrame(megaDataRDD, schema) df.saveAsParquetFile(outputPath) //Part D sc.stop() } }
Example 168
Source File: SimpleDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats import org.apache.spark.sql.Row import org.apache.spark.sql.types._ import org.apache.spark.{SparkContext, SparkConf} object SimpleDataGeneratorMain { def main(args: Array[String]): Unit = { if (args.length == 0) { println("SimpleDataGeneratorMain <outputPath> ") return } val outputPath = args(0) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") var sc = new SparkContext("local", "test", sparkConfig) val sqlContext = new org.apache.spark.sql.SQLContext(sc) val schema = StructType( Array( StructField("id", LongType, true), StructField("name", StringType, true), StructField("age", LongType, true), StructField("gender", StringType, true), StructField("height", LongType, true), StructField("job_title", StringType, true) ) ) val rowRDD = sc.parallelize(Array( Row(1l, "Name.1", 20l, "M", 6l, "dad"), Row(2l, "Name.2", 20l, "F", 5l, "mom"), Row(3l, "Name.3", 20l, "F", 5l, "mom"), Row(4l, "Name.4", 20l, "F", 5l, "mom"), Row(5l, "Name.5", 10l, "M", 4l, "kid"), Row(6l, "Name.6", 8l, "M", 3l, "kid"))) val df = sqlContext.createDataFrame(rowRDD, schema) println("columns:") df.columns.foreach( c => println(" - " + c)) df.saveAsParquetFile(outputPath) sc.stop() } }
Example 169
Source File: TestTableStatsSinglePathMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType} import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll} class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ test("run table stats on sample data") { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") var sc = new SparkContext("local", "test", sparkConfig) try { val sqlContext = new org.apache.spark.sql.SQLContext(sc) val schema = StructType( Array( StructField("id", LongType, true), StructField("name", StringType, true), StructField("age", LongType, true), StructField("gender", StringType, true), StructField("height", LongType, true), StructField("job_title", StringType, true) ) ) val rowRDD = sc.parallelize(Array( Row(1l, "Name.1", 20l, "M", 6l, "dad"), Row(2l, "Name.2", 20l, "F", 5l, "mom"), Row(3l, "Name.3", 20l, "F", 5l, "mom"), Row(4l, "Name.4", 20l, "M", 5l, "mom"), Row(5l, "Name.5", 10l, "M", 4l, "kid"), Row(6l, "Name.6", 8l, "M", 3l, "kid"))) val df = sqlContext.createDataFrame(rowRDD, schema) val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df) assertResult(6l)(firstPassStats.columnStatsMap(0).maxLong) assertResult(1l)(firstPassStats.columnStatsMap(0).minLong) assertResult(21l)(firstPassStats.columnStatsMap(0).sumLong) assertResult(3l)(firstPassStats.columnStatsMap(0).avgLong) assertResult(2)(firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.length) firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.foreach { r => if (r._1.equals("M")) { assertResult(4l)(r._2) } else if (r._1.equals("F")) { assertResult(2l)(r._2) } else { throw new RuntimeException("Unknown gender: " + r._1) } } } finally { sc.stop() } } }
Example 170
Source File: SynthBenchmark.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx import java.io.{FileOutputStream, PrintWriter} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.graphx.{GraphXUtils, PartitionStrategy} import org.apache.spark.graphx.util.GraphGenerators def main(args: Array[String]) { val options = args.map { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) case _ => throw new IllegalArgumentException("Invalid argument: " + arg) } } var app = "pagerank" var niter = 10 var numVertices = 100000 var numEPart: Option[Int] = None var partitionStrategy: Option[PartitionStrategy] = None var mu: Double = 4.0 var sigma: Double = 1.3 var degFile: String = "" var seed: Int = -1 options.foreach { case ("app", v) => app = v case ("niters", v) => niter = v.toInt case ("nverts", v) => numVertices = v.toInt case ("numEPart", v) => numEPart = Some(v.toInt) case ("partStrategy", v) => partitionStrategy = Some(PartitionStrategy.fromString(v)) case ("mu", v) => mu = v.toDouble case ("sigma", v) => sigma = v.toDouble case ("degFile", v) => degFile = v case ("seed", v) => seed = v.toInt case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) } val conf = new SparkConf() .setAppName(s"GraphX Synth Benchmark (nverts = $numVertices, app = $app)") GraphXUtils.registerKryoClasses(conf) val sc = new SparkContext(conf) // Create the graph println(s"Creating graph...") val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices, numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed) // Repartition the graph val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)).cache() var startTime = System.currentTimeMillis() val numEdges = graph.edges.count() println(s"Done creating graph. Num Vertices = $numVertices, Num Edges = $numEdges") val loadTime = System.currentTimeMillis() - startTime // Collect the degree distribution (if desired) if (!degFile.isEmpty) { val fos = new FileOutputStream(degFile) val pos = new PrintWriter(fos) val hist = graph.vertices.leftJoin(graph.degrees)((id, _, optDeg) => optDeg.getOrElse(0)) .map(p => p._2).countByValue() hist.foreach { case (deg, count) => pos.println(s"$deg \t $count") } } // Run PageRank startTime = System.currentTimeMillis() if (app == "pagerank") { println("Running PageRank") val totalPR = graph.staticPageRank(niter).vertices.map(_._2).sum() println(s"Total PageRank = $totalPR") } else if (app == "cc") { println("Running Connected Components") val numComponents = graph.connectedComponents.vertices.map(_._2).distinct().count() println(s"Number of components = $numComponents") } val runTime = System.currentTimeMillis() - startTime println(s"Num Vertices = $numVertices") println(s"Num Edges = $numEdges") println(s"Creation time = ${loadTime/1000.0} seconds") println(s"Run time = ${runTime/1000.0} seconds") sc.stop() } } // scalastyle:on println
Example 171
Source File: NormalizerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.Normalizer import org.apache.spark.mllib.util.MLUtils // $example off$ object NormalizerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NormalizerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val normalizer1 = new Normalizer() val normalizer2 = new Normalizer(p = Double.PositiveInfinity) // Each sample in data1 will be normalized using $L^2$ norm. val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) // Each sample in data2 will be normalized using $L^\infty$ norm. val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
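To make the two norms above concrete, here is a small local sketch (not part of the example) applying both normalizers to a single dense vector; the vector values are arbitrary.

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

// Illustrative sketch only: what the L^2 and L^inf normalizers do to one vector.
object NormalizerLocalCheck {
  def main(args: Array[String]): Unit = {
    val v = Vectors.dense(1.0, 2.0, 2.0)
    val l2 = new Normalizer()                          // default p = 2
    val linf = new Normalizer(p = Double.PositiveInfinity)
    println(l2.transform(v))   // [1/3, 2/3, 2/3]: divided by the L2 norm, 3.0
    println(linf.transform(v)) // [0.5, 1.0, 1.0]: divided by the max absolute value, 2.0
  }
}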
Example 172
Source File: PCAOnSourceVectorExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 173
Source File: PCAOnRowMatrixExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 174
Source File: NaiveBayesExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.util.MLUtils // $example off$ object NaiveBayesExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val Array(training, test) = data.randomSplit(Array(0.6, 0.4)) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ } } // scalastyle:on println
Example 175
Source File: TallSkinnyPCA.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 176
Source File: GaussianMixtureExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
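A short follow-up sketch, not in the example, showing how the fitted mixture model above can assign points to clusters, both hard and soft; the helper object and method names are invented.

import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Illustrative sketch only: hard and soft cluster assignments from a fitted GMM.
object GmmAssignments {
  def assign(gmm: GaussianMixtureModel, points: RDD[Vector]): Unit = {
    val hard = gmm.predict(points)     // RDD[Int]: index of the most likely Gaussian
    val soft = gmm.predictSoft(points) // RDD[Array[Double]]: membership probability per Gaussian
    hard.take(5).foreach(k => println(s"cluster $k"))
    soft.take(5).foreach(w => println(w.mkString(", ")))
  }
}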
Example 177
Source File: Word2VecExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} // $example off$ object Word2VecExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("Word2VecExample") val sc = new SparkContext(conf) // $example on$ val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) val word2vec = new Word2Vec() val model = word2vec.fit(input) val synonyms = model.findSynonyms("1", 5) for((synonym, cosineSimilarity) <- synonyms) { println(s"$synonym $cosineSimilarity") } // Save and load model model.save(sc, "myModelPath") val sameModel = Word2VecModel.load(sc, "myModelPath") // $example off$ sc.stop() } } // scalastyle:on println
Example 178
Source File: Correlations.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
Example 179
Source File: FPGrowthExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.fpm.FPGrowth object FPGrowthExample { case class Params( input: String = null, minSupport: Double = 0.3, numPartition: Int = -1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("FPGrowthExample") { head("FPGrowth: an example FP-growth app.") opt[Double]("minSupport") .text(s"minimal support level, default: ${defaultParams.minSupport}") .action((x, c) => c.copy(minSupport = x)) opt[Int]("numPartition") .text(s"number of partition, default: ${defaultParams.numPartition}") .action((x, c) => c.copy(numPartition = x)) arg[String]("<input>") .text("input paths to input data set, whose file format is that each line " + "contains a transaction with each item in String and separated by a space") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"FPGrowthExample with $params") val sc = new SparkContext(conf) val transactions = sc.textFile(params.input).map(_.split(" ")).cache() println(s"Number of transactions: ${transactions.count()}") val model = new FPGrowth() .setMinSupport(params.minSupport) .setNumPartitions(params.numPartition) .run(transactions) println(s"Number of frequent itemsets: ${model.freqItemsets.count()}") model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } sc.stop() } } // scalastyle:on println
Example 180
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } } // scalastyle:on println
Example 181
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --algorithm LR --regType L2 --regParam 1.0 \ | data/mllib/sample_binary_classification_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"BinaryClassification with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val model = params.algorithm match { case LR => val algorithm = new LogisticRegressionWithLBFGS() algorithm.optimizer .setNumIterations(params.numIterations) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() case SVM => val algorithm = new SVMWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() } val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val metrics = new BinaryClassificationMetrics(predictionAndLabel) println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") sc.stop() } } // scalastyle:on println
Example 182
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 183
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
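Since the example above is flagged as deprecated in favour of ml.feature.PCA, here is a hedged sketch of the DataFrame-based equivalent; the sample vectors, column names and k value are assumptions, not taken from the example.

import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

// Illustrative sketch only: the DataFrame-based PCA transformer from spark.ml.
object MlPcaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("MlPcaSketch").getOrCreate()
    val df = spark.createDataFrame(Seq(
      Tuple1(Vectors.dense(1.0, 0.0, 7.0)),
      Tuple1(Vectors.dense(2.0, 1.0, 3.0)),
      Tuple1(Vectors.dense(4.0, 10.0, 5.0))
    )).toDF("features")
    val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(2).fit(df)
    pca.transform(df).select("pcaFeatures").show(false)
    spark.stop()
  }
}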
Example 184
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD // $example off$ object HypothesisTestingKolmogorovSmirnovTestExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data // run a KS test for the sample versus a standard normal distribution val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) // summary of the test including the p-value, test statistic, and null hypothesis if our p-value // indicates significance, we can reject the null hypothesis. println(testResult) println() // perform a KS test using a cumulative distribution function of our making val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) println(testResult2) // $example off$ sc.stop() } } // scalastyle:on println
Example 185
Source File: RandomRDDGeneration.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.rdd.RDD object RandomRDDGeneration { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"RandomRDDGeneration") val sc = new SparkContext(conf) val numExamples = 10000 // number of examples to generate val fraction = 0.1 // fraction of data to sample // Example: RandomRDDs.normalRDD val normalRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples) println(s"Generated RDD of ${normalRDD.count()}" + " examples sampled from the standard normal distribution") println(" First 5 samples:") normalRDD.take(5).foreach( x => println(s" $x") ) // Example: RandomRDDs.normalVectorRDD val normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2) println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.") println(" First 5 samples:") normalVectorRDD.take(5).foreach( x => println(s" $x") ) println() sc.stop() } } // scalastyle:on println
Example 186
Source File: SimpleFPGrowth.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.fpm.FPGrowth import org.apache.spark.rdd.RDD // $example off$ object SimpleFPGrowth { def main(args: Array[String]) { val conf = new SparkConf().setAppName("SimpleFPGrowth") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/sample_fpgrowth.txt") val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' ')) val fpg = new FPGrowth() .setMinSupport(0.2) .setNumPartitions(10) val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } val minConfidence = 0.8 model.generateAssociationRules(minConfidence).collect().foreach { rule => println( rule.antecedent.mkString("[", ",", "]") + " => " + rule.consequent .mkString("[", ",", "]") + ", " + rule.confidence) } // $example off$ } } // scalastyle:on println
Example 187
Source File: KernelDensityEstimationExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.KernelDensity import org.apache.spark.rdd.RDD // $example off$ object KernelDensityEstimationExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("KernelDensityEstimationExample") val sc = new SparkContext(conf) // $example on$ // an RDD of sample data val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9)) // Construct the density estimator with the sample data and a standard deviation // for the Gaussian kernels val kd = new KernelDensity() .setSample(data) .setBandwidth(3.0) // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) // $example off$ densities.foreach(println) sc.stop() } } // scalastyle:on println
Example 188
Source File: CosineSimilarity.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 189
Source File: ElementwiseProductExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 190
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
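As a sanity check on the factorization above, the factors can be multiplied back together; the sketch below is illustrative and the object name is invented. Matrices.diag builds the diagonal matrix of singular values.

import org.apache.spark.mllib.linalg.{Matrices, Matrix, SingularValueDecomposition}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Illustrative sketch only: rebuild an approximation of the original matrix as U * diag(s) * V^T.
object SvdReconstruction {
  def reconstruct(svd: SingularValueDecomposition[RowMatrix, Matrix]): RowMatrix = {
    val us: RowMatrix = svd.U.multiply(Matrices.diag(svd.s))
    us.multiply(svd.V.transpose)
  }
}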
Example 191
Source File: StratifiedSamplingExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} object StratifiedSamplingExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StratifiedSamplingExample") val sc = new SparkContext(conf) // $example on$ // an RDD[(K, V)] of any key value pairs val data = sc.parallelize( Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) // specify the exact fraction desired from each key val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) // Get an approximate sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions) // Get an exact sample from each stratum val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ println("approxSample size is " + approxSample.collect().size.toString) approxSample.collect().foreach(println) println("exactSample its size is " + exactSample.collect().size.toString) exactSample.collect().foreach(println) sc.stop() } } // scalastyle:on println
Example 192
Source File: TallSkinnySVD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 193
Source File: PrefixSpanExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.fpm.PrefixSpan // $example off$ object PrefixSpanExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("PrefixSpanExample") val sc = new SparkContext(conf) // $example on$ val sequences = sc.parallelize(Seq( Array(Array(1, 2), Array(3)), Array(Array(1), Array(3, 2), Array(1, 2)), Array(Array(1, 2), Array(5)), Array(Array(6)) ), 2).cache() val prefixSpan = new PrefixSpan() .setMinSupport(0.5) .setMaxPatternLength(5) val model = prefixSpan.run(sequences) model.freqSequences.collect().foreach { freqSequence => println( freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq) } // $example off$ } } // scalastyle:off println
Example 194
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 195
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
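A brief sketch, not part of the example, of using the trained model above on new data; the vector values are arbitrary and the object name is invented.

import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

// Illustrative sketch only: inspect the learned centres and assign a new observation.
object KMeansPredictSketch {
  def clusterOf(model: KMeansModel): Int = {
    model.clusterCenters.foreach(println)     // the learned cluster centres
    model.predict(Vectors.dense(0.2, 0.2, 0.2)) // index of the closest centre
  }
}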
Example 196
Source File: MultivariateSummarizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils

object MultivariateSummarizer {

  // NOTE: the Params case class and the start of the scopt parser were truncated in the
  // original snippet; they are restored here so the example compiles.
  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("MultivariateSummarizer") {
      head("MultivariateSummarizer: an example app for MultivariateOnlineSummarizer")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.MultivariateSummarizer \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
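The aggregate pattern above is the general way to drive MultivariateOnlineSummarizer; for plain column statistics over an RDD of vectors, Statistics.colStats wraps the same summarizer. A minimal sketch with made-up vectors and a hypothetical object name:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object ColStatsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ColStatsSketch").setMaster("local[*]"))
    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)
    ))
    val summary = Statistics.colStats(rows)
    println(summary.mean)        // column means
    println(summary.variance)    // column variances
    println(summary.numNonzeros) // non-zero counts per column
    sc.stop()
  }
}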
Example 197
Source File: RecommendationExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$
  }
}
// scalastyle:on println
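Once trained, a MatrixFactorizationModel can also produce top-N recommendations directly via recommendProducts. A self-contained sketch on a tiny, made-up rating matrix (object name, user ids, and hyperparameters are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object TopNRecommendations {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TopNRecommendations").setMaster("local[*]"))
    // A tiny, made-up rating matrix: (user, product, rating).
    val ratings = sc.parallelize(Seq(
      Rating(1, 10, 5.0), Rating(1, 20, 1.0),
      Rating(2, 10, 4.0), Rating(2, 30, 5.0),
      Rating(3, 20, 2.0), Rating(3, 30, 4.0)
    ))
    val model = ALS.train(ratings, 5, 10, 0.01)

    // Top-2 product recommendations for user 1.
    model.recommendProducts(1, 2).foreach { case Rating(user, product, score) =>
      println(s"user $user -> product $product (predicted rating $score)")
    }
    sc.stop()
  }
}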
Example 198
Source File: AssociationRulesExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.AssociationRules
import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
// $example off$

object AssociationRulesExample {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("AssociationRulesExample")
    val sc = new SparkContext(conf)

    // $example on$
    val freqItemsets = sc.parallelize(Seq(
      new FreqItemset(Array("a"), 15L),
      new FreqItemset(Array("b"), 35L),
      new FreqItemset(Array("a", "b"), 12L)
    ))

    val ar = new AssociationRules()
      .setMinConfidence(0.8)
    val results = ar.run(freqItemsets)

    results.collect().foreach { rule =>
      println("[" + rule.antecedent.mkString(",") + "=>" +
        rule.consequent.mkString(",") + "]," + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println
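In practice the frequent itemsets are usually produced by FPGrowth rather than written by hand. A minimal sketch chaining the two (the tiny transaction list, thresholds, and object name are made up for illustration):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.{AssociationRules, FPGrowth}

object FPGrowthToRules {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FPGrowthToRules").setMaster("local[*]"))
    val transactions = sc.parallelize(Seq(
      Array("a", "b", "c"),
      Array("a", "b"),
      Array("a", "c"),
      Array("b", "c")
    ))
    // Mine frequent itemsets, then derive rules from them.
    val fpModel = new FPGrowth().setMinSupport(0.5).run(transactions)
    val rules = new AssociationRules().setMinConfidence(0.6).run(fpModel.freqItemsets)
    rules.collect().foreach { r =>
      println(r.antecedent.mkString(",") + " => " + r.consequent.mkString(",") + " @ " + r.confidence)
    }
    sc.stop()
  }
}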
Example 199
Source File: LinearRegressionWithSGDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
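As the @deprecated annotation notes, the DataFrame-based ml.regression.LinearRegression is the recommended replacement. A minimal sketch of the equivalent workflow (it assumes the libsvm-formatted data/mllib/sample_linear_regression_data.txt that ships with Spark rather than the lpsa data above, and the object name and hyperparameters are illustrative):

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession

object LinearRegressionMLSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("LinearRegressionMLSketch")
      .master("local[*]")
      .getOrCreate()

    // libsvm-formatted data loads directly into a (label, features) DataFrame.
    val training = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression().setMaxIter(100).setRegParam(0.1)
    val lrModel = lr.fit(training)

    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    println(s"RMSE: ${lrModel.summary.rootMeanSquaredError}")

    spark.stop()
  }
}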
Example 200
Source File: SampledRDDs.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object SampledRDDs {

  // NOTE: the Params case class and the start of the scopt parser were truncated in the
  // original snippet; they are restored here (default input path assumed) so the
  // example compiles.
  case class Params(input: String = "data/mllib/sample_binary_classification_data.txt")

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SampledRDDs") {
      head("SampledRDDs: an example app for randomly sampled RDDs.")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.SampledRDDs \
        |  examples/target/scala-*/spark-examples-*.jar
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SampledRDDs with $params")
    val sc = new SparkContext(conf)

    val fraction = 0.1 // fraction of data to sample

    val examples = MLUtils.loadLibSVMFile(sc, params.input)
    val numExamples = examples.count()
    if (numExamples == 0) {
      throw new RuntimeException("Error: Data file had no samples to load.")
    }
    println(s"Loaded data with $numExamples examples from file: ${params.input}")

    // Example: RDD.sample() and RDD.takeSample()
    val expectedSampleSize = (numExamples * fraction).toInt
    println(s"Sampling RDD using fraction $fraction. Expected sample size = $expectedSampleSize.")
    val sampledRDD = examples.sample(withReplacement = true, fraction = fraction)
    println(s" RDD.sample(): sample has ${sampledRDD.count()} examples")
    val sampledArray = examples.takeSample(withReplacement = true, num = expectedSampleSize)
    println(s" RDD.takeSample(): sample has ${sampledArray.length} examples")

    println()

    // Example: RDD.sampleByKey() and RDD.sampleByKeyExact()
    val keyedRDD = examples.map { lp => (lp.label.toInt, lp.features) }
    println(s" Keyed data using label (Int) as key ==> Orig")
    // Count examples per label in original data.
    val keyCounts = keyedRDD.countByKey()

    // Subsample, and count examples per label in sampled data. (approximate)
    val fractions = keyCounts.keys.map((_, fraction)).toMap
    val sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = true, fractions = fractions)
    val keyCountsB = sampledByKeyRDD.countByKey()
    val sizeB = keyCountsB.values.sum
    println(s" Sampled $sizeB examples using approximate stratified sampling (by label)." +
      " ==> Approx Sample")

    // Subsample, and count examples per label in sampled data. (exact)
    val sampledByKeyRDDExact =
      keyedRDD.sampleByKeyExact(withReplacement = true, fractions = fractions)
    val keyCountsBExact = sampledByKeyRDDExact.countByKey()
    val sizeBExact = keyCountsBExact.values.sum
    println(s" Sampled $sizeBExact examples using exact stratified sampling (by label)." +
      " ==> Exact Sample")

    // Compare samples
    println(s" \tFractions of examples with key")
    println(s"Key\tOrig\tApprox Sample\tExact Sample")
    keyCounts.keys.toSeq.sorted.foreach { key =>
      val origFrac = keyCounts(key) / numExamples.toDouble
      val approxFrac = if (sizeB != 0) {
        keyCountsB.getOrElse(key, 0L) / sizeB.toDouble
      } else {
        0
      }
      val exactFrac = if (sizeBExact != 0) {
        keyCountsBExact.getOrElse(key, 0L) / sizeBExact.toDouble
      } else {
        0
      }
      println(s"$key\t$origFrac\t$approxFrac\t$exactFrac")
    }

    sc.stop()
  }
}
// scalastyle:on println
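The key difference the example is illustrating: sampleByKey draws each element independently, so per-key sample sizes only approximate fraction * count, while sampleByKeyExact makes additional passes over the data to hit the target stratum sizes. A short spark-shell-style sketch on a made-up keyed RDD (it assumes an existing SparkContext named sc):

// A small keyed RDD made up for illustration: key = i % 3, value = i.
val keyed = sc.parallelize(1 to 1000).map(i => (i % 3, i))
val fractions = Map(0 -> 0.1, 1 -> 0.2, 2 -> 0.3)

// Per-element Bernoulli sampling: stratum sizes are only approximately fraction * count.
val approx = keyed.sampleByKey(withReplacement = false, fractions = fractions)
// Extra pass(es) over the data, but stratum sizes match the requested fractions per key.
val exact = keyed.sampleByKeyExact(withReplacement = false, fractions = fractions)

println(approx.countByKey())
println(exact.countByKey())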