org.apache.spark.SparkContext Scala Examples
The following examples show how to use org.apache.spark.SparkContext.
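Before the project-specific examples, here is a minimal, self-contained sketch of the typical SparkContext lifecycle (not taken from any of the projects below): build a SparkConf, create the context, run a simple job, and stop the context when done.

package example

import org.apache.spark.{SparkConf, SparkContext}

object SparkContextMinimalExample {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark locally with one worker thread per core;
    // in a real deployment the master is usually supplied by spark-submit.
    val conf = new SparkConf()
      .setAppName("SparkContextMinimalExample")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // A trivial job: sum the numbers 1 to 10 in parallel.
    val sum = sc.parallelize(1 to 10).reduce(_ + _)
    println(s"sum = $sum")

    // Always stop the context so resources are released.
    sc.stop()
  }
}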
Example 1
Source File: DeltaQA.scala From spark-tools with Apache License 2.0 | 12 votes |
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff = left - right
    val error = diff * diff
    DeltaCommon(
      count = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff = diff,
      error = error,
      left = left,
      right = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {
    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id
            .copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id
            .copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] = sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))
  }
}
Example 2
Source File: Test1.scala From BigData-News with Apache License 2.0 | 12 votes |
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)
    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)

    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)
    // Prints student ID (学号), name (名字) and score (分数) for each joined record.
    rdd1.join(rdd2).foreach(x => println("学号: " + x._1 + " 名字: " + x._2._1 + " 分数: " + x._2._2))
  }
}
Example 3
Source File: CleanupUtil.scala From hazelcast-spark with Apache License 2.0 | 7 votes |
package com.hazelcast.spark.connector.util

import com.hazelcast.spark.connector.util.ConnectionUtil.closeAll
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}

object CleanupUtil {

  val jobIds: collection.mutable.Map[Int, Seq[Int]] = collection.mutable.Map[Int, Seq[Int]]()
  val cleanupJobRddName: String = "HazelcastResourceCleanupJob"

  def addCleanupListener(sc: SparkContext): Unit = {
    sc.addSparkListener(new SparkListener {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
        this.synchronized {
          jobStart.stageInfos.foreach(info => {
            info.rddInfos.foreach(rdd => {
              if (!cleanupJobRddName.equals(rdd.name)) {
                val ids: Seq[Int] = info.rddInfos.map(_.id)
                val maybeIds: Option[Seq[Int]] = jobIds.get(jobStart.jobId)
                if (maybeIds.isDefined) {
                  jobIds.put(jobStart.jobId, ids ++ maybeIds.get)
                } else {
                  jobIds.put(jobStart.jobId, ids)
                }
              }
            })
          })
        }
      }

      override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
        this.synchronized {
          if (jobIds.contains(jobEnd.jobId)) {
            try {
              val workers = sc.getConf.getInt("spark.executor.instances", sc.getExecutorStorageStatus.length)
              val rddId: Option[Seq[Int]] = jobIds.get(jobEnd.jobId)
              if (rddId.isDefined) {
                sc.parallelize(1 to workers, workers).setName(cleanupJobRddName).foreachPartition(it => closeAll(rddId.get))
              }
              jobIds -= jobEnd.jobId
            } catch {
              case e: Exception =>
            }
          }
        }
      }
    })
  }
}
Example 4
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
          s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: OperatorsDSL.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection._
import ml.sparkling.graph.api.operators.measures.{EdgeMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN._
import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor
import ml.sparkling.graph.operators.measures.edge.{AdamicAdar, CommonNeighbours}
import ml.sparkling.graph.operators.measures.vertex.{Degree, NeighborhoodConnectivity, VertexEmbeddedness}
import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering
import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality, Modularity}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning._
import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness
import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality
import ml.sparkling.graph.operators.measures.vertex.hits.Hits
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

object OperatorsDSL {

  implicit class ModularityDSL[E: ClassTag](graph: Graph[ComponentID, E]) {
    def modularity() = Modularity.compute(graph)
  }

  implicit class DSL[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) {

    def PSCAN(epsilon: Double = 0.1) =
      computeConnectedComponents(graph, epsilon)

    def LPCoarse(treatAsUndirected: Boolean = false) =
      LPCoarsening.coarse(graph, treatAsUndirected = treatAsUndirected)

    def closenessCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Closeness.compute(graph, vertexMeasureConfiguration)

    def eigenvectorCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      EigenvectorCentrality.compute(graph, vertexMeasureConfiguration)

    def hits(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Hits.compute(graph, vertexMeasureConfiguration)

    def degreeCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Degree.compute(graph, vertexMeasureConfiguration)

    def neighborhoodConnectivity(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      NeighborhoodConnectivity.compute(graph, vertexMeasureConfiguration)

    def vertexEmbeddedness(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      VertexEmbeddedness.compute(graph, vertexMeasureConfiguration)

    def localClustering(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      LocalClustering.compute(graph, vertexMeasureConfiguration)

    def freemanCentrality() = FreemanCentrality.compute(graph)

    def partitionBy(communityDetectionMethod: CommunityDetectionMethod[VD, ED])(implicit sc: SparkContext) =
      partitionGraphBy(graph, communityDetectionMethod)

    def partitionBy(communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext) =
      partitionGraphUsing(graph, communityDetectionMethod, numParts)

    def adamicAdar(treatAsUndirected: Boolean = false) = {
      AdamicAdar.computeWithPreprocessing(graph, treatAsUndirected)
    }

    def commonNeighbours(treatAsUndirected: Boolean = false) = {
      CommonNeighbours.computeWithPreprocessing(graph, treatAsUndirected)
    }

    def predictLinks[EV: ClassTag, EO: ClassTag](
        edgeMeasure: EdgeMeasure[EO, EV], threshold: EO, treatAsUndirected: Boolean = false)(implicit num: Numeric[EO]) = {
      BasicLinkPredictor.predictLinks(graph, edgeMeasure, threshold, treatAsUndirected)
    }
  }
}
Example 7
Source File: HBase.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

// NOTE: this listing is an excerpt; the enclosing class declaration and the
// hBaseConfiguration field it defines are omitted in the original snippet.

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)

    if (result.isEmpty) {
      warn("hbase table don't have this data, execute insert")
      return false
    }

    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options needed for local testing; on a cluster they are picked up
    // automatically from the corresponding configuration files.
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1") // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2") // the namenodes
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // Class implementing automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
} // closes the enclosing class omitted above
Example 8
Source File: TFIDF.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line). Each line is treated as one document;
    // zipWithIndex uses the line number as the document id.
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // feature number
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // Use the line number as the doc id and turn each line's tokens into a tf vector.
    val idAndTFVector = documents.map {
      case (seq, num) =>
        val tf = hashingTF.transform(seq)
        (num + 1, tf)
    }
    idAndTFVector.cache()

    // build idf model
    val idf = new IDF().fit(idAndTFVector.values)

    // transform tf vector to tf-idf vector
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // broadcast tf-idf vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // cal doc cosineSimilarity
    val docSims = idAndTFIDFVector.flatMap {
      case (id1, idf1) =>
        // filter the same doc id
        val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
        val sv1 = idf1.asInstanceOf[SV]
        import breeze.linalg._
        val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
        idfs.map {
          case (id2, idf2) =>
            val sv2 = idf2.asInstanceOf[SV]
            val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
            val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
            (id1, id2, cosSim)
        }
    }
    docSims.foreach(println)

    sc.stop()
  }
}
Example 9
Source File: SqlUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 6 votes |
package com.cloudera.sa.spark.unittest.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable

class SqlUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var hiveContext: HiveContext = null

  override def beforeAll(): Unit = {
    val envMap = Map[String, String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    hiveContext = new HiveContext(sc)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Test table creation and summing of counts") {
    val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
      Row("tj", 11, "green"),
      Row("andrew", 9, "green")))

    hiveContext.sql("create table person (name string, age int, color string)")

    val emptyDataFrame = hiveContext.sql("select * from person limit 0")

    val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
    personDataFrame.registerTempTable("tempPerson")

    val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")

    val localAgeSum = ageSumDataFrame.take(10)

    assert(localAgeSum(0).get(0) == 62, "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
  }
}
Example 10
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")

  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)

  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)
  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 11
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 12
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.reduceByWindow((x, y) => x + y, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 13
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_KafkaStream {
  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)

    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // Obtain the StreamingContext (restored from the checkpoint if one exists)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(brokerList: String,
      consumeTopic: String,
      checkpointDir: String,
      saveDir: String): () => StreamingContext = { () => {
    // NOTE: the body of this factory and the updateStateByKeyFunction below were
    // garbled in the original listing; they are reconstructed here following the
    // pattern of the neighbouring gihyo_6_3 examples and should be read as a sketch.
    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val kafkaParams = Map("metadata.broker.list" -> brokerList)
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set(consumeTopic))
    run(kafkaStream, saveDir)
    ssc
  }}

  def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
    System.out.println(values)
    Some(running.getOrElse(0) + values.length)
  }

  def run(stream: InputDStream[(String, String)],
      saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {

    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
Example 14
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import org.atilika.kuromoji.Token
import twitter4j.Status

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object gihyo_6_3_TwitterStream {
  def main(args: Array[String]) {
    if (args.length != 7) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args

    System.setProperty("twitter4j.oauth.consumerKey", cKey)
    System.setProperty("twitter4j.oauth.consumerSecret", cSecret)
    System.setProperty("twitter4j.oauth.accessToken", aToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret)

    val f = createStreamingContext(cDir, tagDir, wordDir)
    val ssc = StreamingContext.getOrCreate(cDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(checkpointDir: String,
      tagDir: String,
      wordDir: String): () => StreamingContext = { () => {

    val conf = new SparkConf().setAppName("gihyoSample_Application")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(Array(classOf[UserDic]))

    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val twitterStream = TwitterUtils.createStream(ssc, None)
    run(sc, twitterStream, tagDir, wordDir)
    ssc
  }}

  def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) {
    val tokenizer = sc.broadcast(UserDic.getInstance)
    val tweets = stream.map(tweet => tweet.getText())
    tweets.persist()
    val TweetText = tweets
      .flatMap(text => {
        val tokens = tokenizer.value.tokenize(text).toArray
        tokens.filter(t => {
          val token = t.asInstanceOf[Token]
          ((token.getPartOfSpeech.indexOf("名詞") > -1 &&
            token.getPartOfSpeech.indexOf("一般") > -1) ||
            token.getPartOfSpeech.indexOf("カスタム名詞") > -1) &&
            token.getSurfaceForm.length > 1 &&
            !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$")
        }).map(t => t.asInstanceOf[Token].getSurfaceForm)
      })
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    val TweetTags = tweets
      .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#")))
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    TweetText.saveAsTextFiles(wordDir)
    TweetTags.saveAsTextFiles(tagDir)
  }
}
// scalastyle:on println
Example 15
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow_efficient {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow(
        (a: Int, b: Int) => a + b,
        (a: Int, b: Int) => a - b,
        Seconds(windowLength),
        Seconds(slideInterval))
    userList.print
  }
}
Example 16
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = stream.map(x => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
    })
    userList.print
  }
}
Example 17
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 18
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValueAndWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val f = createStreamingContext(targetHost, targetHostPort, checkpointDir)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(
      targetHost: String,
      targetHostPort: Int,
      checkpointDir: String): () => StreamingContext = { () => {

    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)
    ssc
  }}

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
// scalastyle:on println
Example 19
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_updateStateByKey {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _)
    userList.print
  }

  def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = {
    Some(running.getOrElse(0) + values.size)
  }
}
Example 20
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 21
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Window {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue()
    userList.print
  }
}
Example 22
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ReduceExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.reduce((x, y) => x + y)}""")
  }
}
// scalastyle:on println
Example 23
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object StatsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("StatsExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11))
    val stats = nums.stats()

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""count: ${stats.count}""")
    println(s"""mean: ${stats.mean}""")
    println(s"""stdev: ${stats.stdev}""")
  }
}
// scalastyle:on println
Example 24
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FoldExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""")
  }
}
// scalastyle:on println
Example 25
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object OrderExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""top3: ${nums.top(3).mkString(", ")}""")
    println(s"""takeOrdered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}
// scalastyle:on println
Example 26
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  private[basic_action] def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""")
    println(s"""avg: $avg""")
  }
}
// scalastyle:on println
Example 27
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CollectAsMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CollectAsMapExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(
        ("Apple", 1), ("Orange", 1), ("Peach", 1),
        ("Orange", 1), ("PineApple", 1), ("Orange", 1)
      ), 3
    )
    val fruitsAsMap = fruits.collectAsMap()

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitsAsMap: $fruitsAsMap""")
  }
}
// scalastyle:on println
Example 28
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))

    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
    persistedLines.collect()
  }
}
Example 29
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CustomPartitionerExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CustomPartitionerExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _)
    val customPartitioned = fruits.map((_, 1)).reduceByKey(
      new FirstLetterPartitioner(sc.defaultParallelism), _ + _)

    println(s"""fruits:\n ${fruits.collect().mkString(", ")}""")
    println()

    println("partitioned by default partitioner")
    defaultPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    println("partitioned by first letter partitioner")
    customPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
    key.toString.charAt(0).hashCode % numPartitions match {
      case p if p < 0 => p + numPartitions
      case p => p
    }
  }

  override def equals(other: Any): Boolean = {
    other match {
      case p: FirstLetterPartitioner => p.numPartitions == numPartitions
      case _ => false
    }
  }
}
// scalastyle:on println
Example 30
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object PartitionExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("Partition")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1)
    println(s"""nums:\n ${nums.collect().mkString(", ")}""")
    println()

    println("original:")
    nums.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar3 = nums.repartition(3)
    println("repartition to 3:")
    numsPar3.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar2 = numsPar3.coalesce(2)
    println("coalesce to 2:")
    numsPar2.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}
// scalastyle:on println
Example 31
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object WordCountExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))

    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}
// scalastyle:on println
Example 32
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    // Acc is a small (sum, count) accumulator case class defined elsewhere in this example package.
    val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))(
      seqOp = (partAcc, n) => partAcc += n,
      combOp = (acc1, acc2) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 33
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapValuesExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapValuesExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1)))
    val plusOnes = fruits.mapValues(v => v + 1)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 34
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SortByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyAsc = fruits.sortByKey(ascending = false)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyAsc: ${sortedByKeyAsc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 35
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CoGroupExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CoGroupExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

    val grouped = persons.map(_.swap).cogroup(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities: ${cities.collect().mkString(", ")}""")
    println()
    println(s"""grouped:\n${grouped.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 36
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object JoinExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("JoinExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

    val leftJoined = persons.map(_.swap).join(cities)
    val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities)
    val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities)
    val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities: ${cities.collect().mkString(", ")}""")
    println()
    println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""")
    println()
    println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 37
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object GroupByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("GroupByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val grouped = fruits.groupByKey()

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""grouped: ${grouped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 38
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ReduceByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1),
      ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.reduceByKey((x, y) => x + y)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 39
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CombineByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CombineByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    // Acc is a small (sum, count) accumulator case class defined elsewhere in this example package.
    val fruitCountAvgs = fruits.combineByKey(
      createCombiner = (v: Int) => Acc(v.toDouble, 1),
      mergeValue = (partAcc: Acc, n: Int) => partAcc += n,
      mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 40
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FoldByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1),
      ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.foldByKey(0)((x, y) => x + y)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 41
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapPartitionsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple", "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple", "num": 2}""",
      """{"name": "Peach", "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 42
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FlatMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))

    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 43
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SetOperationsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 44
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
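When the derived value should be kept alongside the original element, mapping to a pair is a common variant. A sketch assuming the fruits RDD above, not part of the original file:

    val withLengths = fruits.map(fruit => (fruit, fruit.length))  // e.g. ("Apple", 5)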
Example 45
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ZipExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)
    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped: ${zipped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
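zip requires both RDDs to have the same number of partitions and the same number of elements per partition. When only a positional index is needed, zipWithIndex has no such requirement. A sketch assuming the fruits1 RDD above, not part of the original file:

    val indexed = fruits1.zipWithIndex()  // e.g. ("Apple", 0), ("Orange", 1), ...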
Example 46
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object DistinctExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
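If the goal is to count how often each distinct element appears rather than just to deduplicate, countByValue returns the tally as a local Map. A sketch assuming the fruits RDD above, not part of the original file:

    val counts: scala.collection.Map[String, Long] = fruits.countByValue()
    println(s"""counts: ${counts.mkString(", ")}""")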
Example 47
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SampleExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
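sample only approximates the requested fraction and returns an RDD; when an exact number of elements is needed as a local collection, takeSample can be used instead. A sketch assuming the fruits RDD above, not part of the original file:

    val exactThree = fruits.takeSample(withReplacement = false, num = 3, seed = 1)
    println(s"""exactThree: ${exactThree.mkString(", ")}""")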
Example 48
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FilterExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 49
Source File: JdbcExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

// scalastyle:off println
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      throw new IllegalArgumentException(
        "Invalid arguments: expected <jdbcUrl> <user> <password>")
    }
    val url = args(0)
    val user = args(1)
    val pass = args(2)

    val conf = new SparkConf().setAppName("JdbcExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext, url, user, pass)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext,
      url: String, user: String, pass: String): Unit = {
    val prop = new Properties()
    prop.setProperty("user", user)
    prop.setProperty("password", pass)
    val df: DataFrame = sqlContext.read.jdbc(url, "gihyo_spark.person", prop)

    df.printSchema()
    println("# Rows: " + df.count())
  }
}
// scalastyle:on println
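The same table can also be read through the generic DataFrameReader options instead of a Properties object. A sketch, not part of the original file, assuming the sqlContext and the url, user and pass values parsed in main above:

    val df2 = sqlContext.read
      .format("jdbc")
      .option("url", url)
      .option("dbtable", "gihyo_spark.person")
      .option("user", user)
      .option("password", pass)
      .load()
    df2.printSchema()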
Example 50
Source File: DataFrameNaFunctionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05 // scalastyle:off println import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object DataFrameNaFunctionExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("BasicDataFrameExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) run(sc, sqlContext) sc.stop() } def run( sc: SparkContext, sqlContext: SQLContext): Unit = { import sqlContext.implicits._ val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( ("Bob", 16, 176.5), ("Alice", null, 164.3), ("", 60, null), ("UNKNOWN", 25, Double.NaN), ("Amy", null, null), (null, null, Double.NaN) ).toDF("name", "age", "height") // drop nullDF.na.drop("any").show() nullDF.na.drop("all").show() nullDF.na.drop(Array("age")).show() nullDF.na.drop(Seq("age", "height")).show() nullDF.na.drop("any", Array("name", "age")).show() nullDF.na.drop("all", Array("age", "height")).show() // fill nullDF.na.fill(0.0, Array("name", "height")).show() nullDF.na.fill(Map( "name" -> "UNKNOWN", "height" -> 0.0 )).show() // replace nullDF.na.replace("name", Map("" -> "UNKNOWN")).show() } } // scalastyle:on println
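Note that na.fill(0.0, Array("name", "height")) only affects the numeric column: a Double fill value is silently ignored for string columns such as name. Filling mixed column types in one call requires the Map form. A sketch, not part of the original file, assuming the nullDF defined above:

    nullDF.na.fill(Map("name" -> "UNKNOWN", "age" -> 0, "height" -> 0.0)).show()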
Example 51
Source File: DatasetExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
}
Example 52
Source File: TestStreamingContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterEach, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import jp.gihyo.spark.ch06.UserDic private[spark] trait TestStreamingContext extends BeforeAndAfterEach { self: Suite => @transient var ssc: StreamingContext = _ @transient var sc: SparkContext = _ val master = "local[2]" val appN = "StreamingUnitTest" val bd = Seconds(1) override def beforeEach() { super.beforeEach() val conf = new SparkConf().setMaster(master) .setAppName(appN) .set("spark.streaming.clock", "org.apache.spark.util.ManualClock") .registerKryoClasses(Array(classOf[UserDic])) ssc = new StreamingContext(conf, bd) sc = ssc.sparkContext } override def afterEach() { try { if (ssc != null) { // stop with sc ssc.stop(true) } ssc = null; } finally { super.afterEach() } } }
Example 53
Source File: TestSparkContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext private[spark] trait TestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("SparkUnitTest") .set("spark.sql.shuffle.partitions", "2") sc = new SparkContext(conf) SQLContext.clearActive() sqlContext = new SQLContext(sc) SQLContext.setActive(sqlContext) } override def afterAll() { try { sqlContext = null SQLContext.clearActive() if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } }
Example 54
Source File: TestMain.scala From hbrdd with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkContext, SparkConf} object TestMain { private val master = "Core1" private val port = "7077" private val appName = "hbase-rdd_spark" private val data = "hdfs://Master1:8020/test/spark/hbase/testhb" def main(args: Array[String]) { val sparkConf = new SparkConf() .setMaster(s"spark://$master:$port") .setAppName(appName).setJars(List("/home/lele/coding/hbrdd/out/artifacts/hbrdd_jar/hbrdd.jar")) val sc = new SparkContext(sparkConf) val ret = sc.textFile(data).map({ line => val Array(k, col1, col2, _) = line split "\t" val content = Map("col1" -> col1, "col2" -> col2) k -> content }) println(ret.count()) sc.stop() } }
Example 55
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
Example 56
Source File: XmlFileSuite.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.nio.charset.{StandardCharsets, UnsupportedCharsetException} import org.apache.spark.SparkContext import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite final class XmlFileSuite extends AnyFunSuite with BeforeAndAfterAll { private val booksFile = "src/test/resources/books.xml" private val booksUnicodeInTagNameFile = "src/test/resources/books-unicode-in-tag-name.xml" private val booksFileTag = "book" private val booksUnicodeFileTag = "\u66F8" // scalastyle:ignore private val numBooks = 12 private val numBooksUnicodeInTagName = 3 private val fiasHouse = "src/test/resources/fias_house.xml" private val fiasRowTag = "House" private val numHouses = 37 private val utf8 = StandardCharsets.UTF_8.name private var sparkContext: SparkContext = _ override def beforeAll(): Unit = { super.beforeAll() sparkContext = new SparkContext("local[2]", "TextFileSuite") } override def afterAll(): Unit = { try { sparkContext.stop() sparkContext = null } finally { super.afterAll() } } test("read utf-8 encoded file") { val baseRDD = XmlFile.withCharset(sparkContext, booksFile, utf8, rowTag = booksFileTag) assert(baseRDD.count() === numBooks) } test("read file with unicode chars in row tag name") { val baseRDD = XmlFile.withCharset( sparkContext, booksUnicodeInTagNameFile, utf8, rowTag = booksUnicodeFileTag) assert(baseRDD.count() === numBooksUnicodeInTagName) } test("read utf-8 encoded file with empty tag") { val baseRDD = XmlFile.withCharset(sparkContext, fiasHouse, utf8, rowTag = fiasRowTag) assert(baseRDD.count() == numHouses) baseRDD.collect().foreach(x => assert(x.contains("/>"))) } test("unsupported charset") { val exception = intercept[UnsupportedCharsetException] { XmlFile.withCharset(sparkContext, booksFile, "frylock", rowTag = booksFileTag).count() } assert(exception.getMessage.contains("frylock")) } }
Example 57
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted import org.scalactic.Equality import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.{ Dataset, SparkSession } object SparkSuite { lazy val spark: SparkSession = { val session = SparkSession.builder .master("local[*]") .appName("test") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.ui.enabled", false) .config("spark.sql.shuffle.partitions", 4) .getOrCreate() session } lazy val sc: SparkContext = spark.sparkContext lazy val jsc = new JavaSparkContext(sc) def javaSparkContext() = jsc } trait SparkSuite { implicit lazy val spark: SparkSession = SparkSuite.spark implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] { private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size) def areEqual(a: RDD[X], b: Any): Boolean = b match { case s: Seq[_] => toCounts(a.collect) == toCounts(s) case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect) } } implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] { def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b) } implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] { def areEqual(a: Dataset[X], b: Any): Boolean = b match { case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd) case x => rddEq.areEqual(a.rdd, x) } } }
Example 58
Source File: TestUtils.scala From odsc-east-realish-predictions with Apache License 2.0 | 5 votes |
package com.twilio.open.odsc.realish import com.holdenkarau.spark.testing.{LocalSparkContext, SparkContextProvider} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} object TestUtils { } @SerialVersionUID(1L) case class UserPersonality(uuid: String, name: String, tags: Array[String]) extends Serializable @SerialVersionUID(1L) case class Author(uuid: String, name: String, age: Int) extends Serializable @SerialVersionUID(1L) case class LibraryBook(uuid: String, name: String, author: Author) extends Serializable case class MockKafkaDataFrame(key: Array[Byte], value: Array[Byte]) trait SharedSparkSql extends BeforeAndAfterAll with SparkContextProvider { self: Suite => @transient var _sparkSql: SparkSession = _ @transient private var _sc: SparkContext = _ override def sc: SparkContext = _sc def conf: SparkConf def sparkSql: SparkSession = _sparkSql override def beforeAll() { _sparkSql = SparkSession.builder().config(conf).getOrCreate() _sc = _sparkSql.sparkContext setup(_sc) super.beforeAll() } override def afterAll() { try { _sparkSql.close() _sparkSql = null LocalSparkContext.stop(_sc) _sc = null } finally { super.afterAll() } } }
Example 60
Source File: HyperLogLog.scala From spark-hyperloglog with Apache License 2.0 | 5 votes |
package com.mozilla.spark.sql.hyperloglog.test import com.mozilla.spark.sql.hyperloglog.aggregates._ import com.mozilla.spark.sql.hyperloglog.functions._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{FlatSpec, Matchers} class HyperLogLogTest extends FlatSpec with Matchers{ "Algebird's HyperLogLog" can "be used from Spark" in { val sparkConf = new SparkConf().setAppName("HyperLogLog") sparkConf.setMaster(sparkConf.get("spark.master", "local[1]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val hllMerge = new HyperLogLogMerge sqlContext.udf.register("hll_merge", hllMerge) sqlContext.udf.register("hll_create", hllCreate _) sqlContext.udf.register("hll_cardinality", hllCardinality _) val frame = sc.parallelize(List("a", "b", "c", "c"), 4).toDF("id") val count = frame .select(expr("hll_create(id, 12) as hll")) .groupBy() .agg(expr("hll_cardinality(hll_merge(hll)) as count")) .collect() count(0)(0) should be (3) } }
Example 61
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 62
Source File: TimitFeaturesDataLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import breeze.linalg.DenseVector
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable

// NOTE: this snippet lost its enclosing object declaration and the private
// helpers it calls (parseSparseLabels, createLabelsRDD); only the public entry
// point survived, so the wrapper below is a reconstruction and the helpers are
// assumed to exist in the original source.
object TimitFeaturesDataLoader {

  def apply(sc: SparkContext,
      trainDataLocation: String,
      trainLabelsLocation: String,
      testDataLocation: String,
      testLabelsLocation: String,
      numParts: Int = 512): TimitFeaturesData = {
    val trainData = CsvDataLoader(sc, trainDataLocation, numParts)
    val trainLabels = createLabelsRDD(parseSparseLabels(trainLabelsLocation), trainData)

    val testData = CsvDataLoader(sc, testDataLocation, numParts)
    val testLabels = createLabelsRDD(parseSparseLabels(testLabelsLocation), testData)

    TimitFeaturesData(LabeledData(trainLabels.zip(trainData)),
      LabeledData(testLabels.zip(testData)))
  }
}
Example 63
Source File: ImageNetLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import keystoneml.utils.LabeledImage

// NOTE: this snippet lost its enclosing object declaration; it is reconstructed
// here so the apply method has a home.
object ImageNetLoader {

  def apply(sc: SparkContext, dataPath: String, labelsPath: String): RDD[LabeledImage] = {
    val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath)

    val labelsMapFile = scala.io.Source.fromFile(labelsPath)
    val labelsMap = labelsMapFile.getLines().map(x => x.toString).toArray.map { line =>
      val parts = line.split(" ")
      (parts(0), parts(1).toInt)
    }.toMap

    def labelsMapF(fname: String): Int = {
      labelsMap(fname.split('/')(0))
    }

    ImageLoaderUtils.loadFiles(filePathsRDD, labelsMapF, LabeledImage.apply)
  }
}
Example 64
Source File: VOCLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import keystoneml.pipelines.Logging
import keystoneml.utils.MultiLabeledImage

case class VOCDataPath(imagesDirName: String, namePrefix: String, numParts: Option[Int])
case class VOCLabelPath(labelsFileName: String)

// NOTE: this snippet lost its enclosing object declaration; it is reconstructed
// here so the apply method has a home.
object VOCLoader {

  def apply(sc: SparkContext,
      dataPath: VOCDataPath,
      labelsPath: VOCLabelPath): RDD[MultiLabeledImage] = {
    val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath.imagesDirName, dataPath.numParts)

    val labelsMapFile = scala.io.Source.fromFile(labelsPath.labelsFileName)
    val labelsMap: Map[String, Array[Int]] = labelsMapFile
      .getLines()
      .drop(1)
      .map(x => x.toString)
      .map { line =>
        val parts = line.split(",")
        (parts(4).replace("\"", ""), parts(1).toInt - 1)
      }
      .toArray
      .groupBy(_._1)
      .mapValues(_.map(_._2))
      .map(identity)
    labelsMapFile.close()

    ImageLoaderUtils.loadFiles(filePathsRDD, labelsMap, MultiLabeledImage.apply, Some(dataPath.namePrefix))
  }
}
Example 65
Source File: LinearPixels.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.images.cifar import breeze.linalg.DenseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.CifarLoader import keystoneml.nodes.images.{GrayScaler, ImageExtractor, ImageVectorizer, LabelExtractor} import keystoneml.nodes.learning.LinearMapEstimator import keystoneml.nodes.util.{Cacher, ClassLabelIndicatorsFromIntLabels, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.utils.Image import keystoneml.workflow.Pipeline object LinearPixels extends Logging { val appName = "LinearPixels" case class LinearPixelsConfig(trainLocation: String = "", testLocation: String = "") def run(sc: SparkContext, config: LinearPixelsConfig): Pipeline[Image, Int] = { val numClasses = 10 // Load and cache the training data. val trainData = CifarLoader(sc, config.trainLocation).cache() val trainImages = ImageExtractor(trainData) val labelExtractor = LabelExtractor andThen ClassLabelIndicatorsFromIntLabels(numClasses) andThen new Cacher[DenseVector[Double]] val trainLabels = labelExtractor(trainData) // A featurizer maps input images into vectors. For this pipeline, we'll also convert the image to grayscale. // We then estimate our model by calling a linear solver on our data. val predictionPipeline = GrayScaler andThen ImageVectorizer andThen (new LinearMapEstimator, trainImages, trainLabels) andThen MaxClassifier // Calculate training error. val evaluator = new MulticlassClassifierEvaluator(numClasses) val trainEval = evaluator.evaluate(predictionPipeline(trainImages), LabelExtractor(trainData)) // Do testing. val testData = CifarLoader(sc, config.testLocation) val testImages = ImageExtractor(testData) val testLabels = labelExtractor(testData) val testEval = evaluator.evaluate(predictionPipeline(testImages), LabelExtractor(testData)) logInfo(s"Training accuracy: \n${trainEval.totalAccuracy}") logInfo(s"Test accuracy: \n${testEval.totalAccuracy}") predictionPipeline } def parse(args: Array[String]): LinearPixelsConfig = new OptionParser[LinearPixelsConfig](appName) { head(appName, "0.1") help("help") text("prints this usage text") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } }.parse(args, LinearPixelsConfig()).get def main(args: Array[String]) = { val appConfig = parse(args) val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) run(sc, appConfig) sc.stop() } }
Example 66
Source File: AmazonReviewsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.BinaryClassifierEvaluator
import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData}
import keystoneml.nodes.learning.LogisticRegressionEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.CommonSparseFeatures
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object AmazonReviewsPipeline extends Logging {
  val appName = "AmazonReviewsPipeline"

  def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = {
    val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData
    val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache())

    val training = trainData.data
    val labels = trainData.labels

    // Build the classifier estimator
    val predictor = Trim andThen
      LowerCase() andThen
      Tokenizer() andThen
      NGramsFeaturizer(1 to conf.nGrams) andThen
      TermFrequency(x => 1) andThen
      (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen
      (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters),
        training, labels)

    // Evaluate the classifier
    val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData
    val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache())
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = BinaryClassifierEvaluator.evaluate(testResults.get.map(_ > 0), testLabels.map(_ > 0))

    logInfo("\n" + eval.summary())
    predictor
  }

  case class AmazonReviewsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    threshold: Double = 3.5,
    nGrams: Int = 2,
    commonFeatures: Int = 100000,
    numIters: Int = 20,
    numParts: Int = 512)

  def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) }
    opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) }
    opt[Double]("threshold") action { (x,c) => c.copy(threshold=x) }
    opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) }
    opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) }
    opt[Int]("numIters") action { (x,c) => c.copy(numIters=x) }
    opt[Int]("numParts") action { (x,c) => c.copy(numParts=x) }
  }.parse(args, AmazonReviewsConfig()).get

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.
    val spark = SparkSession.builder.config(conf).getOrCreate()

    val appConfig = parse(args)
    run(spark, appConfig)
    spark.stop()
  }
}
Example 67
Source File: NewsgroupsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text import breeze.linalg.SparseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.NewsgroupsDataLoader import keystoneml.nodes.learning.NaiveBayesEstimator import keystoneml.nodes.nlp._ import keystoneml.nodes.stats.TermFrequency import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.workflow.Pipeline object NewsgroupsPipeline extends Logging { val appName = "NewsgroupsPipeline" def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = { val trainData = NewsgroupsDataLoader(sc, conf.trainLocation) val numClasses = NewsgroupsDataLoader.classes.length // Build the classifier estimator logInfo("Training classifier") val predictor = Trim andThen LowerCase() andThen Tokenizer() andThen NGramsFeaturizer(1 to conf.nGrams) andThen TermFrequency(x => 1) andThen (CommonSparseFeatures[Seq[String]](conf.commonFeatures), trainData.data) andThen (NaiveBayesEstimator[SparseVector[Double]](numClasses), trainData.data, trainData.labels) andThen MaxClassifier // Evaluate the classifier logInfo("Evaluating classifier") val testData = NewsgroupsDataLoader(sc, conf.testLocation) val testLabels = testData.labels val testResults = predictor(testData.data) val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels) logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes)) predictor } case class NewsgroupsConfig( trainLocation: String = "", testLocation: String = "", nGrams: Int = 2, commonFeatures: Int = 100000) def parse(args: Array[String]): NewsgroupsConfig = new OptionParser[NewsgroupsConfig](appName) { head(appName, "0.1") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) } opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) } }.parse(args, NewsgroupsConfig()).get def main(args: Array[String]) = { val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) val appConfig = parse(args) run(sc, appConfig) sc.stop() } }
Example 68
Source File: MeanAveragePrecisionSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseVector import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class MeanAveragePrecisionSuite extends FunSuite with PipelineContext { test("random map test") { sc = new SparkContext("local", "test") // Build some random test data with 4 classes 0,1,2,3 val actual = List(Array(0, 3), Array(2), Array(1, 2), Array(0)) val actualRdd = sc.parallelize(actual) val predicted = List( DenseVector(0.1, -0.05, 0.12, 0.5), DenseVector(-0.23, -0.45, 0.23, 0.1), DenseVector(-0.34, -0.32, -0.66, 1.52), DenseVector(-0.1, -0.2, 0.5, 0.8)) val predictedRdd = sc.parallelize(predicted) val map = new MeanAveragePrecisionEvaluator(4).evaluate(predictedRdd, actualRdd) // Expected values from running this in MATLAB val expected = DenseVector(1.0, 0.3333, 0.5, 0.3333) assert(Stats.aboutEq(map, expected, 1e-4)) } }
Example 69
Source File: MulticlassClassifierEvaluatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseMatrix import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class MulticlassClassifierEvaluatorSuite extends FunSuite with PipelineContext { test("Multiclass keystoneml.evaluation metrics") { sc = new SparkContext("local", "test") val confusionMatrix = new DenseMatrix(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1)) val labels = Array(0.0, 1.0, 2.0) val predictionAndLabels = sc.parallelize( Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val evaluator = new MulticlassClassifierEvaluator(3) val metrics = evaluator.evaluate(predictionAndLabels.map(_._1.toInt), predictionAndLabels.map(_._2.toInt) ) val delta = 0.0000001 val precision0 = 2.0 / (2 + 1) val precision1 = 3.0 / (3 + 1) val precision2 = 1.0 / (1 + 1) val recall0 = 2.0 / (2 + 2) val recall1 = 3.0 / (3 + 1) val recall2 = 1.0 / (1 + 0) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.classMetrics(0).precision - precision0) < delta) assert(math.abs(metrics.classMetrics(1).precision - precision1) < delta) assert(math.abs(metrics.classMetrics(2).precision - precision2) < delta) assert(math.abs(metrics.classMetrics(0).recall - recall0) < delta) assert(math.abs(metrics.classMetrics(1).recall - recall1) < delta) assert(math.abs(metrics.classMetrics(2).recall - recall2) < delta) assert(math.abs(metrics.classMetrics(0).fScore() - f1measure0) < delta) assert(math.abs(metrics.classMetrics(1).fScore() - f1measure1) < delta) assert(math.abs(metrics.classMetrics(2).fScore() - f1measure2) < delta) assert(math.abs(metrics.classMetrics(0).fScore(2.0) - f2measure0) < delta) assert(math.abs(metrics.classMetrics(1).fScore(2.0) - f2measure1) < delta) assert(math.abs(metrics.classMetrics(2).fScore(2.0) - f2measure2) < delta) assert(math.abs(metrics.microRecall - (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta) assert(math.abs(metrics.microRecall - metrics.microPrecision) < delta) assert(math.abs(metrics.microRecall - metrics.microFScore()) < delta) assert(math.abs(metrics.macroPrecision - (precision0 + precision1 + precision2) / 3.0) < delta) assert(math.abs(metrics.macroRecall - (recall0 + recall1 + recall2) / 3.0) < delta) assert(math.abs(metrics.macroFScore() - (f1measure0 + f1measure1 + f1measure2) / 3.0) < delta) assert(math.abs(metrics.macroFScore(2.0) - (f2measure0 + f2measure1 + f2measure2) / 3.0) < delta) } }
Example 70
Source File: BinaryClassifierEvaluatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class BinaryClassifierEvaluatorSuite extends FunSuite with PipelineContext { test("Multiclass keystoneml.evaluation metrics") { sc = new SparkContext("local", "test") val predictionAndLabels = sc.parallelize( Seq.fill(6)((true, true)) ++ Seq.fill(2)((false, true)) ++ Seq.fill(1)((true, false)) ++ Seq.fill(3)((false, false)), 2) val metrics = BinaryClassifierEvaluator.evaluate(predictionAndLabels.map(_._1), predictionAndLabels.map(_._2)) assert(metrics.tp === 6) assert(metrics.fp === 1) assert(metrics.tn === 3) assert(metrics.fn === 2) assert(Stats.aboutEq(metrics.precision, 6.0/7.0)) assert(Stats.aboutEq(metrics.recall, 6.0/8.0)) assert(Stats.aboutEq(metrics.accuracy, 9.0/12.0)) assert(Stats.aboutEq(metrics.specificity, 3.0/4.0)) assert(Stats.aboutEq(metrics.fScore(), 2.0 * 6.0 / (2.0 * 6.0 + 2.0 + 1.0))) } }
Example 71
Source File: TestUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import java.io.{FileReader, ByteArrayInputStream} import breeze.linalg.DenseMatrix import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Rand} import edu.berkeley.cs.amplab.mlmatrix.RowPartitionedMatrix import org.apache.commons.io.IOUtils import org.apache.commons.math3.random.MersenneTwister import org.apache.spark.SparkContext import scala.io.Source import scala.util.Random def genChannelMajorArrayVectorizedImage(x: Int, y: Int, z: Int): ChannelMajorArrayVectorizedImage = { ChannelMajorArrayVectorizedImage(genData(x, y, z), ImageMetadata(x,y,z)) } def genRowColumnMajorByteArrayVectorizedImage(x: Int, y: Int, z: Int): RowColumnMajorByteArrayVectorizedImage = { RowColumnMajorByteArrayVectorizedImage(genData(x,y,z).map(_.toByte), ImageMetadata(x,y,z)) } def createRandomMatrix( sc: SparkContext, numRows: Int, numCols: Int, numParts: Int, seed: Int = 42): RowPartitionedMatrix = { val rowsPerPart = numRows / numParts val matrixParts = sc.parallelize(1 to numParts, numParts).mapPartitionsWithIndex { (index, part) => val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed+index))) Iterator(DenseMatrix.rand(rowsPerPart, numCols, Gaussian(0.0, 1.0)(randBasis))) } RowPartitionedMatrix.fromMatrix(matrixParts.cache()) } def createLocalRandomMatrix(numRows: Int, numCols: Int, seed: Int = 42): DenseMatrix[Double] = { val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed))) DenseMatrix.rand(numRows, numCols, Gaussian(0.0, 1.0)(randBasis)) } }
Example 72
Source File: MatrixUtilsSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import org.scalatest.FunSuite import breeze.linalg._ import breeze.stats._ import org.apache.spark.SparkContext import keystoneml.pipelines._ import keystoneml.workflow.PipelineContext class MatrixUtilsSuite extends FunSuite with PipelineContext { test("computeMean works correctly") { val numRows = 1000 val numCols = 32 val numParts = 4 sc = new SparkContext("local", "test") val in = DenseMatrix.rand(numRows, numCols) val inArr = MatrixUtils.matrixToRowArray(in) val rdd = sc.parallelize(inArr, numParts).mapPartitions { iter => Iterator.single(MatrixUtils.rowsToMatrix(iter)) } val expected = mean(in(::, *)).t val actual = MatrixUtils.computeMean(rdd) assert(Stats.aboutEq(expected, actual, 1e-6)) } }
Example 73
Source File: EstimatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.pipelines.Logging class EstimatorSuite extends FunSuite with PipelineContext with Logging { test("Estimator fit RDD") { sc = new SparkContext("local", "test") val intEstimator = new Estimator[Int, Int] { def fit(data: RDD[Int]): Transformer[Int, Int] = { val first = data.first() Transformer(x => x + first) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(trainData) assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 32, 58 + 32, 61 + 32)) } test("Estimator fit Pipeline Data") { sc = new SparkContext("local", "test") val transformer = Transformer[Int, Int](_ * 2) val intEstimator = new Estimator[Int, Int] { def fit(data: RDD[Int]): Transformer[Int, Int] = { val first = data.first() Transformer(x => x + first) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(transformer(trainData)) assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 64, 58 + 64, 61 + 64)) } }
Example 74
Source File: LabelEstimatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.pipelines.Logging class LabelEstimatorSuite extends FunSuite with PipelineContext with Logging { test("LabelEstimator fit RDD") { sc = new SparkContext("local", "test") val intEstimator = new LabelEstimator[Int, Int, String] { def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = { val first = data.first() val label = labels.first().hashCode Transformer(x => x + first + label) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432")) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(trainData, trainLabels) val offset = 32 + "sjkfdl".hashCode assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset)) } test("LabelEstimator fit pipeline data") { sc = new SparkContext("local", "test") val dataTransformer = Transformer[Int, Int](_ * 2) val labelTransformer = Transformer[String, String](_ + "hi") val intEstimator = new LabelEstimator[Int, Int, String] { def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = { val first = data.first() val label = labels.first().hashCode Transformer(x => x + first + label) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432")) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(dataTransformer(trainData), labelTransformer(trainLabels)) val offset = 64 + "sjkfdlhi".hashCode assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset)) } }
Example 75
Source File: KMeansPlusPlusSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines._ import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class KMeansPlusPlusSuite extends FunSuite with PipelineContext with Logging { test("K-Means++ Single Center") { sc = new SparkContext("local", "test") val k = 1 val data = sc.parallelize(Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0) )) val center = DenseVector[Double](1.0, 3.0, 4.0).asDenseMatrix val kMeans = KMeansPlusPlusEstimator(k, maxIterations = 1).fit(data) assert(Stats.aboutEq(kMeans.means, center)) val kMeans10 = KMeansPlusPlusEstimator(k, maxIterations = 10).fit(data) assert(Stats.aboutEq(kMeans.means, center)) val out = kMeans.apply(data).collect() } test("K-Means++ Two Centers") { sc = new SparkContext("local", "test") val k = 2 val data = sc.parallelize(Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0), DenseVector[Double](1.0, 1.0, 0.0) )) val centers = Set( DenseVector[Double](1.0, 2.0, 0.0), DenseVector[Double](1.0, 3.0, 6.0) ) val kMeans = KMeansPlusPlusEstimator(k, maxIterations = 10).fit(data) val fitCenters = MatrixUtils.matrixToRowArray(kMeans.means).toSet assert(fitCenters === centers ) val kMeans5 = KMeansPlusPlusEstimator(k, maxIterations = 5).fit(data) val fitCenters5 = MatrixUtils.matrixToRowArray(kMeans5.means).toSet assert(fitCenters5 === centers ) val out = kMeans.apply(data).collect() } test("K-Means Transformer") { sc = new SparkContext("local", "test") val data = Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0), DenseVector[Double](1.0, 1.0, 0.0) ) val centers = MatrixUtils.rowsToMatrix(Array( DenseVector[Double](1.0, 2.0, 0.0), DenseVector[Double](1.0, 3.0, 6.0) )) val clusterOne = DenseVector[Double](1.0, 0.0) val clusterTwo = DenseVector[Double](0.0, 1.0) val assignments = Seq(clusterTwo, clusterOne, clusterTwo, clusterOne) val kMeans = KMeansModel(centers) // Test Single Apply assert(kMeans.apply(DenseVector[Double](1.0, 3.0, 0.0)) === clusterOne) assert(kMeans.apply(DenseVector[Double](1.0, 1.0, 0.0)) === clusterOne) assert(kMeans.apply(DenseVector[Double](1.0, 2.0, 6.0)) === clusterTwo) assert(kMeans.apply(DenseVector[Double](1.0, 4.0, 6.0)) === clusterTwo) // Test Matrix Apply assert(kMeans.apply(MatrixUtils.rowsToMatrix(data)) === MatrixUtils.rowsToMatrix(assignments)) // Test RDD Apply assert(kMeans.apply(sc.parallelize(data)).collect().toSeq === assignments) } }
Example 76
Source File: KernelModelSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext import keystoneml.utils.{MatrixUtils, Stats} class KernelModelSuite extends FunSuite with PipelineContext { test("KernelModel XOR test") { sc = new SparkContext("local", "test") val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) val xRDD = sc.parallelize(x, 2) val yRDD = sc.parallelize(y, 2) val xTestRDD = sc.parallelize(xTest, 2) val gaussian = new GaussianKernelGenerator(10) // Set block size to number of data points so no blocking happens val clf = new KernelRidgeRegression(gaussian, 0, 4, 2) val kernelModel = clf.fit(xRDD, yRDD) val yHat = kernelModel(xTestRDD).collect() // Fit should be good val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) delta :*= delta println("SUM OF DELTA1 " + sum(delta)) assert(Stats.aboutEq(sum(delta), 0, 1e-4)) } test("KernelModel XOR blocked test") { sc = new SparkContext("local", "test") val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) val xRDD = sc.parallelize(x, 2) val yRDD = sc.parallelize(y, 2) val xTestRDD = sc.parallelize(xTest, 2) val gaussian = new GaussianKernelGenerator(10) // Set block size to half number of data points so blocking happens val clf = new KernelRidgeRegression(gaussian, 0, 2, 2) val kernelModel = clf.fit(xRDD, yRDD) val yHat = kernelModel(xTestRDD).collect() // Fit should be good val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) delta :*= delta assert(Stats.aboutEq(sum(delta), 0, 1e-4)) } }
Example 77
Source File: BlockLinearMapperSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg.{DenseVector, DenseMatrix} import breeze.stats.distributions.Rand import keystoneml.workflow.PipelineContext import scala.collection.mutable.ArrayBuffer import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines._ import keystoneml.utils.Stats class BlockLinearMapperSuite extends FunSuite with PipelineContext with Logging { test("BlockLinearMapper transformation") { sc = new SparkContext("local", "test") val inDims = 1000 val outDims = 100 val numChunks = 5 val numPerChunk = inDims/numChunks val mat = DenseMatrix.rand(inDims, outDims, Rand.gaussian) val vec = DenseVector.rand(inDims, Rand.gaussian) val intercept = DenseVector.rand(outDims, Rand.gaussian) val splitVec = (0 until numChunks).map(i => vec((numPerChunk*i) until (numPerChunk*i + numPerChunk))) val splitMat = (0 until numChunks).map(i => mat((numPerChunk*i) until (numPerChunk*i + numPerChunk), ::)) val linearMapper = new LinearMapper[DenseVector[Double]](mat, Some(intercept)) val blockLinearMapper = new BlockLinearMapper(splitMat, numPerChunk, Some(intercept)) val linearOut = linearMapper(vec) // Test with intercept assert(Stats.aboutEq(blockLinearMapper(vec), linearOut, 1e-4)) // Test the apply and evaluate call val blmOuts = new ArrayBuffer[RDD[DenseVector[Double]]] val splitVecRDDs = splitVec.map { vec => sc.parallelize(Seq(vec), 1) } blockLinearMapper.applyAndEvaluate(splitVecRDDs, (predictedValues: RDD[DenseVector[Double]]) => { blmOuts += predictedValues () } ) // The last blmOut should match the linear mapper's output assert(Stats.aboutEq(blmOuts.last.collect()(0), linearOut, 1e-4)) } }
Example 78
Source File: LinearMapperSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import edu.berkeley.cs.amplab.mlmatrix.RowPartitionedMatrix import keystoneml.nodes.stats.StandardScaler import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.{TestUtils, MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class LinearMapperSuite extends FunSuite with PipelineContext with Logging { test("Solve and apply a linear system") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 128, 5, 4) val x = DenseVector(5.0, 4.0, 3.0, 2.0, -1.0).toDenseMatrix val b = A.mapPartitions(part => part * x.t) val Aary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val bary = b.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val mapper = new LinearMapEstimator().fit(Aary, bary) assert(Stats.aboutEq(mapper.x, x.t), "Coefficients from the solve must match the hand-created model.") val point = DenseVector(2.0, -3.0, 2.0, 3.0, 5.0) assert(Stats.aboutEq(mapper(sc.parallelize(Seq(point))).first()(0), 5.0), "Linear model applied to a point should be 5.0") val bt = mapper(Aary) assert(Stats.aboutEq(bt.collect()(0), bary.collect()(0)), "Linear model applied to input should be the same as training points.") } test("LocalLeastSquaresEstimator doesn't crash") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 50, 400, 4) val x = DenseVector(5.0, 4.0, 3.0, 2.0, -1.0).toDenseMatrix val b = A.mapPartitions(part => DenseMatrix.rand(part.rows, 3)) val Aary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val bary = b.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val mapper = new LocalLeastSquaresEstimator(1e-2).fit(Aary, bary) assert(mapper.x.rows === 400) assert(mapper.x.cols === 3) } test("Solve a dense linear system (fit intercept) using local least squares") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 128, 5, 4) val x = DenseMatrix((5.0, 4.0, 3.0, 2.0, -1.0), (3.0, -1.0, 2.0, -2.0, 1.0)) val dataMean = DenseVector(1.0, 0.0, 1.0, 2.0, 0.0) val extraBias = DenseVector(3.0, 4.0) val initialAary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val meanScaler = new StandardScaler(normalizeStdDev = false).fit(initialAary) val Aary = meanScaler.apply(initialAary).map(_ + dataMean) val bary = Aary.map(a => (x * (a - dataMean)) + extraBias) val mapper = new LocalLeastSquaresEstimator(0).fit(Aary, bary) val trueResult = MatrixUtils.rowsToMatrix(bary.collect()) val solverResult = MatrixUtils.rowsToMatrix(mapper(Aary).collect()) assert(Stats.aboutEq(trueResult, solverResult, 1e-5), "Results from the solve must match the hand-created model.") assert(Stats.aboutEq(mapper.x, x.t, 1e-6), "Model weights from the solve must match the hand-created model.") assert(Stats.aboutEq(mapper.bOpt.get, extraBias, 1e-6), "Learned intercept must match the hand-created model.") assert(Stats.aboutEq(mapper.featureScaler.get.mean, dataMean, 1e-6), "Learned intercept must match the hand-created model.") } }
Example 79
Source File: LinearDiscriminantAnalysisSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats.distributions.{Multinomial, Uniform, Gaussian} import keystoneml.nodes.stats.StandardScaler import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.{TestUtils, MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class LinearDiscriminantAnalysisSuite extends FunSuite with PipelineContext with Logging { test("Solve Linear Discriminant Analysis on the Iris Dataset") { sc = new SparkContext("local", "test") // Uses the Iris flower dataset val irisData = sc.parallelize(TestUtils.loadFile("iris.data")) val trainData = irisData.map(_.split(",").dropRight(1).map(_.toDouble)).map(new DenseVector(_)) val features = new StandardScaler().fit(trainData).apply(trainData) val labels = irisData.map(_ match { case x if x.endsWith("Iris-setosa") => 1 case x if x.endsWith("Iris-versicolor") => 2 case x if x.endsWith("Iris-virginica") => 3 }) val lda = new LinearDiscriminantAnalysis(2) val out = lda.fit(features, labels) // Correct output taken from http://sebastianraschka.com/Articles/2014_python_lda.html#introduction logInfo(s"\n${out.x}") val majorVector = DenseVector(-0.1498, -0.1482, 0.8511, 0.4808) val minorVector = DenseVector(0.0095, 0.3272, -0.5748, 0.75) // Note that because eigenvectors can be reversed and still valid, we allow either direction assert(Stats.aboutEq(out.x(::, 0), majorVector, 1E-4) || Stats.aboutEq(out.x(::, 0), majorVector * -1.0, 1E-4)) assert(Stats.aboutEq(out.x(::, 1), minorVector, 1E-4) || Stats.aboutEq(out.x(::, 1), minorVector * -1.0, 1E-4)) } test("Check LDA output for a diagonal covariance") { sc = new SparkContext("local", "test") val matRows = 1000 val matCols = 10 val dimRed = 5 // Generate a random Gaussian matrix. val gau = new Gaussian(0.0, 1.0) val randMatrix = new DenseMatrix(matRows, matCols, gau.sample(matRows*matCols).toArray) // Parallelize and estimate the LDA. val data = sc.parallelize(MatrixUtils.matrixToRowArray(randMatrix)) val labels = data.map(x => Multinomial(DenseVector(0.2, 0.2, 0.2, 0.2, 0.2)).draw(): Int) val lda = new LinearDiscriminantAnalysis(dimRed).fit(data, labels) // Apply LDA to the input data. val redData = lda(data) val redMat = MatrixUtils.rowsToMatrix(redData.collect) // Compute its covariance. val redCov = cov(redMat) log.info(s"Covar\n$redCov") // The covariance of the dimensionality reduced matrix should be diagonal. for ( x <- 0 until dimRed; y <- 0 until dimRed if x != y ) { assert(Stats.aboutEq(redCov(x,y), 0.0, 1e-6), s"LDA Matrix should be 0 off-diagonal. $x,$y = ${redCov(x,y)}") } } }
Example 80
Source File: TermFrequencySuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.misc import keystoneml.nodes.stats.TermFrequency import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class TermFrequencySuite extends FunSuite with PipelineContext { test("term frequency of simple strings") { sc = new SparkContext("local", "test") val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) val out = TermFrequency().apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> 2, "b" -> 4, "c" -> 1)) } test("term frequency of varying types") { sc = new SparkContext("local", "test") val in = Seq(Seq("b", "a", "c", ("b", "b"), ("b", "b"), 12, 12, "a", "b", 12)) val out = TermFrequency().apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> 2, "b" -> 2, "c" -> 1, ("b", "b") -> 2, 12 -> 3)) } test("log term frequency") { sc = new SparkContext("local", "test") val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) val out = TermFrequency(x => math.log(x + 1)).apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> math.log(3), "b" -> math.log(5), "c" -> math.log(2))) } }
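The TermFrequency node exercised above boils down to counting occurrences per token and optionally rescaling each count. A minimal sketch of the same computation on a plain Scala collection; the helper name is illustrative and not part of KeystoneML:

// Count occurrences of each token, then apply an optional rescaling function.
def termFrequency[T](tokens: Seq[T], f: Double => Double = identity): Map[T, Double] =
  tokens.groupBy(identity).map { case (token, occurrences) =>
    token -> f(occurrences.size.toDouble)
  }

termFrequency(Seq("b", "a", "c", "b", "b", "a", "b"))
// Map(b -> 4.0, a -> 2.0, c -> 1.0)

termFrequency(Seq("b", "a", "c", "b", "b", "a", "b"), x => math.log(x + 1))
// log-scaled counts, matching the third test above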
Example 81
Source File: SparseFeatureVectorizerSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.misc import keystoneml.nodes.util.{SparseFeatureVectorizer, AllSparseFeatures, CommonSparseFeatures} import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.workflow.PipelineContext class SparseFeatureVectorizerSuite extends FunSuite with PipelineContext with Logging { test("sparse feature vectorization") { sc = new SparkContext("local", "test") val featureVectorizer = new SparseFeatureVectorizer(Map("First" -> 0, "Second" -> 1, "Third" -> 2)) val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) val vector = featureVectorizer.apply(sc.parallelize(Seq(test))).first() assert(vector.size == 3) assert(vector(0) == 1) assert(vector(1) == 0) assert(vector(2) == 4) } test("all sparse feature selection") { sc = new SparkContext("local", "test") val train = sc.parallelize(List(Seq(("First", 0.0), ("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.0)))) val featureVectorizer = AllSparseFeatures().fit(train.map(x => x)) // The selected features should now be "First", "Second", and "Third" val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray assert(out === Array(1.0, 0.0, 4.0)) } test("common sparse feature selection") { sc = new SparkContext("local", "test") val train = sc.parallelize(List( Seq(("First", 0.0), ("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.8)), Seq(("Third", 7.0), ("Fourth", 5.0)), Seq(("Fifth", 5.0), ("Second", 7.3)) )) val featureVectorizer = CommonSparseFeatures(2).fit(train.map(x => x)) // The selected features should now be "Second", and "Third" val test = Seq(("Third", 4.0), ("Seventh", 8.0), ("Second", 1.3), ("Fourth", 6.0), ("First", 1.0)) val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray assert(out === Array(1.3, 4.0)) } }
Example 82
Source File: LinearRectifierSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats import breeze.linalg.DenseMatrix import breeze.stats.distributions.Rand import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines._ import keystoneml.utils.{TestUtils, MatrixUtils} import keystoneml.workflow.PipelineContext class LinearRectifierSuite extends FunSuite with PipelineContext with Logging { test("Test MaxVal") { sc = new SparkContext("local", "test") val matrixParts = TestUtils.createRandomMatrix(sc, 128, 16, 4).rdd.map(_.mat) val x = matrixParts.flatMap(y => MatrixUtils.matrixToRowArray(y)) val y = x.map(r => r.forall(_ >= 0.0)) val valmaxNode = LinearRectifier() val maxy = valmaxNode.apply(x).map(r => r.forall(_ >= 0.0)) //The random matrix should *not* all be >= 0 assert(!y.reduce {(a,b) => a | b}) //The valmax'ed random matrix *should* all be >= 0. assert(maxy.reduce {(a,b) => a | b}) } }
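LinearRectifier clamps each entry of a vector from below, which the test above checks by asserting that all outputs are nonnegative. A rough single-vector equivalent in breeze, assuming a threshold of 0.0 as the test implies; the helper name is made up:

import breeze.linalg.DenseVector

// Element-wise max(x, threshold), i.e. a ReLU when threshold = 0.0.
def rectify(v: DenseVector[Double], threshold: Double = 0.0): DenseVector[Double] =
  v.map(x => math.max(x, threshold))

rectify(DenseVector(-1.5, 0.0, 2.5))  // DenseVector(0.0, 0.0, 2.5)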
Example 83
Source File: PaddedFFTSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class PaddedFFTSuite extends FunSuite with PipelineContext with Logging { test("Test PaddedFFT node") { sc = new SparkContext("local", "test") // Set up a test matrix. val ones = DenseVector.zeros[Double](100) val twos = DenseVector.zeros[Double](100) ones(0) = 1.0 twos(2) = 1.0 val x = sc.parallelize(Seq(twos, ones)) val fftd = PaddedFFT().apply(x).collect() val twosout = fftd(0) val onesout = fftd(1) // Proof by agreement w/ R: Re(fft(c(0, 0, 1, rep(0, 125)))) assert(twosout.length === 64) assert(Stats.aboutEq(twosout(0), 1.0)) assert(Stats.aboutEq(twosout(16), 0.0)) assert(Stats.aboutEq(twosout(32), -1.0)) assert(Stats.aboutEq(twosout(48), 0.0)) // Proof by agreement w/ R: Re(fft(c(1, rep(0, 127)))) assert(Stats.aboutEq(onesout, DenseVector.ones[Double](64))) } }
Example 84
Source File: CoreNLPFeatureExtractorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.workflow.PipelineContext class CoreNLPFeatureExtractorSuite extends FunSuite with PipelineContext with Logging { test("lemmatization") { sc = new SparkContext("local", "test") val text = "jumping snakes lakes oceans hunted" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure at least very simple cases were lemmatized assert(tokens.contains("jump")) assert(tokens.contains("snake")) assert(tokens.contains("lake")) assert(tokens.contains("ocean")) assert(tokens.contains("hunt")) // Assert the unlemmatized tokens are no longer there assert(!tokens.contains("jumping")) assert(!tokens.contains("snakes")) assert(!tokens.contains("oceans")) assert(!tokens.contains("lakes")) assert(!tokens.contains("hunted")) } test("entity extraction") { sc = new SparkContext("local", "test") val text = "John likes cake and he lives in Florida" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure at least very simple entities were identified and extracted assert(tokens.contains("PERSON")) assert(tokens.contains("LOCATION")) // Assert the original tokens are no longer there assert(!tokens.contains("John")) assert(!tokens.contains("Florida")) } test("1-2-3-grams") { sc = new SparkContext("local", "test") val text = "a b c d" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure expected unigrams appear assert(tokens.contains("a")) assert(tokens.contains("b")) assert(tokens.contains("c")) assert(tokens.contains("d")) // Make sure expected bigrams appear assert(tokens.contains("a b")) assert(tokens.contains("b c")) assert(tokens.contains("c d")) // Make sure expected 3-grams appear assert(tokens.contains("a b c")) assert(tokens.contains("b c d")) } }
Example 85
Source File: StringUtilsSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class StringUtilsSuite extends FunSuite with PipelineContext { val stringToManip = Array(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! ") test("trim") { sc = new SparkContext("local", "test") val out = Trim.apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq("The quick BROWN fo.X", "! !.,)JumpeD. ovER the LAZy DOG.. !")) } test("lower case") { sc = new SparkContext("local", "test") val out = LowerCase().apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq(" the quick brown fo.x ", " ! !.,)jumped. over the lazy dog.. ! ")) } test("tokenizer") { sc = new SparkContext("local", "test") val out = Tokenizer().apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq(Seq("", "The", "quick", "BROWN", "fo", "X"), Seq("", "JumpeD", "ovER", "the", "LAZy", "DOG"))) } }
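The Trim, LowerCase, and Tokenizer nodes above are thin wrappers over per-record string operations. A rough sketch with plain RDD transformations, assuming a live SparkContext named sc; the split regex is only an approximation of the tokenizer the expected output suggests:

val strings = sc.parallelize(Seq(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! "), 1)

val trimmed   = strings.map(_.trim)                          // Trim
val lowered   = strings.map(_.toLowerCase)                   // LowerCase
val tokenized = strings.map(_.split("[^a-zA-Z0-9]+").toSeq)  // approximate Tokenizer

tokenized.collect().foreach(println)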
Example 86
Source File: TopKClassifierSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class TopKClassifierSuite extends FunSuite with PipelineContext { test("top k classifier, k <= vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(2).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3)) assert(TopKClassifier(4).apply(DenseVector(Double.MinValue, Double.MaxValue, 12.0, 11.0, 10.0)) === Array(1, 2, 3, 4)) assert(TopKClassifier(3).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } test("top k classifier, k > vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(5).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3, 0, 2)) assert(TopKClassifier(2).apply(DenseVector(Double.MinValue)) === Array(0)) assert(TopKClassifier(20).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } }
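TopKClassifier returns the indices of the k largest entries of a vector, largest first, capped at the vector length. A small sketch of that selection on a breeze DenseVector; the helper name is illustrative:

import breeze.linalg.DenseVector

// Indices of the k largest values in descending order of value.
def topK(v: DenseVector[Double], k: Int): Array[Int] =
  v.toArray.zipWithIndex.sortBy { case (value, _) => -value }.take(k).map(_._2)

topK(DenseVector(-10.0, 42.4, -43.0, 23.0), 2)   // Array(1, 3)
topK(DenseVector(3.0, -23.2, 2.99), 20)          // Array(0, 2, 1), k capped at length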
Example 87
Source File: VOCLoaderSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.TestUtils import keystoneml.workflow.PipelineContext class VOCLoaderSuite extends FunSuite with PipelineContext { test("load a sample of VOC data") { sc = new SparkContext("local", "test") val dataPath = TestUtils.getTestResourceFileName("images/voc") val labelsPath = TestUtils.getTestResourceFileName("images/voclabels.csv") val imgs = VOCLoader(sc, VOCDataPath(dataPath, "VOCdevkit/VOC2007/JPEGImages/", Some(1)), VOCLabelPath(labelsPath)).collect() // We should have 10 images assert(imgs.length === 10) // There should be one file whose name ends with "000104.jpg" val personMonitor = imgs.filter(_.filename.get.endsWith("000104.jpg")) assert(personMonitor.length === 1) // It should have two labels, 14 and 19. assert(personMonitor(0).label.contains(14) && personMonitor(0).label.contains(19)) // There should be 13 labels total and 9 should be distinct. assert(imgs.map(_.label).flatten.length === 13) assert(imgs.map(_.label).flatten.distinct.length === 9) } }
Example 88
Source File: ImageNetLoaderSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.TestUtils import keystoneml.workflow.PipelineContext class ImageNetLoaderSuite extends FunSuite with PipelineContext { test("load a sample of imagenet data") { sc = new SparkContext("local", "test") val dataPath = TestUtils.getTestResourceFileName("images/imagenet") val labelsPath = TestUtils.getTestResourceFileName("images/imagenet-test-labels") val imgs = ImageNetLoader.apply(sc, dataPath, labelsPath).collect() // We should have 5 images assert(imgs.length === 5) // The images should all have label 12 assert(imgs.map(_.label).distinct.length === 1) assert(imgs.map(_.label).distinct.head === 12) // The image filenames should begin with n15075141 assert(imgs.forall(_.filename.get.startsWith("n15075141")), "Image filenames should be correct") } }
Example 89
Source File: StupidBackoffSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.nlp import keystoneml.nodes.nlp._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext import scala.collection.JavaConverters._ class StupidBackoffSuite extends FunSuite with PipelineContext { val data = Seq("Winter is coming", "Finals are coming", "Summer is coming really soon") def featurizer(orders: Seq[Int], mode: NGramsCountsMode.Value = NGramsCountsMode.Default) = { def feat(data: RDD[String]) = { NGramsCounts[String](mode).apply( (Tokenizer() andThen NGramsFeaturizer[String](orders)).apply(data).get) } feat _ } def requireNGramColocation[T, V]( ngrams: RDD[(NGram[T], V)], indexer: BackoffIndexer[T, NGram[T]]) = { ngrams.mapPartitions { part => val map = new java.util.HashMap[NGram[T], V]().asScala part.foreach { case (ngramId, count) => map.put(ngramId, count) } map.keySet.foreach { ngramId => var currNGram = ngramId while (indexer.ngramOrder(currNGram) > 2) { val context = indexer.removeCurrentWord(currNGram) require(map.contains(context), s"ngram $currNGram is not co-located with its context $context within same partition") currNGram = context } } Iterator.empty }.count() } test("end-to-end InitialBigramPartitioner") { sc = new SparkContext("local[4]", "StupidBackoffSuite") val corpus = sc.parallelize(data, 3) val ngrams = featurizer(2 to 5, NGramsCountsMode.NoAdd)(corpus) val unigrams = featurizer(1 to 1)(corpus) .collectAsMap() .map { case (key, value) => key.words(0) -> value } val stupidBackoff = StupidBackoffEstimator[String](unigrams).fit(ngrams) requireNGramColocation[String, Double](stupidBackoff.scoresRDD, new NGramIndexerImpl) } test("Stupid Backoff calculates correct scores") { sc = new SparkContext("local[4]", "StupidBackoffSuite") val corpus = sc.parallelize(data, 3) val ngrams = featurizer(2 to 5, NGramsCountsMode.NoAdd)(corpus) val unigrams = featurizer(1 to 1)(corpus) .collectAsMap() .map { case (key, value) => key.words(0) -> value } val lm = StupidBackoffEstimator[String](unigrams).fit(ngrams) assert(lm.score(new NGram(Seq("is", "coming"))) === 2.0 / 2.0) assert(lm.score(new NGram(Seq("is", "coming", "really"))) === 1.0 / 2.0) assert(lm.score(new NGram(Seq("is", "unseen-coming"))) === 0, "not equal to expected: bacoffed once & curr word unseen, so should be zero") assert(lm.score(new NGram(Seq("is-unseen", "coming"))) === lm.alpha * 3.0 / lm.numTokens, "not equal to expected: backoffed once, should be alpha * currWordCount / numTokens") } }
Example 90
Source File: HiSpeedRead.scala From spark-db2 with Apache License 2.0 | 5 votes |
import com.ibm.spark.ibmdataserver.Constants import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object HiSpeedRead { def main(args: Array[String]) { val DB2_CONNECTION_URL = "jdbc:db2://localhost:50700/sample:traceFile=C:\\1.txt;" val conf = new SparkConf().setMaster("local[2]").setAppName("read test") val sparkContext = new SparkContext(conf) val sqlContext = new SQLContext(sparkContext) Class.forName("com.ibm.db2.jcc.DB2Driver") val jdbcRdr = sqlContext.read.format("com.ibm.spark.ibmdataserver") .option("url", DB2_CONNECTION_URL) // .option(Constants.TABLE, tableName) .option("user", "pallavipr") .option("password", "9manjari") .option("dbtable", "employee") .load() jdbcRdr.show() } }
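The read above goes through IBM's dedicated DataSource. The same shape also works against Spark SQL's generic JDBC source; a hedged sketch in which the URL, driver class, credentials, and table name are placeholders, and the JDBC driver jar is assumed to be on the classpath:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("jdbc read"))
val sqlContext = new SQLContext(sc)

val df = sqlContext.read.format("jdbc")
  .option("url", "jdbc:db2://localhost:50700/sample")   // placeholder connection URL
  .option("driver", "com.ibm.db2.jcc.DB2Driver")        // placeholder driver class
  .option("dbtable", "employee")                        // placeholder table
  .option("user", "someUser")
  .option("password", "somePassword")
  .load()

df.show()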
Example 91
Source File: MultiZippedPartitionRDD.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.reflect.ClassTag private[spark] class MultiZippedPartitionsRDD[A: ClassTag, V: ClassTag]( sc: SparkContext, var f: (List[Iterator[A]]) => Iterator[V], var rddList: List[RDD[A]], preservesPartitioning: Boolean = false) extends ZippedPartitionsBaseRDD[V](sc, rddList, preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions val iterList = rddList.zipWithIndex.map{ case (rdd: RDD[A], index: Int) => rdd.iterator(partitions(index), context) } f(iterList) } override def clearDependencies() { super.clearDependencies() rddList = null f = null } }
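MultiZippedPartitionsRDD generalizes partition-wise zipping to an arbitrary list of RDDs. For the common two-RDD case the public RDD.zipPartitions API already covers this; a small sketch assuming a live SparkContext named sc:

// Both RDDs must have the same number of partitions.
val a = sc.parallelize(1 to 8, numSlices = 4)
val b = sc.parallelize(101 to 108, numSlices = 4)

// Combine corresponding partitions with one function over their iterators.
val sums = a.zipPartitions(b) { (left, right) =>
  left.zip(right).map { case (x, y) => x + y }
}

sums.collect()   // Array(102, 104, ..., 116)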
Example 92
Source File: ConcurrentHiveSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.sql.hive.test.TestHiveContext class ConcurrentHiveSuite extends SparkFunSuite with BeforeAndAfterAll { ignore("multiple instances not supported") { test("Multiple Hive Instances") { (1 to 10).map { i => val conf = new SparkConf() conf.set("spark.ui.enabled", "false") val ts = new TestHiveContext(new SparkContext("local", s"TestSQLContext$i", conf)) ts.sparkSession.sql("SHOW TABLES").collect() ts.sparkSession.sql("SELECT * FROM src").collect() ts.sparkSession.sql("SHOW TABLES").collect() } } } }
Example 93
Source File: HiveContextCompatibilitySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.scalatest.BeforeAndAfterEach import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach { override protected val enableAutoThreadAudit = false private var sc: SparkContext = null private var hc: HiveContext = null override def beforeAll(): Unit = { super.beforeAll() sc = SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("test")) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true).foreach { case (k, v) => sc.hadoopConfiguration.set(k, v) } hc = new HiveContext(sc) } override def afterEach(): Unit = { try { hc.sharedState.cacheManager.clearCache() hc.sessionState.catalog.reset() } finally { super.afterEach() } } override def afterAll(): Unit = { try { sc = null hc = null } finally { super.afterAll() } } test("basic operations") { val _hc = hc import _hc.implicits._ val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x") val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c") .select($"a", $"b") .filter($"a" > 10 && $"b" > 6 && $"c") val df3 = df1.join(df2, "a") val res = df3.collect() val expected = Seq((18, 18, 8)).toDF("a", "x", "b").collect() assert(res.toSeq == expected.toSeq) df3.createOrReplaceTempView("mai_table") val df4 = hc.table("mai_table") val res2 = df4.collect() assert(res2.toSeq == expected.toSeq) } test("basic DDLs") { val _hc = hc import _hc.implicits._ val databases = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases.toSeq == Seq("default")) hc.sql("CREATE DATABASE mee_db") hc.sql("USE mee_db") val databases2 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases2.toSet == Set("default", "mee_db")) val df = (1 to 10).map { i => ("bob" + i.toString, i) }.toDF("name", "age") df.createOrReplaceTempView("mee_table") hc.sql("CREATE TABLE moo_table (name string, age int)") hc.sql("INSERT INTO moo_table SELECT * FROM mee_table") assert( hc.sql("SELECT * FROM moo_table order by name").collect().toSeq == df.collect().toSeq.sortBy(_.getString(0))) val tables = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables.toSet == Set("moo_table", "mee_table")) hc.sql("DROP TABLE moo_table") hc.sql("DROP TABLE mee_table") val tables2 = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables2.isEmpty) hc.sql("USE default") hc.sql("DROP DATABASE mee_db CASCADE") val databases3 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases3.toSeq == Seq("default")) } }
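The suite above obtains its context through SparkContext.getOrCreate, which returns the already-running context if there is one and otherwise builds a new one from the given conf. A minimal sketch of that idiom:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("local").setAppName("test")

// The first call creates the context; later calls return the same instance
// and ignore the conf, because only one SparkContext may be active per JVM.
val sc1 = SparkContext.getOrCreate(conf)
val sc2 = SparkContext.getOrCreate(conf)
assert(sc1 eq sc2)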
Example 94
Source File: ThriftServerTab.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 95
Source File: SparkSQLEnv.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.util.Utils def stop() { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() sparkContext = null sqlContext = null } } }
Example 96
Source File: XSQLTestSparkSession.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.TestSparkSession import org.apache.spark.sql.xsql.XSQLSessionStateBuilder class XSQLTestSparkSession(sc: SparkContext) extends TestSparkSession(sc) { self => def this(sparkConf: SparkConf) { this( new SparkContext( "local[2]", "test-sql-context", sparkConf.set("spark.sql.testkey", "true").set(CATALOG_IMPLEMENTATION, "xsql"))) } @transient override lazy val sessionState: SessionState = { new XSQLSessionStateBuilder(this, None).build() } }
Example 97
Source File: SparkPlanner.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileSourceStrategy} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy import org.apache.spark.sql.internal.SQLConf class SparkPlanner( val sparkContext: SparkContext, val conf: SQLConf, val experimentalMethods: ExperimentalMethods) extends SparkStrategies { def numPartitions: Int = conf.numShufflePartitions override def strategies: Seq[Strategy] = experimentalMethods.extraStrategies ++ extraPlanningStrategies ++ ( PythonEvals :: DataSourceV2Strategy :: FileSourceStrategy :: DataSourceStrategy(conf) :: SpecialLimits :: Aggregation :: Window :: JoinSelection :: InMemoryScans :: BasicOperators :: Nil) def pruneFilterProject( projectList: Seq[NamedExpression], filterPredicates: Seq[Expression], prunePushedDownFilters: Seq[Expression] => Seq[Expression], scanBuilder: Seq[Attribute] => SparkPlan): SparkPlan = { val projectSet = AttributeSet(projectList.flatMap(_.references)) val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) val filterCondition: Option[Expression] = prunePushedDownFilters(filterPredicates).reduceLeftOption(catalyst.expressions.And) // Right now we still use a projection even if the only evaluation is applying an alias // to a column. Since this is a no-op, it could be avoided. However, using this // optimization with the current implementation would change the output schema. // TODO: Decouple final output schema from expression evaluation so this copy can be // avoided safely. if (AttributeSet(projectList.map(_.toAttribute)) == projectSet && filterSet.subsetOf(projectSet)) { // When it is possible to just use column pruning to get the right projection and // when the columns of this projection are enough to evaluate all filter conditions, // just do a scan followed by a filter, with no extra project. val scan = scanBuilder(projectList.asInstanceOf[Seq[Attribute]]) filterCondition.map(FilterExec(_, scan)).getOrElse(scan) } else { val scan = scanBuilder((projectSet ++ filterSet).toSeq) ProjectExec(projectList, filterCondition.map(FilterExec(_, scan)).getOrElse(scan)) } } }
Example 98
Source File: DataSourceRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.v2.reader.InputPartition class DataSourceRDDPartition[T : ClassTag](val index: Int, val inputPartition: InputPartition[T]) extends Partition with Serializable class DataSourceRDD[T: ClassTag]( sc: SparkContext, @transient private val inputPartitions: Seq[InputPartition[T]]) extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { inputPartitions.zipWithIndex.map { case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val reader = split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition .createPartitionReader() context.addTaskCompletionListener[Unit](_ => reader.close()) val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { if (!valuePrepared) { valuePrepared = reader.next() } valuePrepared } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } valuePrepared = false reader.get() } } new InterruptibleIterator(context, iter) } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition.preferredLocations() } }
Example 99
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
Example 100
Source File: SQLExecution.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} object SQLExecution { val EXECUTION_ID_KEY = "spark.sql.execution.id" private val _nextExecutionId = new AtomicLong(0) private def nextExecutionId: Long = _nextExecutionId.getAndIncrement private val executionIdToQueryExecution = new ConcurrentHashMap[Long, QueryExecution]() def getQueryExecution(executionId: Long): QueryExecution = { executionIdToQueryExecution.get(executionId) } private val testing = sys.props.contains("spark.testing") private[sql] def checkSQLExecutionId(sparkSession: SparkSession): Unit = { val sc = sparkSession.sparkContext // only throw an exception during tests. a missing execution ID should not fail a job. if (testing && sc.getLocalProperty(EXECUTION_ID_KEY) == null) { // Attention testers: when a test fails with this exception, it means that the action that // started execution of a query didn't call withNewExecutionId. The execution ID should be // set by calling withNewExecutionId in the action that begins execution, like // Dataset.collect or DataFrameWriter.insertInto. throw new IllegalStateException("Execution ID should be set") } } def withSQLConfPropagated[T](sparkSession: SparkSession)(body: => T): T = { val sc = sparkSession.sparkContext // Set all the specified SQL configs to local properties, so that they can be available at // the executor side. val allConfigs = sparkSession.sessionState.conf.getAllConfs val originalLocalProps = allConfigs.collect { case (key, value) if key.startsWith("spark") => val originalValue = sc.getLocalProperty(key) sc.setLocalProperty(key, value) (key, originalValue) } try { body } finally { for ((key, value) <- originalLocalProps) { sc.setLocalProperty(key, value) } } } }
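withSQLConfPropagated above works by stashing key/value pairs in the SparkContext's thread-local properties, which travel with each submitted job so executors can read them, and then restoring the previous values. A stripped-down sketch of that save/set/restore pattern, assuming a live SparkContext named sc; the helper name is made up:

def withLocalProperty[T](sc: org.apache.spark.SparkContext, key: String, value: String)(body: => T): T = {
  val previous = sc.getLocalProperty(key)   // null if the property was never set
  sc.setLocalProperty(key, value)
  try {
    body
  } finally {
    sc.setLocalProperty(key, previous)      // setting back to null clears the property
  }
}

withLocalProperty(sc, "spark.job.description", "ad-hoc query") {
  sc.parallelize(1 to 100).count()
}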
Example 101
Source File: ContinuousShuffleReadRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import java.util.UUID import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.rpc.RpcAddress import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.NextIterator case class ContinuousShuffleReadPartition( index: Int, endpointName: String, queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long) extends Partition { // Initialized only on the executor, and only once even as we call compute() multiple times. lazy val (reader: ContinuousShuffleReader, endpoint) = { val env = SparkEnv.get.rpcEnv val receiver = new RPCContinuousShuffleReader( queueSize, numShuffleWriters, epochIntervalMs, env) val endpoint = env.setupEndpoint(endpointName, receiver) TaskContext.get().addTaskCompletionListener[Unit] { ctx => env.stop(endpoint) } (receiver, endpoint) } } class ContinuousShuffleReadRDD( sc: SparkContext, numPartitions: Int, queueSize: Int = 1024, numShuffleWriters: Int = 1, epochIntervalMs: Long = 1000, val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}")) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { (0 until numPartitions).map { partIndex => ContinuousShuffleReadPartition( partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { split.asInstanceOf[ContinuousShuffleReadPartition].reader.read() } }
Example 102
Source File: TestSQLContext.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.{SessionState, SessionStateBuilder, SQLConf, WithTestConf} val overrideConfs: Map[String, String] = Map( // Fewer shuffle partitions to speed up testing. SQLConf.SHUFFLE_PARTITIONS.key -> "5") } private[sql] class TestSQLSessionStateBuilder( session: SparkSession, state: Option[SessionState]) extends SessionStateBuilder(session, state) with WithTestConf { override def overrideConfs: Map[String, String] = TestSQLContext.overrideConfs override def newBuilder: NewBuilder = new TestSQLSessionStateBuilder(_, _) }
Example 103
Source File: ExecutorNumListener.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import java.text.SimpleDateFormat import java.util import java.util.Date import java.util.concurrent.atomic.AtomicBoolean import com.fasterxml.jackson.annotation.JsonIgnore import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{ SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved } import org.apache.spark.util.kvstore.KVIndex class ExecutorNumListener extends SparkListener with Logging { lazy val kvstore = SparkContext.getActive.get.statusStore.store var initialized: AtomicBoolean = new AtomicBoolean(false) var lastPointTime: Long = new Date().getTime var recentEventTime: Long = new Date().getTime private val liveExecutors = new util.HashSet[String]() def initialize(): Unit = { SparkContext.getActive.map(_.ui).flatten.foreach { case ui => ui.attachTab(new ExecutorNumTab(ui)) ui.addStaticHandler("static", "/static/special") } } def maybeAddPoint(time: Long): Unit = { if (!initialized.get) { initialize() initialized.compareAndSet(false, true) } if (time - lastPointTime > 20 * 1000) { addPoint(recentEventTime) addPoint(time) lastPointTime = time } recentEventTime = time } def addPoint(time: Long): Unit = { val executorNum = liveExecutors.size kvstore.write(new ExecutorNumWrapper(new ExecutorNum( s"own ${executorNum} executors at ${new SimpleDateFormat("HH:mm:ss").format(new Date(time))}", IndexedSeq(time, executorNum)))) } override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { liveExecutors.add(event.executorId) maybeAddPoint(event.time) } override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { liveExecutors.remove(event.executorId) maybeAddPoint(event.time) } } private[spark] class ExecutorNumWrapper(val point: ExecutorNum) { @JsonIgnore @KVIndex def id: Long = point.value(0) } private[spark] class ExecutorNum(val name: String, val value: IndexedSeq[Long])
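ExecutorNumListener reacts to executor add and remove events delivered on the listener bus. Registering such a listener is a single call to SparkContext.addSparkListener; a hedged sketch in which the listener below only prints and is not the listener from the example:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("listener demo"))

sc.addSparkListener(new SparkListener {
  override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit =
    println(s"executor added: ${event.executorId} at ${event.time}")

  override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit =
    println(s"executor removed: ${event.executorId} at ${event.time}")
})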
Example 104
Source File: ExtendableHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ParserDialect import org.apache.spark.sql.catalyst.analysis.{Analyzer, _} import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.execution.{CacheManager, ExtractPythonUDFs} import org.apache.spark.sql.extension._ import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog @transient override protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, conf) { override val extendedResolutionRules = resolutionRules(this) ++ (catalog.ParquetConversions :: catalog.CreateTables :: catalog.PreInsertionCasts :: ExtractPythonUDFs :: ResolveHiveWindowFunction :: PreInsertCastAndRename :: Nil) override val extendedCheckRules = ExtendableHiveContext.this.extendedCheckRules(this) } @transient override protected[sql] lazy val optimizer: Optimizer = OptimizerFactory.produce( earlyBatches = optimizerEarlyBatches, mainBatchRules = optimizerMainBatchRules, postBatches = optimizerPostBatches ) @transient override protected[sql] val planner: SparkPlanner with HiveStrategies = new SparkPlanner with HiveStrategies with ExtendedPlanner { def baseStrategies(hiveContext: HiveContext): Seq[Strategy] = Seq( DataSourceStrategy, HiveCommandStrategy(self), HiveDDLStrategy, DDLStrategy, TakeOrderedAndProject, InMemoryScans, HiveTableScans, DataSinks, Scripts, Aggregation, LeftSemiJoin, EquiJoinSelection, BasicOperators, BroadcastNestedLoop, CartesianProduct, DefaultJoin ) override def strategies: Seq[Strategy] = self.strategies(this) ++ experimental.extraStrategies ++ baseStrategies(self) override val hiveContext = self } }
Example 105
Source File: SapHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.sql.{CommonSapSQLContext, SQLContext} import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} class SapHiveContext( @transient sparkContext: SparkContext, cacheManager: CacheManager, listener: SQLListener, @transient execHive: ClientWrapper, @transient metaHive: ClientInterface, isRootContext: Boolean) extends ExtendableHiveContext( sparkContext, cacheManager, listener, execHive, metaHive, isRootContext) with CommonSapSQLContext { def this(sc: SparkContext) = this(sc, new CacheManager, SQLContext.createListenerAndUI(sc), null, null, true) override def newSession(): HiveContext = new SapHiveContext( sparkContext = this.sparkContext, cacheManager = this.cacheManager, listener = this.listener, executionHive.newSession(), metadataHive.newSession(), isRootContext = false) }
Example 106
Source File: ExtendableSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extension import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ParserDialect import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.ExtractPythonUDFs import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog @transient override protected[sql] val planner = // HiveStrategies defines its own strategies, we should be back to SparkPlanner strategies new SparkPlanner with ExtendedPlanner { def baseStrategies: Seq[Strategy] = DataSourceStrategy :: DDLStrategy :: TakeOrderedAndProject :: Aggregation :: LeftSemiJoin :: EquiJoinSelection :: InMemoryScans :: BasicOperators :: BroadcastNestedLoop :: CartesianProduct :: DefaultJoin :: Nil override def strategies: Seq[Strategy] = self.strategies(this) ++ experimental.extraStrategies ++ baseStrategies } }
Example 107
Source File: BasicCurrencyConversionFunction.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.currency._ import org.apache.spark.sql.util.ValidatingPropertyMap._ import scala.util.Try protected object BasicCurrencyConversionConfig { private def updateRatesMapByTable(ratesTable: String, sqlContext: SQLContext): Unit = { val ratesTableData = sqlContext.sql(s"SELECT * FROM $ratesTable").collect() ratesTableData.foreach { row => val from = row.getString(0) val to = row.getString(1) val date = row.getString(2).replaceAll("-", "").toInt val rate = Try(row.getDecimal(3)).recover { case ex: ClassCastException => new java.math.BigDecimal(row.getDouble(3)) }.get ratesMap.put((from, to), date, rate) } } }
Example 108
Source File: SQLRunner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{Logging, SparkContext} import scala.annotation.tailrec protected[cli] case class CLIOptions( sqlFiles: List[String] = Nil, output: Option[String] = None) def main(args: Array[String]): Unit = { def fail(msg: String = USAGE): Unit = { logError(msg) System.exit(1) } val opts = parseOpts(args.toList) val outputStream: OutputStream = opts.output match { case Some(filename) => new FileOutputStream(new File(filename)) case None => System.out } opts.sqlFiles .map((string: String) => new FileInputStream(new File(string))) .foreach(sql(_, outputStream)) } }
Example 109
Source File: GlobalSapSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import com.sap.spark.util.TestUtils import com.sap.spark.{GlobalSparkContext, WithSQLContext} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast} import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ import org.scalatest.Suite import scala.io.Source trait GlobalSapSQLContext extends GlobalSparkContext with WithSQLContext { self: Suite => override implicit def sqlContext: SQLContext = GlobalSapSQLContext._sqlc override protected def setUpSQLContext(): Unit = GlobalSapSQLContext.init(sc) override protected def tearDownSQLContext(): Unit = GlobalSapSQLContext.reset() def getDataFrameFromSourceFile(sparkSchema: StructType, path: File): DataFrame = { val conversions = sparkSchema.toSeq.zipWithIndex.map({ case (field, index) => Cast(BoundReference(index, StringType, nullable = true), field.dataType) }) val data = Source.fromFile(path) .getLines() .map({ line => val stringRow = InternalRow.fromSeq(line.split(",", -1).map(UTF8String.fromString)) Row.fromSeq(conversions.map({ c => c.eval(stringRow) })) }) val rdd = sc.parallelize(data.toSeq, numberOfSparkWorkers) sqlContext.createDataFrame(rdd, sparkSchema) } } object GlobalSapSQLContext { private var _sqlc: SQLContext = _ private def init(sc: SparkContext): Unit = if (_sqlc == null) { _sqlc = TestUtils.newSQLContext(sc) } private def reset(): Unit = { if (_sqlc != null) { _sqlc.catalog.unregisterAllTables() } } }
Example 110
Source File: WithSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import com.sap.spark.util.TestUtils._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} trait WithSparkContext extends BeforeAndAfterAll { self: Suite => override def beforeAll(): Unit = { try { super.beforeAll() setUpSparkContext() } catch { case ex: Throwable => tearDownSparkContext() throw ex } } override def afterAll(): Unit = { try { super.afterAll() } finally { tearDownSparkContext() } } conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") conf.set("spark.shuffle.spill", "false") conf.set("spark.shuffle.compress", "false") conf.set("spark.ui.enabled", "false") conf.set("spark.ui.showConsoleProgress", "false") } def sc: SparkContext protected def setUpSparkContext(): Unit protected def tearDownSparkContext(): Unit }
Example 111
Source File: GlobalSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} } } object GlobalSparkContext { @transient private var _sc: SparkContext = _ def init(sparkMaster: String, sparkConf: SparkConf): Unit = { if (_sc == null) { this.synchronized { if (_sc == null) { _sc = new SparkContext(sparkMaster, "test", sparkConf) } } } } def reset(): Unit = { if (_sc != null) { _sc.cancelAllJobs() } } def close(): Unit = { if (_sc != null) { _sc.stop() _sc = null } } }
Example 112
Source File: TestUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.util import java.util.Locale import scala.io.Source import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{Row, SQLContext, SapSQLContext} import org.apache.spark.sql.hive.SapHiveContext import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table} import org.apache.spark.sql.types.StructType import org.mockito.Matchers._ import org.mockito.Mockito._ import scala.tools.nsc.io.Directory import scala.util.{Failure, Success} def parsePTestFile(fileName: String): List[(String, String, String)] = { val filePath = getFileFromClassPath(fileName) val fileContents = Source.fromFile(filePath).getLines .map(p => p.stripMargin.trim) .filter(p => !p.isEmpty && !p.startsWith("//")) // filter empty rows and comments .mkString("\n") val p = new PTestFileParser // strip semicolons from query and parsed p(fileContents) match { case Success(lines) => lines.map { case (query, parsed, expect) => (stripSemicolon(query).trim, stripSemicolon(parsed).trim, expect.trim) } case Failure(ex) => throw ex } } private def stripSemicolon(sql: String): String = if (sql.endsWith(";")) { sql.substring(0, sql.length-1) } else { sql } def withTempDirectory[A](f: Directory => A): A = { val dir = Directory.makeTemp() try { f(dir) } finally { dir.deleteIfExists() } } }
Example 113
Source File: SQLRunnerSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream} import org.apache.spark.SparkContext import org.apache.spark.sql.{GlobalSapSQLContext, SQLContext} import org.scalatest.{BeforeAndAfterEach, FunSuite, ShouldMatchers} // good call val goodOpts = SQLRunner.parseOpts(List("a.sql", "b.sql", "-o", "output.csv")) goodOpts.sqlFiles should be(List("a.sql", "b.sql")) goodOpts.output should be(Some("output.csv")) // bad call val badOpts = SQLRunner.parseOpts(List()) badOpts.sqlFiles should be(List()) badOpts.output should be(None) // ugly call val uglyOpts = SQLRunner.parseOpts(List("a.sql", "-o", "output.csv", "b.sql")) uglyOpts.sqlFiles should be(List("a.sql", "b.sql")) uglyOpts.output should be(Some("output.csv")) } def runSQLTest(input: String, expectedOutput: String): Unit = { val inputStream: InputStream = new ByteArrayInputStream(input.getBytes()) val outputStream = new ByteArrayOutputStream() SQLRunner.sql(inputStream, outputStream) val output = outputStream.toString output should be(expectedOutput) } test("can run dummy query") { val input = "SELECT 1;" val output = "1\n" runSQLTest(input, output) } test("can run multiple dummy queries") { val input = """ |SELECT 1;SELECT 2; |SELECT 3; """.stripMargin val output = "1\n2\n3\n" runSQLTest(input, output) } test("can run a basic example with tables") { val input = """ |SELECT * FROM DEMO_TABLE; |SELECT * FROM DEMO_TABLE LIMIT 1; |DROP TABLE DEMO_TABLE; """.stripMargin val output = "1,a\n2,b\n3,c\n1,a\n" runSQLTest(input, output) } test("can run an example with comments") { val input = """ |SELECT * FROM DEMO_TABLE; -- this is the first query |SELECT * FROM DEMO_TABLE LIMIT 1; |-- now let's drop a table |DROP TABLE DEMO_TABLE; """.stripMargin val output = "1,a\n2,b\n3,c\n1,a\n" runSQLTest(input, output) } }
Example 114
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 115
Source File: Preparator.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PPreparator import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.sql.DataFrame import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.sql.SQLContext import org.apache.spark.mllib.linalg.Vectors class PreparedData( val rows: DataFrame, val dsp: DataSourceParams, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ if (trainingData.dsp.useStandardScaler) { val training = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features") val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(trainingData.dsp.standardScalerWithStd).setWithMean(trainingData.dsp.standardScalerWithMean) val scalerModel = scaler.fit(training) val scaledData = scalerModel.transform(training) val s1 = scaledData.select("label","censor","scaledFeatures").withColumnRenamed("scaledFeatures","features") //Prepare old StandardScaler val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(withMean = trainingData.dsp.standardScalerWithMean, withStd = trainingData.dsp.standardScalerWithStd) val oldSSModel = oldScaler.fit(trainingData.rows.map(x=>(Vectors.dense(x._3)))) new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel) } else { new PreparedData(rows = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features"), dsp = trainingData.dsp, ssModel = null) } } }
Example 116
Source File: DataSource.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PDataSource import org.apache.predictionio.controller.EmptyEvaluationInfo import org.apache.predictionio.controller.EmptyActualResult import org.apache.predictionio.controller.Params import org.apache.predictionio.data.store.PEventStore import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger case class DataSourceParams( val appName: String, val useStandardScaler: Boolean, val standardScalerWithStd: Boolean, val standardScalerWithMean: Boolean ) extends Params class DataSource(val dsp: DataSourceParams) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, EmptyActualResult] { @transient lazy val logger = Logger[this.type] override def readTraining(sc: SparkContext): TrainingData = { println("Gathering data from event server.") val rowsRDD: RDD[(Double, Double, Array[Double])] = PEventStore.find( appName = dsp.appName, entityType = Some("row"), startTime = None, eventNames = Some(List("$set")))(sc).map { event => try { (event.properties.get[Double]("label"), event.properties.get[Double]("censor"), event.properties.get[Array[Double]]("features")) } catch { case e: Exception => { logger.error(s"Failed to convert event ${event} of. Exception: ${e}.") throw e } } } new TrainingData(rowsRDD, dsp) } } class TrainingData( val rows: RDD[(Double, Double, Array[Double])], val dsp: DataSourceParams ) extends Serializable
Example 117
Source File: SRAlgorithm.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.linalg.{Vectors,DenseVector} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.regression.{AFTSurvivalRegression,AFTSurvivalRegressionModel} case class AlgorithmParams( val quantileProbabilities: Array[Double], val fitIntercept: Boolean, val maxIter: Int, val convTolerance: Double ) extends Params class SRModel( val aAFTSRModel: AFTSurvivalRegressionModel, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel, val useStandardScaler: Boolean ) extends Serializable {} class SRAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, SRModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): SRModel = { println("Training SR model.") val aft = new AFTSurvivalRegression().setQuantileProbabilities(ap.quantileProbabilities).setQuantilesCol("quantiles").setFitIntercept(ap.fitIntercept).setMaxIter(ap.maxIter).setTol(ap.convTolerance) val model = aft.fit(data.rows) new SRModel(aAFTSRModel = model, ssModel=data.ssModel, useStandardScaler = data.dsp.useStandardScaler) } def predict(model: SRModel, query: Query): PredictedResult = { val qryRow0 = Vectors.dense(query.features) val qryRow = if (model.useStandardScaler) { model.ssModel.transform(qryRow0) } else { qryRow0 } val score = model.aAFTSRModel.predict(qryRow) val quantilesVec = model.aAFTSRModel.predictQuantiles(qryRow) PredictedResult(coefficients = model.aAFTSRModel.coefficients.toArray, intercept = model.aAFTSRModel.intercept, scale = model.aAFTSRModel.scale, prediction = score, quantiles = quantilesVec.toArray) } }
Example 118
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 119
Source File: LocalSparkContext.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package test.util import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.BeforeAndAfterEach import org.scalatest._ trait LocalSparkContext extends BeforeAndAfterEach { self: Suite => @transient private var _sc: SparkContext = _ val _sparkConf = new SparkConf(false) .set("spark.ui.showConsoleProgress", "false") def sc: SparkContext = _sc override def beforeEach() { _sc = new SparkContext("local[4]", "test", _sparkConf) super.beforeEach() } override def afterEach() { resetSparkContext() super.afterEach() } def resetSparkContext(): Unit = { LocalSparkContext.stop(_sc) _sc = null } } object LocalSparkContext { def stop(sc: SparkContext) { if (sc != null) { sc.stop() } // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") } def withSpark[T](sc: SparkContext)(f: SparkContext => T): T = { try { f(sc) } finally { stop(sc) } } }
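A hedged sketch of how the trait is intended to be mixed into a ScalaTest suite; the suite name and test body are made up for illustration:

import org.scalatest.FunSuite

class WordCountSpec extends FunSuite with LocalSparkContext {
  test("counts words with a fresh local context per test") {
    // `sc` is created in beforeEach and stopped in afterEach by the trait.
    val counts = sc.parallelize(Seq("a", "b", "a")).map((_, 1)).reduceByKey(_ + _).collectAsMap()
    assert(counts("a") == 2)
  }
}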
Example 121
Source File: ThriftRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.thrift import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.SparkContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.{TBase, TFieldIdEnum, TSerializer} class ThriftRandomExtractor extends Extractor { var count: Int = 1 var thriftType: Class[_] = null var serializer: TSerializer = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val className = userConfig.getConfigString("className") match { case Some(s) => s case None => throw new IllegalArgumentException("className must be set in the config") } thriftType = Class.forName(className) serializer = new TSerializer(new TBinaryProtocol.Factory()) count = userConfig.getConfigInt("count").getOrElse(1) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val thriftObject = ThriftRandomGenerator.next(thriftType).asInstanceOf[TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]] serializer.serialize(thriftObject) }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 122
Source File: Configuration.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop import java.io.{ ObjectInputStream, ObjectOutputStream } import org.apache.hadoop.conf import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration } import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.hammerlab.hadoop.kryo.WritableSerializer import org.hammerlab.kryo._ class Configuration(@transient var value: HadoopConfiguration) extends Serializable { private def writeObject(out: ObjectOutputStream): Unit = { value.write(out) } private def readObject(in: ObjectInputStream): Unit = { value = new HadoopConfiguration(false) value.readFields(in) } } object Configuration extends Registrar { def apply(loadDefaults: Boolean = true): Configuration = new HadoopConfiguration(loadDefaults) def apply(conf: HadoopConfiguration): Configuration = new Configuration(conf) implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration = apply(conf) implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration = conf.value implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration = confBroadcast.value implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration = sc.hadoopConfiguration implicit class Ops(val conf: HadoopConfiguration) extends AnyVal { def serializable: Configuration = conf } register( cls[conf.Configuration] → new WritableSerializer[conf.Configuration], cls[Configuration] → serializeAs[Configuration, conf.Configuration] ) }
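A short usage sketch relying only on the implicits defined above; it assumes an existing SparkContext named sc:

import org.hammerlab.hadoop.Configuration

val serializableConf: Configuration = sc.hadoopConfiguration   // wrapped via the implicit conversions above
val confBroadcast = sc.broadcast(serializableConf)              // safe to broadcast: writeObject/readObject handle serialization

sc.parallelize(1 to 2).foreach { _ =>
  // Unwrapped back to a plain Hadoop Configuration on the executor.
  val hadoopConf: org.apache.hadoop.conf.Configuration = confBroadcast.value
  // ... use hadoopConf to open a FileSystem, etc.
}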
Example 123
Source File: Histogram.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark.accumulator import org.apache.spark.SparkContext import org.apache.spark.util.AccumulatorV2 import scala.collection.immutable.SortedMap import scala.collection.mutable case class Histogram[T: Ordering](var map: mutable.Map[T, Long] = mutable.Map.empty[T, Long]) extends AccumulatorV2[T, SortedMap[T, Long]] { override def isZero: Boolean = map.isEmpty override def copy(): AccumulatorV2[T, SortedMap[T, Long]] = Histogram(map.clone()) override def reset(): Unit = map = mutable.Map.empty[T, Long] override def add(k: T): Unit = map.update( k, map.getOrElse(k, 0L) + 1 ) override def merge(other: AccumulatorV2[T, SortedMap[T, Long]]): Unit = for { (k, v) ← other.value } { map.update(k, map.getOrElse(k, 0L) + v) } override def value: SortedMap[T, Long] = SortedMap(map.toSeq: _*) } object Histogram { def apply[T: Ordering](name: String)(implicit sc: SparkContext): Histogram[T] = { val a = Histogram[T]() sc.register(a, name) a } }
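A minimal driver-side sketch, assuming an existing SparkContext named sc:

import org.apache.spark.SparkContext
import org.hammerlab.spark.accumulator.Histogram

implicit val context: SparkContext = sc            // picked up by Histogram.apply for registration
val wordLengths = Histogram[Int]("word-lengths")   // registers the accumulator under that name

sc.parallelize(Seq("spark", "rdd", "scala")).foreach(w => wordLengths.add(w.length))
println(wordLengths.value)                         // SortedMap(3 -> 1, 5 -> 2)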
Example 124
Source File: Context.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.{ SparkConf, SparkContext } import org.hammerlab.hadoop.Configuration case class Context(@transient sc: SparkContext) extends Configuration(sc.hadoopConfiguration) object Context { implicit def makeContext(sc: SparkContext): Context = Context(sc) implicit def deriveContext(implicit sc: SparkContext): Context = Context(sc) implicit def umakeContext(context: Context): SparkContext = context.sc def apply()(implicit conf: SparkConf): Context = Context( new SparkContext( conf ) ) }
Example 125
Source File: ContextTest.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import hammerlab.Suite import org.apache.spark.SparkContext class ContextTest extends Suite with ConfSuite { implicit val sc = new SparkContext(conf) def withContext(implicit ctx: Context) = {} test("derive") { // exercise implicit conversion and derivation from SparkContext withContext(sc) withContext } override protected def afterAll(): Unit = { sc.stop() super.afterAll() } }
Example 126
Source File: ContextSuite.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkContext import org.hammerlab.test.Suite trait ConfSuite extends SparkConfBase { implicit lazy val conf = makeSparkConf sparkConf( "spark.master" → s"local[4]", "spark.app.name" → getClass.getName, "spark.driver.host" → "localhost", "spark.kryo.classesToRegister" → "org.apache.spark.internal.io.FileCommitProtocol$TaskCommitMessage" ) } abstract class ContextSuite extends Suite with ConfSuite with SelfRegistrar { private var _sc: Context = _ implicit lazy val sc = { _sc = Context() _sc } implicit def sparkContext: SparkContext = sc override def afterAll(): Unit = { // Do this before the super delegation, which will remove the temporary event-log dir if (_sc != null) _sc.stop() super.afterAll() } }
Example 127
Source File: KeyPartitionerTest.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkContext import org.hammerlab.test.Suite class KeyPartitionerTest extends Suite { test("basic calls") { KeyPartitioner(456).getPartition(123) should be(123) KeyPartitioner(456).getPartition(123 → "abc") should be(123) intercept[UnexpectedKey] { KeyPartitioner(456).getPartition("abc") }.key should be("abc") } test("partitioner") { val partitioner = Partitioner[Either[Int, String]]( 2, { case Left(n) ⇒ 0 case Right(str) ⇒ 1 } ) partitioner.getPartition(Left(222)) should be(0) partitioner.getPartition(Right("abc")) should be(1) intercept[UnexpectedKey] { partitioner.getPartition(333) }.key should be(333) intercept[UnexpectedKey] { partitioner.getPartition("ddd") }.key should be("ddd") } }
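KeyPartitioner itself is not shown in this snippet; a hedged reconstruction of the behaviour implied by the assertions (Int keys and (Int, _) tuples map to their own index, anything else raises UnexpectedKey) might look like the following. This is inferred from the test only, not copied from the library's source:

import org.apache.spark.Partitioner

case class UnexpectedKey(key: Any) extends IllegalArgumentException(s"unexpected key: $key")

case class KeyPartitioner(numPartitions: Int) extends Partitioner {
  override def getPartition(key: Any): Int = key match {
    case i: Int      => i
    case (i: Int, _) => i
    case other       => throw UnexpectedKey(other)
  }
}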
Example 128
Source File: Sessionize.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter06 import java.io._ import java.time.ZoneOffset import java.time.LocalDateTime import java.time.format.DateTimeFormatter import org.apache.spark.{SparkConf,SparkContext} import org.apache.spark.storage.StorageLevel object Sessionize extends App { val sc = new SparkContext("local[8]", "Sessionize", new SparkConf()) val checkoutPattern = ".*>checkout.*".r.pattern // a basic page view structure case class PageView(ts: String, path: String) extends Serializable with Ordered[PageView] { override def toString: String = { s"($ts #$path)" } def compare(other: PageView) = ts compare other.ts } // represent a session case class Session[A <: PageView](id: String, visits: Seq[A]) extends Serializable { override def toString: String = { val vsts = visits.mkString("[", ",", "]") s"($id -> $vsts)" } } def toEpochSeconds(str: String) = { LocalDateTime.parse(str, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")).toEpochSecond(ZoneOffset.UTC) } val sessions = sc.textFile("data/clickstream") .map(line => {val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20)))}) .groupByKey.map(x => { new Session(x._1, x._2.toSeq.sorted) } ) .cache // sessions.take(100).foreach(println) def findAllCheckoutSessions(s: Session[PageView]) = { s.visits.tails.filter { _ match { case PageView(ts1, "mycompanycom>homepage") :: PageView(ts2, page) :: tail if (page != "mycompanycom>homepage" ) => true; case _ => false } } .foldLeft(Seq[Session[PageView]]()) { case (r, x) => { x.find(y => checkoutPattern.matcher(y.path).matches) match { case Some(checkout) if (toEpochSeconds(checkout.ts) > toEpochSeconds(x.head.ts) + 60) => r.:+(new Session(s.id, x.slice(0, x.indexOf(checkout)))) case _ => r } } } } val prodLandingSessions = sessions.flatMap(findAllCheckoutSessions) prodLandingSessions.collect.foreach(println) sc.stop() }
Example 129
Source File: CustomPartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class CustomPartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner") { //given val numberOfExecutors = 2 val data = spark .parallelize(List( UserTransaction("a", 100), UserTransaction("b", 101), UserTransaction("a", 202), UserTransaction("b", 1), UserTransaction("c", 55) ) ).keyBy(_.userId) .partitionBy(new Partitioner { override def numPartitions: Int = numberOfExecutors override def getPartition(key: Any): Int = { key.hashCode % numberOfExecutors } }) println(data.partitions.length) //when val res = data.mapPartitions[Long](iter => iter.map(_._2).map(_.amount) ).collect().toList //then res should contain theSameElementsAs List(55, 100, 202, 101, 1) } }
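One caveat with the anonymous partitioner above: hashCode can be negative for String keys, and a negative remainder is not a valid partition id. A hedged variant that guards against this:

import org.apache.spark.Partitioner

// Same idea as the anonymous partitioner in the test, but with a non-negative modulus
// so the result always falls in [0, numPartitions).
class NonNegativeHashPartitioner(override val numPartitions: Int) extends Partitioner {
  override def getPartition(key: Any): Int = {
    val mod = key.hashCode % numPartitions
    if (mod < 0) mod + numPartitions else mod
  }
}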
Example 130
Source File: ExecutionPlanForJoins.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class ExecutionPlanForJoins extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner while join") { //given val transactions = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car"))) val persons = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny"))) //when val personsDataPartitioner = transactions.partitioner match { case Some(p) => p case None => new HashPartitioner(persons.partitions.length) } val res = persons.join(transactions, personsDataPartitioner).collect().toList res should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag"))) } test("can broadcast small data set to every executor and join in-memory") { //given val smallDataSet = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car"))) val hugeDataSet = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny"))) //when broadcast small rdd to all executors val smallInMemoryDataSet = spark.broadcast(smallDataSet.collectAsMap()) //then join will not need to do shuffle val res = hugeDataSet.mapPartitions(iter => { iter.flatMap { case (k, v1) => smallInMemoryDataSet.value.get(k) match { case None => Seq.empty case Some(v2) => Seq((k, (v1, v2))) } } }) res.collect().toList should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag"))) } }
Example 131
Source File: IntegrationTesting.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_6 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class IntegrationTesting extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Integration testing of already unit-tested logic") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) //when val aggregatedTransactionsForUserId = data.filter(BonusVerifier.qualifyForBonus) //then aggregatedTransactionsForUserId.collect().toList should contain theSameElementsAs List( UserTransaction("A", 100001) ) } }
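BonusVerifier.qualifyForBonus comes from elsewhere in the project and is not shown here; a hypothetical stand-in consistent with the assertion (only the 100001 transaction qualifies) could be:

import com.tomekl007.UserTransaction

object BonusVerifier {
  private val threshold = 100000   // hypothetical cut-off, inferred from the test data only
  def qualifyForBonus(t: UserTransaction): Boolean = t.amount > threshold
}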
Example 132
Source File: MockingDataSources.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_6 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.{FunSuite, Ignore} class MockingDataSources extends FunSuite { val spark = SparkSession.builder().master("local[2]").getOrCreate() ignore("loading data on prod from hive") { UserDataLogic.loadAndGetAmount(spark, HiveDataLoader.loadUserTransactions) } test("mock loading data from hive"){ //given import spark.sqlContext.implicits._ val df = spark.sparkContext .makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) .toDF() //when val res = UserDataLogic.loadAndGetAmount(spark, _ => df) //then res.show() } } object UserDataLogic { def loadAndGetAmount(sparkSession: SparkSession, provider: SparkSession => DataFrame): DataFrame = { val df = provider(sparkSession) df.select(df("amount")) } } object HiveDataLoader { def loadUserTransactions(sparkSession: SparkSession): DataFrame = { sparkSession.sql("select * from transactions") } }
Example 133
Source File: InheritanceRdd.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_2 import com.example.{MultipliedRDD, Record} import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class InheritanceRdd extends FunSuite { val spark: SparkContext = SparkSession .builder().master("local[2]").getOrCreate().sparkContext test("use extended RDD") { //given val rdd = spark.makeRDD(List(Record(1, "d1"))) val extendedRdd = new MultipliedRDD(rdd, 10) extendedRdd.collect().toList should contain theSameElementsAs List( Record(10, "d1") ) } }
Example 134
Source File: ImmutableRDD.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_2 import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ImmutableRDD extends FunSuite { val spark: SparkContext = SparkSession .builder().master("local[2]").getOrCreate().sparkContext test("RDD should be immutable") { //given val data = spark.makeRDD(0 to 5) //when val res = data.map(_ * 2) //then res.collect().toList should contain theSameElementsAs List( 0, 2, 4, 6, 8, 10 ) data.collect().toList should contain theSameElementsAs List( 0, 1, 2, 3, 4, 5 ) } }
Example 135
Source File: SaveJSON.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.Matchers._ import org.scalatest.{BeforeAndAfterEach, FunSuite} import scala.reflect.io.Path class SaveJSON extends FunSuite with BeforeAndAfterEach { val spark = SparkSession.builder().master("local[2]").getOrCreate() private val FileName = "transactions.json" override def afterEach() { val path = Path(FileName) path.deleteRecursively() } test("should save and load in JSON") { //given import spark.sqlContext.implicits._ val rdd = spark.sparkContext .makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) .toDF() //when rdd.coalesce(1).write.format("json").save(FileName) val fromFile = spark.read.json(FileName) fromFile.show() assert(fromFile.count() == 2) } }
Example 136
Source File: SavePlainText.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import java.io.File import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.{BeforeAndAfterEach, FunSuite} import org.scalatest.Matchers._ import scala.reflect.io.Path class SavePlainText extends FunSuite with BeforeAndAfterEach{ val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val FileName = "transactions.txt" override def afterEach() { val path = Path (FileName) path.deleteRecursively() } test("should save and load in plain text") { //given val rdd = spark.makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) //when rdd.coalesce(1).saveAsTextFile(FileName) val fromFile = spark.textFile(FileName) fromFile.collect().toList should contain theSameElementsAs List( "UserTransaction(a,100)", "UserTransaction(b,200)" //note - this is string! ) } }
Example 137
Source File: ReUseWithCheckpoint.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.scalatest.FunSuite class ReUseWithCheckpoint extends FunSuite { private val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val checkpointEnabled = true private val storageLevel = StorageLevel.MEMORY_AND_DISK test("should use checkpoint for re-usability of RDD") { //given val sortedRDD = spark.makeRDD(List(1, 2, 5, 77, 888)) if (storageLevel != StorageLevel.NONE) { sortedRDD.persist(storageLevel) } if (checkpointEnabled) { sortedRDD.sparkContext.setCheckpointDir("hdfs://tmp/checkpoint") sortedRDD.checkpoint() } //when performALotOfExpensiveComputations(sortedRDD) //then sortedRDD.collect().toList } def performALotOfExpensiveComputations(sortedRDD: RDD[Int]): Unit = { //.... sortedRDD.count() //failure sortedRDD.collect() } }
Example 138
Source File: CreatingGraph.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class CreatingGraph extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should load graph from a file") { //given val path = getClass.getResource("/graph.g").getPath //when val graph = GraphBuilder.loadFromFile(spark, path) //then graph.triplets.foreach(println(_)) assert(graph.triplets.count() == 4) } }
Example 139
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class VertexAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Vertex API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapVertices((_, att) => att.toUpperCase()) res.vertices.collect().toList } }
Example 140
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class EdgeAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Edge API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapEdges(e => e.attr.toUpperCase) println(res.edges.collect().toList) } }
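As a follow-up, the vertex and edge transforms from the two tests above can be combined against the same graph; a sketch that would sit inside the test body, reusing the graph built there:

// Uppercase edge attributes via mapTriplets (which also exposes src/dst attributes),
// then uppercase vertex attributes as in the previous test.
val upperEdges = graph.mapTriplets(triplet => triplet.attr.toUpperCase)
val upperBoth = upperEdges.mapVertices((_, attr) => attr.toUpperCase)
upperBoth.triplets.collect().foreach(println)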
Example 141
Source File: ReduceAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ReduceAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use reduce API") { //given val input = spark.makeRDD(List( UserTransaction("A", 10), UserTransaction("B", 1), UserTransaction("A", 101) )) //when val result = input .map(_.amount) .reduce((a, b) => if (a > b) a else b) //then assert(result == 101) } test("should use reduceByKey API") { //given val input = spark.makeRDD( List( UserTransaction("A", 10), UserTransaction("B", 1), UserTransaction("A", 101) ) ) //when val result = input .keyBy(_.userId) .reduceByKey((firstTransaction, secondTransaction) => TransactionChecker.higherTransactionAmount(firstTransaction, secondTransaction)) .collect() .toList //then result should contain theSameElementsAs List(("B", UserTransaction("B", 1)), ("A", UserTransaction("A", 101))) } } object TransactionChecker { def higherTransactionAmount(firstTransaction: UserTransaction, secondTransaction: UserTransaction): UserTransaction = { if (firstTransaction.amount > secondTransaction.amount) firstTransaction else secondTransaction } }
Example 142
Source File: TriggerComputationsReusingRDD.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class TriggerComputationsReusingRDD extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions without reuse") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) //then every call to action means that we are going up to the RDD chain //if we are loading data from external file-system (I.E.: HDFS), every action means //that we need to load it from FS. val start = System.currentTimeMillis() println(rdd.collect().toList) println(rdd.count()) println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) val result = System.currentTimeMillis() - start println(s"time taken (no-cache): $result") } test("should trigger computations using actions with reuse") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) .cache() //then every call to action means that we are going up to the RDD chain //if we are loading data from external file-system (I.E.: HDFS), every action means //that we need to load it from FS. val start = System.currentTimeMillis() println(rdd.collect().toList) println(rdd.count()) println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) val result = System.currentTimeMillis() - start println(s"time taken(cache): $result") } }
Example 143
Source File: TriggerComputations.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class TriggerComputations extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) //then println(rdd.collect().toList) println(rdd.count()) //and all count* println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) //all reduce will be covered in separate video } }
Example 144
Source File: DeferComputations.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.InputRecord import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class DeferComputations extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should defer computations") { //given val input = spark.makeRDD( List(InputRecord(userId = "A"), InputRecord(userId = "B"))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.userId.toLowerCase) //.... built processing graph lazy if (shouldExecutePartOfCode()) { //rdd.saveAsTextFile("") || rdd.collect().toList } else { //condition changed - don't need to evaluate DAG } } private def shouldExecutePartOfCode(): Boolean = { //domain logic that decide if we still need to calculate true } }
Example 145
Source File: GroupByKey.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class GroupByKey extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .groupBy(_.userId) .map(x => (x._1,x._2.toList)) .collect() .toList //then rdd should contain theSameElementsAs List( ("B", List(UserTransaction("B", 13))), ("A", List( UserTransaction("A", 1001), UserTransaction("A", 100), UserTransaction("A", 102), UserTransaction("A", 1)) ) ) } }
Example 146
Source File: UsePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext} import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class UsePartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use different partitioners") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when, then val partitioner = keyed.partitioner assert(partitioner.isEmpty) val hashPartitioner = keyed.partitionBy(new HashPartitioner(100)) println(hashPartitioner) assert(hashPartitioner.partitioner.isDefined) val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed)) println(rangePartitioner) assert(rangePartitioner.partitioner.isDefined) } }
Example 147
Source File: AggregateByKey.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class AggregateByKey extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use aggregateByKey instead of groupBy to reduce shuffle") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) val amountForUser = mutable.ArrayBuffer.empty[Long] val addAmount = (responseTimes: mutable.ArrayBuffer[Long], transaction: UserTransaction) => responseTimes += transaction.amount val mergeAmounts = (p1: mutable.ArrayBuffer[Long], p2: mutable.ArrayBuffer[Long]) => p1 ++= p2 //when val aggregatedTransactionsForUserId = keyed .aggregateByKey(amountForUser)(addAmount, mergeAmounts) //then aggregatedTransactionsForUserId.collect().toList should contain theSameElementsAs List( ("A", ArrayBuffer(100, 100001)), ("B", ArrayBuffer(4,10)), ("C", ArrayBuffer(10))) } }
Example 148
Source File: TransformationsOnPairs.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class TransformationsOnPairs extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use transformation on k/v pair") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when val counted = keyed.countByKey() // keyed.combineByKey() // keyed.aggregateByKey() // keyed.foldByKey() // keyed.groupByKey() //then counted should contain theSameElementsAs Map("B" -> 2, "A" -> 2, "C" -> 1) } }
Example 149
Source File: ActionsOnPairs.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ActionsOnPairs extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use action to see k/v data format after collect") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when val res = keyed.collect().toList //then res should contain theSameElementsAs List( ("A",UserTransaction("A",100)), ("B",UserTransaction("B",4)), ("A",UserTransaction("A",100001)), ("B",UserTransaction("B",10)), ("C",UserTransaction("C",10)) )//note duplicated key } }
Example 150
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext} import org.scalatest.FunSuite class CustomRangePartitionerTest extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom range partitioner") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.amount) //when, then val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000)))) //then partitioned.collect().toList } } class CustomRangePartitioner(ranges: List[(Int,Int)]) extends Partitioner{ override def numPartitions: Int = ranges.size override def getPartition(key: Any): Int = { if(!key.isInstanceOf[Int]){ throw new IllegalArgumentException("partitioner works only for Int type") } val keyInt = key.asInstanceOf[Int] val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2) println(s"for key: $key return $index") index } }
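One detail worth noting in CustomRangePartitioner: lastIndexWhere returns -1 when a key falls outside every range, which Spark would reject as a partition id. A hedged variant that fails fast instead:

import org.apache.spark.Partitioner

class SafeRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size
  override def getPartition(key: Any): Int = key match {
    case keyInt: Int =>
      val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
      if (index >= 0) index
      else throw new IllegalArgumentException(s"no range covers key $keyInt")
    case _ =>
      throw new IllegalArgumentException("partitioner works only for Int type")
  }
}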
Example 151
Source File: BulkTableWriter.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.bdp.spark.writer import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import com.datastax.spark.connector._ import com.datastax.spark.connector.writer._ import java.nio.file.{Path, Files} import scala.language.implicitConversions object BulkTableWriter{ implicit def toBulkTableWriter[T](rdd: RDD[T]): BulkTableWriter[T] = new BulkTableWriter(rdd) } class BulkTableWriter[T](rdd: RDD[T]) { def bulkSaveToCassandra(keyspaceName: String, tableName: String, columns: ColumnSelector = AllColumns, writeConf: BulkWriteConf = BulkWriteConf()): Unit = { throw new UnsupportedOperationException } } case class BulkWriteConf(outputDirectory: Option[Path] = None, deleteSource: Boolean = true, bufferSizeInMB: Int = 64)
Example 152
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 153
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
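The combineByKey call above computes a per-key average of star ratings. The same pattern on a plain RDD makes the three functions concrete; a standalone sketch, assuming an existing SparkContext named sc:

val ratings = sc.parallelize(Seq(("free", 4), ("free", 2), ("paid", 5)))
val averages = ratings.combineByKey(
    (v: Int) => (v, 1),                                            // createCombiner: first value seen for a key
    (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),         // mergeValue: fold another value into (sum, count)
    (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)   // mergeCombiners: merge partial (sum, count) pairs
  )
  .map { case (k, (sum, count)) => (k, sum / count.toFloat) }
averages.collect()   // Array((free,3.0), (paid,5.0))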
Example 154
Source File: L5-7MultipleSocketStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearMultiApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: TripByYearMultiApp <appname> <hostname> <base_port> <num_of_sockets>") System.exit(1) } val Seq(appName, hostname, basePort, nSockets) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) val uniStream = ssc.union(streams) uniStream .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 155
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.mqtt.MQTTUtils object YearlyDistributionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>") System.exit(1) } val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => rec.split(",")) .map(rec => (rec(1).split(" ")(0), 1)) .updateStateByKey(statefulCount) .map(pair => (pair._2, pair._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("YearlyDistribution") ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
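The statefulCount function passed to updateStateByKey keeps a running total per key across batches; its behaviour is easy to check in isolation (a sketch, not tied to the streaming context):

val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0))

statefulCount(Seq(1, 1, 1), None)   // Some(3): first batch seen for this key
statefulCount(Seq(2), Some(3))      // Some(5): a later batch adds to the accumulated state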
Example 156
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 157
Source File: L5-6SocketStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: TripByYearApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 158
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 159
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 160
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 161
Source File: L5-18Http.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HttpApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: HttpApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) .flatMap(rec => (parse(rec) \ "stationBeanList").children) .filter(rec => { implicit val formats = DefaultFormats (rec \ "statusKey").extract[Integer] != 1 }) .map(rec => rec.filterField { case JField("id", _) => true case JField("stationName", _) => true case JField("statusValue", _) => true case _ => false }) .map(rec => { implicit val formats = DefaultFormats (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) }) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 162
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 163
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 164
Source File: L7-4UI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object SocialSearchApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: SocialSearchApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "/tmp/historical") val countSearch = new AtomicLong(0) val countSocial = new AtomicLong(0) val ssc = new StreamingContext(conf, Seconds(1)) val titleStream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .filter(_(3) match { case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => (rec(3), rec(4))) .cache() val searchStream = titleStream.filter(_._1 match { case "other-google" | "other-bing" | "other-yahoo" => true case _ => false }) .map(rec => rec._2) val socialStream = titleStream.filter(_._1 match { case "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => rec._2) val exclusiveSearch = searchStream.transformWith(socialStream, (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) .foreachRDD(rdd => { countSearch.addAndGet(rdd.count()) println("Exclusive count search engines: " + countSearch) }) val exclusiveSocial = socialStream.transformWith(searchStream, (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) .foreachRDD(rdd => { countSocial.addAndGet(rdd.count()) println("Exclusive count social media: " + countSocial) }) ssc.start() ssc.awaitTermination() } }
Example 165
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 166
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerAppKryo { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[ProtonFlux])) val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val projected = voyager1.map(rec => { val attrs = rec.split("\\s+") new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28)) }) val filtered = projected.filter(pflux => pflux.isSolarStorm) val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false)) yearlyBreakdown.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 167
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 168
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 169
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 170
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 171
Source File: Index.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.main import spatialspark.index.IndexConf import spatialspark.index.STIndex import org.apache.spark.{SparkConf, SparkContext} object Index { val usage = """ create spatial index on dataset Usage: index --input input path --geom geometry index (default 0) --output output path --conf configuration (dimX:dimY:ratio) --help """ def main(args: Array[String]) { if (args.length == 0) println(usage) val arglist = args.toList type OptionMap = Map[Symbol, Any] def nextOption(map: OptionMap, list: List[String]): OptionMap = { list match { case Nil => map case "--help" :: tail => println(usage) sys.exit(0) case "--input" :: value :: tail => nextOption(map ++ Map('input -> value), tail) case "--geom" :: value :: tail => nextOption(map ++ Map('geom -> value.toInt), tail) case "--output" :: value :: tail => nextOption(map ++ Map('output -> value), tail) case "--conf" :: value :: tail => nextOption(map = map ++ Map('conf -> value), list = tail) case option :: tail => println("Unknown option " + option) sys.exit(1) } } val options = nextOption(Map(), arglist) val conf = new SparkConf().setAppName("Build Index") conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryo.registrator", "spatialspark.util.KyroRegistrator") val sc = new SparkContext(conf) val inputFile = options.getOrElse('input, "").asInstanceOf[String] val outputFile = options.getOrElse('output, "").asInstanceOf[String] val methodConf = options.getOrElse('conf, "").asInstanceOf[String] val SEPARATOR = "\t" val geometryIndex = options.getOrElse('geom, 0).asInstanceOf[Int] val dimX = methodConf.split(":").apply(0).toInt val dimY = methodConf.split(":").apply(1).toInt val ratio = methodConf.split(":").apply(2).toDouble val indexConf = new IndexConf(inputFile, outputFile, SEPARATOR, geometryIndex, dimX, dimY, ratio) val timerBegin = System.currentTimeMillis() STIndex.build(sc, indexConf) val timerEnd = System.currentTimeMillis() println("index time: " + (timerEnd - timerBegin) + " ms") } }
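Note on the --conf flag above: its value is a dimX:dimY:ratio triple that nextOption stores as a plain string and that main later splits on ':'. The sketch below shows how a hypothetical argument vector is folded into the option map; every path and number in it is a made-up placeholder.

val sampleArgs = List(
  "--input", "hdfs:///data/points.tsv",
  "--geom", "0",
  "--output", "hdfs:///data/points_index",
  "--conf", "32:32:0.3")
// nextOption(Map(), sampleArgs) yields:
//   Map('input -> "hdfs:///data/points.tsv", 'geom -> 0,
//       'output -> "hdfs:///data/points_index", 'conf -> "32:32:0.3")
// and "32:32:0.3" is then split into dimX = 32, dimY = 32, ratio = 0.3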
Example 172
Source File: RangeQuery.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.query import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.STRtree import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD class RangeQuery extends Serializable { } object RangeQuery { def apply(sc: SparkContext, geometryWithId: RDD[(Long, Geometry)], filterGeometry: Geometry, operator: SpatialOperator, radius: Double = 0): RDD[(Long, Geometry)] = { if (operator == SpatialOperator.Contains) geometryWithId.filter(_._2.contains(filterGeometry)) else if (operator == SpatialOperator.Within) geometryWithId.filter(_._2.within(filterGeometry)) else if (operator == SpatialOperator.WithinD) geometryWithId.filter(_._2.isWithinDistance(filterGeometry, radius)) else { // TODO: raise an error for unsupported operators sc.emptyRDD } } }
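A minimal usage sketch for the filter-based range query above, assuming an existing SparkContext named sc; the WKT-built point and polygon geometries are invented placeholders.

import com.vividsolutions.jts.geom.Geometry
import com.vividsolutions.jts.io.WKTReader
import org.apache.spark.rdd.RDD
import spatialspark.operator.SpatialOperator

val wkt = new WKTReader()
val geoms: RDD[(Long, Geometry)] = sc.parallelize(Seq(
  (0L, wkt.read("POINT (1 1)")),
  (1L, wkt.read("POINT (10 10)"))))
val window = wkt.read("POLYGON ((0 0, 5 0, 5 5, 0 5, 0 0))")
// keep only the geometries that lie within the query window
val hits = RangeQuery(sc, geoms, window, SpatialOperator.Within)
hits.collect().foreach(println)   // expected: only id 0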
Example 173
Source File: FixedGridPartition.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.partition.fgp import spatialspark.util.MBR import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD class FixedGridPartition extends Serializable { } object FixedGridPartition { def apply(sc: SparkContext, extent: MBR, gridDimX: Int, gridDimY: Int): Array[MBR] = { val xSize = (extent.xmax - extent.xmin) / gridDimX.toDouble val ySize = (extent.ymax - extent.ymin) / gridDimY.toDouble val results = for (i <- Array.range(0, gridDimX); j <- Array.range(0, gridDimY)) yield new MBR(i * xSize + extent.xmin, j * ySize + extent.ymin, (i + 1) * xSize + extent.xmin, (j + 1) * ySize + extent.ymin) results } def genTileRDD(sc: SparkContext, extent: MBR, gridDimX: Int, gridDimY: Int): RDD[MBR] = { val xSize = (extent.xmax - extent.xmin) / gridDimX.toDouble val ySize = (extent.ymax - extent.ymin) / gridDimY.toDouble val results = for (i <- Array.range(0, gridDimX); j <- Array.range(0, gridDimY)) yield new MBR(i * xSize + extent.xmin, j * ySize + extent.ymin, (i + 1) * xSize + extent.xmin, (j + 1) * ySize + extent.ymin) sc.parallelize(wrapRefArray(results)) } }
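A quick sketch of the two entry points, assuming a SparkContext named sc; MBR is constructed as (xmin, ymin, xmax, ymax), matching the cell construction above, and the extent values are placeholders.

import spatialspark.util.MBR

// overall extent of the data, as (xmin, ymin, xmax, ymax)
val extent = new MBR(0.0, 0.0, 100.0, 100.0)
// 4 x 4 grid materialized on the driver as an Array[MBR]
val cells = FixedGridPartition(sc, extent, 4, 4)
// the same grid, distributed as an RDD[MBR]
val cellRdd = FixedGridPartition.genTileRDD(sc, extent, 4, 4)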
Example 174
Source File: BroadcastSpatialJoin.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.join import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.{ItemBoundable, ItemDistance, STRtree} import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator.SpatialOperator object BroadcastSpatialJoin { def queryRtree(rtree: => Broadcast[STRtree], leftId: Long, geom: Geometry, predicate: SpatialOperator, radius: Double): Array[(Long, Long)] = { val queryEnv = geom.getEnvelopeInternal //queryEnv.expandBy(radius) lazy val candidates = rtree.value.query(queryEnv).toArray //.asInstanceOf[Array[(Long, Geometry)]] if (predicate == SpatialOperator.Within) { candidates.filter { case (id_, geom_) => geom.within(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Contains) { candidates.filter { case (id_, geom_) => geom.contains(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.WithinD) { candidates.filter { case (id_, geom_) => geom.isWithinDistance(geom_.asInstanceOf[Geometry], radius) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Intersects) { candidates.filter { case (id_, geom_) => geom.intersects(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Overlaps) { candidates.filter { case (id_, geom_) => geom.overlaps(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.NearestD) { //if (candidates.isEmpty) // return Array.empty[(Long, Long)] //val nearestItem = candidates.map { // case (id_, geom_) => (id_.asInstanceOf[Long], geom_.asInstanceOf[Geometry].distance(geom)) //}.reduce((a, b) => if (a._2 < b._2) a else b) class dist extends ItemDistance { override def distance(itemBoundable: ItemBoundable, itemBoundable1: ItemBoundable): Double = { val geom = itemBoundable.getItem.asInstanceOf[(Long, Geometry)]._2 val geom1 = itemBoundable1.getItem.asInstanceOf[(Long, Geometry)]._2 geom.distance(geom1) } } val nearestItem = rtree.value.nearestNeighbour(queryEnv, (0l, geom), new dist) .asInstanceOf[(Long, Geometry)] Array((leftId, nearestItem._1)) } else { Array.empty[(Long, Long)] } } def apply(sc: SparkContext, leftGeometryWithId: RDD[(Long, Geometry)], rightGeometryWithId: RDD[(Long, Geometry)], joinPredicate: SpatialOperator, radius: Double = 0): RDD[(Long, Long)] = { // create R-tree on right dataset val strtree = new STRtree() val rightGeometryWithIdLocal = rightGeometryWithId.collect() rightGeometryWithIdLocal.foreach(x => { val y = x._2.getEnvelopeInternal y.expandBy(radius) strtree.insert(y, x) }) val rtreeBroadcast = sc.broadcast(strtree) leftGeometryWithId.flatMap(x => queryRtree(rtreeBroadcast, x._1, x._2, joinPredicate, radius)) } }
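A hedged usage sketch, assuming two existing RDD[(Long, Geometry)] datasets named points and polygons and a SparkContext named sc; the 0.5 radius is an arbitrary example value.

import org.apache.spark.rdd.RDD
import spatialspark.operator.SpatialOperator

// (pointId, polygonId) pairs where the point intersects the polygon;
// the R-tree over the right dataset is built on the driver and broadcast
val matched: RDD[(Long, Long)] =
  BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.Intersects)

// distance join: WithinD additionally needs the radius argument
val near = BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.WithinD, 0.5)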
Example 175
Source File: NetezzaRDD.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza import java.sql.Connection import java.util.Properties import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.{Partition, SparkContext, TaskContext} override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row] { var closed = false var finished = false var gotNext = false var nextValue: Row = null context.addTaskCompletionListener { context => close() } val part = thePart.asInstanceOf[NetezzaPartition] val conn = getConnection() val reader = new NetezzaDataReader(conn, table, columns, filters, part, schema) reader.startExternalTableDataUnload() def getNext(): Row = { if (reader.hasNext) { reader.next() } else { finished = true null.asInstanceOf[Row] } } def close() { if (closed) return try { if (null != reader) { reader.close() } } catch { case e: Exception => logWarning("Exception closing Netezza record reader", e) } try { if (null != conn) { conn.close() } logInfo("closed connection") } catch { case e: Exception => logWarning("Exception closing connection", e) } } override def hasNext: Boolean = { if (!finished) { if (!gotNext) { nextValue = getNext() if (finished) { close() } gotNext = true } } !finished } override def next(): Row = { if (!hasNext) { throw new NoSuchElementException("End of stream") } gotNext = false nextValue } } }
Example 176
Source File: IntegrationSuiteBase.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza.integration import java.sql.Connection import com.ibm.spark.netezza.NetezzaJdbcUtils import com.typesafe.config.ConfigFactory import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory trait IntegrationSuiteBase extends FunSuite with BeforeAndAfterAll with QueryTest{ private val log = LoggerFactory.getLogger(getClass) protected var sc: SparkContext = _ protected var sqlContext: SQLContext = _ protected var conn: Connection = _ protected val prop = new java.util.Properties // Configurable vals protected var configFile = "application" protected var testURL: String = _ protected var testTable: String = _ protected var user: String = _ protected var password: String = _ protected var numPartitions: Int = _ protected var sampleDbmaxNumTables: Int = _ override def beforeAll(): Unit = { super.beforeAll() sc = new SparkContext("local[*]", "IntegrationTest", new SparkConf()) sqlContext = new SQLContext(sc) val conf = ConfigFactory.load(configFile) testURL = conf.getString("test.integration.dbURL") testTable = conf.getString("test.integration.table") user = conf.getString("test.integration.user") password = conf.getString("test.integration.password") numPartitions = conf.getInt("test.integration.partition.number") sampleDbmaxNumTables = conf.getInt("test.integration.max.numtables") prop.setProperty("user", user) prop.setProperty("password", password) log.info("Attempting to get connection from" + testURL) conn = NetezzaJdbcUtils.getConnector(testURL, prop)() log.info("got connection.") } override def afterAll(): Unit = { try { sc.stop() } finally { conn.close() super.afterAll() } } def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => executeJdbcStmt(s"DROP TABLE $name") } } } }
Example 177
Source File: DataSource.scala From pio-template-fpm with Apache License 2.0 | 5 votes |
package org.template.fpm import org.apache.predictionio.controller.PDataSource import org.apache.predictionio.controller.EmptyEvaluationInfo import org.apache.predictionio.controller.EmptyActualResult import org.apache.predictionio.controller.Params import org.apache.predictionio.data.store.PEventStore import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger case class DataSourceParams(appName: String) extends Params class DataSource(val dsp: DataSourceParams) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, EmptyActualResult] { @transient lazy val logger = Logger[this.type] override def readTraining(sc: SparkContext): TrainingData = { println("Gathering data from event server.") val transactionsRDD: RDD[Array[String]] = PEventStore.find( appName = dsp.appName, entityType = Some("transaction"), startTime = None, eventNames = Some(List("$set")))(sc).map { event => try { event.properties.get[Array[String]]("items") } catch { case e: Exception => { logger.error(s"Failed to convert event ${event} of. Exception: ${e}.") throw e } } } new TrainingData(transactionsRDD) } } class TrainingData( val transactions: RDD[Array[String]] ) extends Serializable
Example 178
Source File: FPGAlgorithm.scala From pio-template-fpm with Apache License 2.0 | 5 votes |
package org.template.fpm import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.fpm.{FPGrowth,FPGrowthModel} case class AlgorithmParams( val minSupport: Double, val minConfidence: Double, val numPartitions: Int ) extends Params class FPGModel( val resultList: List[(String,Array[String],Double)] ) extends Serializable {} class FPGAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, FPGModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): FPGModel = { println("Training FPM model.") val fpg = new FPGrowth().setMinSupport(ap.minSupport).setNumPartitions(ap.numPartitions) val model = fpg.run(data.transactions.cache) val res = model.generateAssociationRules(ap.minConfidence).map(x=>(x.antecedent.mkString(" "),x.consequent,x.confidence)).collect.toList new FPGModel(resultList=res) } def predict(model: FPGModel, query: Query): PredictedResult = { val qArr = query.items.toList.sorted.mkString(" ") val result = model.resultList.filter(x=>{x._1==qArr}).sortBy(_._3).map(x=>{new ConsequentItem(x._2,x._3)}) PredictedResult(consequentItems=result.toArray) } }
Example 179
Source File: AbstractExactor.scala From TextRank with Apache License 2.0 | 5 votes |
package AbstractExactor import KeywordExactor.PropertyExtractor import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.{SparkConf, SparkContext} object AbstractExactor { // enclosing object; declaration dropped in the listing, name inferred from the source file def run(graphName: String, vectorSize: Int, sentenceList: Array[(Int, Array[String])], keySentenceNum: Int, iterator: Int, word2vecModel: Word2VecModel, df: Float): List[(String, Float)] = { // build the keyword graph val constructTextGraph = new ConstructSentenceGraph(graphName, vectorSize, sentenceList, word2vecModel) val textGraph = constructTextGraph.constructGraph // output the extracted keywords val keywordExtractor = new PropertyExtractor(textGraph, keySentenceNum) val result = keywordExtractor.textrank(iterator, df).sortBy(_._1) result } def main (args: Array[String]): Unit = { val conf = new SparkConf().setAppName("AbstractExtractor") val sc = new SparkContext(conf) val filePath = args(0) val word2VecModelPath = args(1) // val data = sc.textFile("/Users/li/workshop/MyRepository/TextRank/src/main/resources/2.txt").flatMap(_.split("。")).collect.map(x=> x.split(" ")) val data = sc.textFile(filePath).flatMap(_.split("。")).collect.map(x=> x.split(" ")) val dataIndex = data.zipWithIndex.map(x=>(x._2, x._1)) dataIndex.foreach(x=> println(x._1, x._2.mkString(""))) // val word2VecModelPath = "hdfs://61.147.114.85:9000/home/liyu/word2vec/model2/10_100_5_102017-02-08-word2VectorModel" // val word2VecModelPath = "/Users/li/workshop/DataSet/word2vec/model-10-100-20/2016-08-16-word2VectorModel/" val model = Word2VecModel.load(sc, word2VecModelPath) val da = model.findSynonyms("共产党", 2) da.foreach(x => println(x)) val result = run("jiji", 100, dataIndex, 2, 100, model, 0.9F) println(result) // convert the selected indices back to sentences val index = result.map(x=> x._1) for (elem <- index) { print(dataIndex(elem.toInt)._2.mkString("")) } } }
Example 180
Source File: GpuEnablerExample.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package com.ibm.gpuenabler import org.apache.spark.{SparkContext, SparkConf} import com.ibm.gpuenabler.CUDARDDImplicits._ object GpuEnablerExample { def main(args: Array[String]) = { val masterURL = if (args.length > 0) args(0) else "local[*]" val sparkConf = new SparkConf().setAppName("GpuEnablerExample1").setMaster(masterURL) val sc = new SparkContext(sparkConf) val ptxURL = getClass.getResource("/GpuEnablerExamples.ptx") val mapFunction = new CUDAFunction( "multiplyBy2", Array("this"), Array("this"), ptxURL) val dimensions = (size: Long, stage: Int) => stage match { case 0 => (64, 256) case 1 => (1, 1) } val reduceFunction = new CUDAFunction( "sum", Array("this"), Array("this"), ptxURL, Seq(), Some((size: Long) => 2), Some(dimensions)) val n = 10 val output = sc.parallelize(1 to n, 1) .mapExtFunc((x: Int) => 2 * x, mapFunction) .reduceExtFunc((x: Int, y: Int) => x + y, reduceFunction) println("Sum of the list is " + output) } }
Example 181
Source File: GpuEnablerCodegen.scala From GPUEnabler with Apache License 2.0 | 5 votes |
// bin/spark-submit --jars gpu-enabler/target/gpu-enabler_2.10-1.0.0.jar // --class com.ibm.gpuenabler.GpuEnablerCodegen examples/target/gpu-enabler-examples_2.10-1.0.0.jar package com.ibm.gpuenabler import org.apache.spark.{SparkContext, SparkConf} import com.ibm.gpuenabler.CUDARDDImplicits._ object GpuEnablerCodegen { def main(args: Array[String]) = { val masterURL = if (args.length > 0) args(0) else "local[*]" val sparkConf = new SparkConf().setAppName("GpuEnablerCodegen").setMaster(masterURL) sparkConf.set("spark.gpu.codegen", "true") val sc = new SparkContext(sparkConf) val n = 10 val intOut = sc.parallelize(1 to n, 1) .mapGpu((x: Int) => 2 * x) .reduceGpu((x: Int, y: Int) => x + y) println("Int sum of the list is " + intOut) val doubleOut = sc.parallelize(1 to n, 1).map(x => x.toDouble) .mapGpu((x: Double) => 2.5D * x) .reduceGpu((x: Double, y: Double) => x + y) println("Double sum of the list is " + doubleOut) } }
Example 182
Source File: Main.scala From stellar-random-walk with Apache License 2.0 | 5 votes |
package au.csiro.data61.randomwalk import au.csiro.data61.randomwalk.algorithm.{UniformRandomWalk, VCutRandomWalk} import au.csiro.data61.randomwalk.common.CommandParser.TaskName import au.csiro.data61.randomwalk.common.{CommandParser, Params, Property} import com.typesafe.config.Config import org.apache.log4j.LogManager import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.scalactic.{Every, Good, Or} import spark.jobserver.SparkJobInvalid import spark.jobserver.api._ object Main extends SparkJob { lazy val logger = LogManager.getLogger("myLogger") def main(args: Array[String]) { CommandParser.parse(args) match { case Some(params) => val conf = new SparkConf().setAppName("stellar-random-walk") val context: SparkContext = new SparkContext(conf) runJob(context, null, params) case None => sys.exit(1) } } override def validate(sc: SparkContext, runtime: JobEnvironment, config: Config): JobData Or Every[SparkJobInvalid] = { val args = config.getString("rw.input").split("\\s+") CommandParser.parse(args) match { case Some(params) => Good(params) } } }
Example 183
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.io.File import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.{SparkContext, SparkFiles} import org.apache.spark.sql.SparkSession object StorageHelper { def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString def load( storageSourcePath: String, spark: SparkSession, database: String, storageRef: String, withinStorage: Boolean ): RocksDBConnection = { val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef) val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage) val locator = StorageLocator(database, storageRef, spark) sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext) RocksDBConnection.getOrCreate(locator.clusterFileName) } def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = { val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath) val index = new Path(indexUri) val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val dst = new Path(path+{if (withinStorage) "/storage/" else ""}) save(fs, index, dst) } private def save(fs: FileSystem, index: Path, dst: Path): Unit = { if (!fs.exists(dst)) fs.mkdirs(dst) fs.copyFromLocalFile(false, true, index, dst) } def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = { if (destinationScheme == "file") { copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext) } else { copyIndexToCluster(source, clusterFilePath, sparkContext) } } private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = { if (!new File(SparkFiles.get(dst.getName)).exists()) { val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration) val dstFS = dst.getFileSystem(spark.hadoopConfiguration) if (srcFS.getScheme == "file") { val src = sourcePath dstFS.copyFromLocalFile(false, true, src, dst) } else { FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration) } spark.addFile(dst.toString, recursive = true) } dst.toString } private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = { val fs = source.getFileSystem(context.hadoopConfiguration) if (!fs.exists(destination)) fs.copyFromLocalFile(false, true, source, destination) } }
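A hedged sketch of wiring the helper into an annotator-style call, assuming an active SparkSession named spark; the path, database name, and storage reference below are placeholders, not values mandated by the library.

// open (or reuse) a RocksDB connection for a previously serialized index
val connection = StorageHelper.load(
  storageSourcePath = "/tmp/glove_100d_idx",
  spark = spark,
  database = "EMBEDDINGS",
  storageRef = "glove_100d",
  withinStorage = false)

// later, persist the local index next to a model that is being saved
StorageHelper.save("/tmp/my_model", connection, spark, withinStorage = true)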
Example 184
Source File: Quickstart.scala From delta with Apache License 2.0 | 5 votes |
package example import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SparkSession, SQLContext} import io.delta.tables._ import org.apache.spark.sql.functions._ import org.apache.commons.io.FileUtils import java.io.File object Quickstart { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("Quickstart") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .getOrCreate() val file = new File("/tmp/delta-table") if (file.exists()) FileUtils.deleteDirectory(file) // Create a table println("Creating a table") val path = file.getCanonicalPath var data = spark.range(0, 5) data.write.format("delta").save(path) // Read table println("Reading the table") val df = spark.read.format("delta").load(path) df.show() // Upsert (merge) new data println("Upsert new data") val newData = spark.range(0, 20).toDF val deltaTable = DeltaTable.forPath(path) deltaTable.as("oldData") .merge( newData.as("newData"), "oldData.id = newData.id") .whenMatched .update(Map("id" -> col("newData.id"))) .whenNotMatched .insert(Map("id" -> col("newData.id"))) .execute() deltaTable.toDF.show() // Update table data println("Overwrite the table") data = spark.range(5, 10) data.write.format("delta").mode("overwrite").save(path) deltaTable.toDF.show() // Update every even value by adding 100 to it println("Update to the table (add 100 to every even value)") deltaTable.update( condition = expr("id % 2 == 0"), set = Map("id" -> expr("id + 100"))) deltaTable.toDF.show() // Delete every even value deltaTable.delete(condition = expr("id % 2 == 0")) deltaTable.toDF.show() // Read old version of the data using time travel print("Read old data using time travel") val df2 = spark.read.format("delta").option("versionAsOf", 0).load(path) df2.show() // Cleanup FileUtils.deleteDirectory(file) spark.stop() } }
Example 185
Source File: DeltaSink.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.SetTransaction import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.NullType class DeltaSink( sqlContext: SQLContext, path: Path, partitionColumns: Seq[String], outputMode: OutputMode, options: DeltaOptions) extends Sink with ImplicitMetadataOperation with DeltaLogging { private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path) private val sqlConf = sqlContext.sparkSession.sessionState.conf override protected val canOverwriteSchema: Boolean = outputMode == OutputMode.Complete() && options.canOverwriteSchema override protected val canMergeSchema: Boolean = options.canMergeSchema override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn => val sc = data.sparkSession.sparkContext val metrics = Map[String, SQLMetric]( "numAddedFiles" -> createMetric(sc, "number of files added"), "numRemovedFiles" -> createMetric(sc, "number of files removed") ) val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY) assert(queryId != null) if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) { throw DeltaErrors.streamWriteNullTypeException } // If the batch reads the same Delta table as this sink is going to write to, then this // write has dependencies. Then make sure that this commit set hasDependencies to true // by injecting a read on the whole table. This needs to be done explicitly because // MicroBatchExecution has already enforced all the data skipping (by forcing the generation // of the executed plan) even before the transaction was started. val selfScan = data.queryExecution.analyzed.collectFirst { case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true }.nonEmpty if (selfScan) { txn.readWholeTable() } // Streaming sinks can't blindly overwrite schema. 
See Schema Management design doc for details updateMetadata( txn, data, partitionColumns, configuration = Map.empty, outputMode == OutputMode.Complete()) val currentVersion = txn.txnVersion(queryId) if (currentVersion >= batchId) { logInfo(s"Skipping already complete epoch $batchId, in query $queryId") return } val deletedFiles = outputMode match { case o if o == OutputMode.Complete() => deltaLog.assertRemovable() txn.filterFiles().map(_.remove) case _ => Nil } val newFiles = txn.writeFiles(data, Some(options)) val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata) metrics("numRemovedFiles").set(deletedFiles.size) metrics("numAddedFiles").set(newFiles.size) txn.registerSQLMetrics(sqlContext.sparkSession, metrics) txn.commit(setTxn ++ newFiles ++ deletedFiles, info) // This is needed to make the SQL metrics visible in the Spark UI val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates( sqlContext.sparkContext, executionId, metrics.values.toSeq) } override def toString(): String = s"DeltaSink[$path]" }
Example 186
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage import java.io.{BufferedReader, FileNotFoundException, InputStreamReader} import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.FileAlreadyExistsException import java.util.UUID import scala.collection.JavaConverters._ import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession protected def writeWithRename( path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = { val fs = path.getFileSystem(getHadoopConfiguration) if (!fs.exists(path.getParent)) { throw new FileNotFoundException(s"No such file or directory: ${path.getParent}") } if (overwrite) { val stream = fs.create(path, true) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) } finally { stream.close() } } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } val tempPath = createTempPath(path) var streamClosed = false // This flag is to avoid double close var renameDone = false // This flag is to save the delete operation in most of cases. val stream = fs.create(tempPath) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) stream.close() streamClosed = true try { if (fs.rename(tempPath, path)) { renameDone = true } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } else { throw new IllegalStateException(s"Cannot rename $tempPath to $path") } } } catch { case _: org.apache.hadoop.fs.FileAlreadyExistsException => throw new FileAlreadyExistsException(path.toString) } } finally { if (!streamClosed) { stream.close() } if (!renameDone) { fs.delete(tempPath, false) } } } } protected def createTempPath(path: Path): Path = { new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp") } override def invalidateCache(): Unit = {} }
Example 187
Source File: DeltaHiveTest.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.test import org.apache.spark.sql.delta.catalog.DeltaCatalog import io.delta.sql.DeltaSparkSessionExtension import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkContext, SparkFunSuite} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.SQLTestUtils trait DeltaHiveTest extends SparkFunSuite with BeforeAndAfterAll { self: SQLTestUtils => private var _session: SparkSession = _ private var _hiveContext: TestHiveContext = _ private var _sc: SparkContext = _ override def beforeAll(): Unit = { val conf = TestHive.sparkSession.sparkContext.getConf.clone() TestHive.sparkSession.stop() conf.set(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[DeltaCatalog].getName) conf.set(StaticSQLConf.SPARK_SESSION_EXTENSIONS.key, classOf[DeltaSparkSessionExtension].getName) _sc = new SparkContext("local", this.getClass.getName, conf) _hiveContext = new TestHiveContext(_sc) _session = _hiveContext.sparkSession SparkSession.setActiveSession(_session) super.beforeAll() } override protected def spark: SparkSession = _session override def afterAll(): Unit = { try { _hiveContext.reset() } finally { _sc.stop() } } }
Example 188
Source File: SPLScalaReflection.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sparklinedata import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection object SPLScalaReflection { import ScalaReflection.universe import ScalaReflection.mirror def changeSessionStateClass : Unit = { val spkSessionCSymbol = mirror.classSymbol(classOf[SparkSession]) val spkSessionModSymbol = spkSessionCSymbol.companion.asModule val spkSessionModClassMirror = mirror.reflectModule(spkSessionModSymbol) val spkSessionModule = spkSessionModClassMirror.instance val spkSessionModuleMirror = mirror.reflect(spkSessionModule) val spkSessionModuleTyp = spkSessionModuleMirror.symbol.selfType val termSessionState = spkSessionModuleTyp.decl( universe.TermName("HIVE_SESSION_STATE_CLASS_NAME")).asTerm.accessed.asTerm val sessionStateField = spkSessionModuleMirror.reflectField(termSessionState) sessionStateField.set("org.apache.spark.sql.hive.sparklinedata.SPLSessionState") } // def main(args : Array[String]) : Unit = { // changeSessionStateClass // // println(new SparkSession(new SparkContext()).sharedState.getClass) // } }
Example 189
Source File: DruidQueriesTab.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.sparklinedata.ui import org.apache.spark.sql.hive.thriftserver.sparklinedata.ui.DruidQueriesTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.sql.SPLLogging private[thriftserver] class DruidQueriesTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "druid") with SPLLogging { override val name = "Druid Query Details" val parent = getSparkUI(sparkContext) attachPage(new DruidQueriesPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[spark] object DruidQueriesTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 190
Source File: ScalingVariable.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import scala.reflect.ClassTag /** * It scales the value of the raw data according to different methodologies * @tparam T raw data type * @tparam U working data type */ trait ScalingVariable[T, U] extends Serializable { /** * It scales the data passed as argument * @param data The value to be scaled * @return The scaled data */ def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] } /** * It scales the values according to the Basic Scaling of Blaser et al. "Random rotation ensembles". * Numeric values are scaled to [0, 1] using the min and max values. * @param sc The Spark Context * @param typeInfo The type information about the raw data * @param featureNumber The number of feature in the dataset * @param input The raw dataset * @tparam T raw data type * @tparam U working data type */ class ScalingBasic[T : ClassTag, U : ClassTag](@transient private val sc: SparkContext, typeInfo: Broadcast[TypeInfo[T]], featureNumber: Int, input: RDD[RawDataLabeled[T, U]]) extends ScalingVariable[T, U] { private val scaling: Broadcast[scala.collection.Map[Int, (T, T)]] = sc.broadcast(init()) private def scaleValue(index: Int, value: T): T = { val (min, max) = scaling.value(index) val doubleValue = typeInfo.value.toDouble(value) typeInfo.value.fromDouble(Math.min(1, Math.max(0, (doubleValue - typeInfo.value.toDouble(min)) / (typeInfo.value.toDouble(max) - typeInfo.value.toDouble(min))))) } override def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] = { val densed = data.features.toDense val values = new Array[T](densed.size) var count = 0 while (count < values.length) { values(count) = scaleValue(count, densed(count)) count += 1 } RawDataLabeled(data.label, new RawDataDense(values, densed.nan)) } private def init(): scala.collection.Map[Int, (T, T)] = { input.mapPartitions(it => { val min = Array.fill(featureNumber)(typeInfo.value.maxValue) val max = Array.fill(featureNumber)(typeInfo.value.minValue) def setMinMax(index: Int, value: T): Unit = { if (typeInfo.value.isMinOrEqual(value, min(index))) { min(index) = value } if (typeInfo.value.isMinOrEqual(max(index), value)) { max(index) = value } } it.foreach(t => { t.features.foreachActive(setMinMax) }) min.zip(max).zipWithIndex.map(_.swap).toIterator }).reduceByKey((a, b) => (typeInfo.value.min(a._1, b._1), typeInfo.value.max(a._2, b._2))).collectAsMap() } }
Example 191
Source File: DataLoad.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import reforest.data.RawDataLabeled /** * An utility to load data from different file formats in raw data labeled * @tparam T raw data type * @tparam U working data type */ trait DataLoad[T, U] extends Serializable { /** * Load the data from a file * @param sc the Spark Context * @param path the file path * @param numFeatures the number of features in the dataset * @param minPartitions the minimum number of partition of the RDD * @return the loaded dataset in RawDataLabeled format */ def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] }
Example 192
Source File: ARFFUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawData, RawDataLabeled} import reforest.rf.RFCategoryInfo import reforest.util.GCInstrumented import scala.reflect.ClassTag /** * Load data in ARFF format * * @param typeInfo the type information of the raw data * @param instrumented the instrumentation of the GC * @param categoryInfo the information for the categorical features * @tparam T raw data type * @tparam U working data type */ class ARFFUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] { override def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] = { val parsed = parseARFFFile(sc, path, minPartitions) instrumented.value.gcALL parsed.map { case (label, values) => RawDataLabeled(label, RawData.dense[T, U](values, typeInfo.value.NaN)) } } private def parseARFFFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[T])] = { sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#") || line.startsWith("%") || line.startsWith("@"))) .mapPartitions(it => { val toReturn = it.map(u => parseARFFRecord(u)) instrumented.value.gc() toReturn }) } private[load] def parseARFFRecord(line: String): (Double, Array[T]) = { val items = line.split(',') val label = Math.max(items.last.toDouble, 0) val values = items.dropRight(1).filter(_.nonEmpty).map({ try { typeInfo.value.fromString } catch { case e : NumberFormatException => { println("Malformed input. Details: \n"+e.getMessage) System.exit(1) null } case e : Exception => { e.printStackTrace() System.exit(1) null } } }) (label, values) } }
Example 193
Source File: LibSVMUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawData, RawDataLabeled} import reforest.rf.RFCategoryInfo import reforest.util.GCInstrumented import scala.reflect.ClassTag /** * Forked from Apache Spark MLlib * Load data in LibSVM format * * @param typeInfo the type information of the raw data * @param instrumented the instrumentation of the GC * @param categoryInfo the information for the categorical features * @tparam T raw data type * @tparam U working data type */ class LibSVMUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] { override def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] = { val parsed = parseLibSVMFile(sc, path, minPartitions) instrumented.value.gcALL parsed.map { case (label, indices, values) => RawDataLabeled(label, RawData.sparse[T, U](numFeatures, indices, values, typeInfo.value.NaN).compressed) } } private def parseLibSVMFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[Int], Array[T])] = { sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#"))) .mapPartitions(it => { val toReturn = it.map(u => parseLibSVMRecord(u)) instrumented.value.gc() toReturn }) } private[load] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[T]) = { val items = line.split(' ') val label = Math.max(items.head.toDouble, 0) val (indices, values) = items.tail.filter(_.nonEmpty).flatMap { item => try { val indexAndValue = item.split(':') val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based val value = typeInfo.value.fromString(indexAndValue(1)) if (categoryInfo.value.isCategorical(index)) { Some((index, typeInfo.value.fromInt(categoryInfo.value.rawRemapping(typeInfo.value.toInt(value))))) } else { Some((index, value)) } } catch { case e : NumberFormatException => { println("Malformed input. Details: \n"+e.getMessage) System.exit(1) None } case e : Exception => { e.printStackTrace() System.exit(1) None } } }.unzip // check if indices are one-based and in ascending order var previous = -1 var i = 0 val indicesLength = indices.length while (i < indicesLength) { val current = indices(i) require(current > previous, s"indices should be one-based and in ascending order;" + " found current=$current, previous=$previous; line=\"$line\"") previous = current i += 1 } (label, indices, values) } }
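For intuition, parseLibSVMRecord maps one LibSVM-formatted line to a (label, indices, values) triple; the worked trace below uses an invented input line and assumes a Double TypeInfo with no categorical features.

// input line: "1 3:4.5 7:2.0"
//   label   = 1.0              (negative labels are clamped to 0 by Math.max)
//   indices = Array(2, 6)      (1-based column indices converted to 0-based)
//   values  = Array(4.5, 2.0)  (categorical features would additionally be remapped via rawRemapping)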
Example 194
Source File: SLCTreeGeneration.scala From reforest with Apache License 2.0 | 5 votes |
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package reforest.rf.slc import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data._ import reforest.data.tree.ForestManager import reforest.rf.feature.RFFeatureManager import reforest.rf.parameter.RFParameter import reforest.rf.{RFSkip, RFStrategy, RFTreeGeneration} import reforest.util._ class SLCTreeGeneration[T, U](@transient private val sc: SparkContext, property: Broadcast[RFParameter], typeInfo: Broadcast[TypeInfo[T]], typeInfoWorking: Broadcast[TypeInfo[U]], sampleSize: Long) extends Serializable { var fcsExecutor : Option[SLCExecutor[T, U]] = Option.empty def findBestCutSLC(dataIndex: RDD[StaticData[U]], forestManager: ForestManager[T, U], featureManager: RFFeatureManager, depthToStop : Int, instrumented: Broadcast[GCInstrumented], skip : RFSkip): ForestManager[T, U] = { if (featureManager.getActiveNodesNum <= 0) { forestManager } else { var toReturn = forestManager val splitterManagerBC = sc.broadcast(forestManager.splitterManager) if(fcsExecutor.isEmpty) { fcsExecutor = Some(SLCExecutor.build(sc, typeInfo, typeInfoWorking, property, splitterManagerBC, sampleSize)) } toReturn = fcsExecutor.get.executeSLC(toReturn, featureManager, dataIndex, depthToStop, skip) splitterManagerBC.unpersist() toReturn } } }
Example 195
Source File: CCUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.util import org.apache.commons.io.FilenameUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.{SparkConf, SparkContext} import reforest.TypeInfo import reforest.data.load.{ARFFUtil, DataLoad, LibSVMUtil} import reforest.rf.RFCategoryInfo import reforest.rf.parameter.RFParameter import scala.reflect.ClassTag object CCUtil { // enclosing object; declaration dropped in the listing, name inferred from the source file def getDataLoader[T:ClassTag, U:ClassTag](property : RFParameter, typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]): DataLoad[T, U] = { val extension = FilenameUtils.getExtension(property.dataset).toUpperCase() property.fileType match { case "LIBSVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "SVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "ARFF" => new ARFFUtil(typeInfo, instrumented, categoryInfo) case _ => new LibSVMUtil(typeInfo, instrumented, categoryInfo) } } }
Example 196
Source File: ReForeStLoader.scala From reforest with Apache License 2.0 | 5 votes |
package reforest

import org.apache.commons.math3.distribution.PoissonDistribution
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.data.{RawDataLabeled, RawDataset, StaticData}
import reforest.data.tree.ForestManager
import reforest.rf.parameter.RFParameter
import reforest.rf.split.RFSplitterManager
import reforest.rf.{RFCategoryInfo, RFDataPrepare, RFStrategy}
import reforest.util.{GCInstrumented, GCInstrumentedEmpty, MemoryUtil}

class ReForeStLoader[T, U](@transient private val sc: SparkContext,
                           parameter: Broadcast[RFParameter],
                           strategyBC: Broadcast[RFStrategy[T, U]],
                           val typeInfoBC: Broadcast[TypeInfo[T]],
                           val typeInfoWorkingBC: Broadcast[TypeInfo[U]],
                           val categoricalFeaturesInfoBC: Broadcast[RFCategoryInfo],
                           rawDataset: RawDataset[T, U]) extends Serializable {

  val instrumented: Broadcast[GCInstrumented] = sc.broadcast(new GCInstrumentedEmpty)
  val dataPrepare = new RFDataPrepare[T, U](typeInfoBC, instrumented, strategyBC, false, 1)

  private var memoryUtil: Option[MemoryUtil] = Option.empty
  private var forestManager: Option[ForestManager[T, U]] = Option.empty
  private var workingData: Option[RDD[StaticData[U]]] = Option.empty
  private var previousWorkingData: Option[RDD[StaticData[U]]] = Option.empty
  private var splitterManager: Option[RFSplitterManager[T, U]] = Option.empty

  def testdatafreeze(): Unit = {
    rawDataset.testingData.persist(parameter.value.storageLevel)
  }

  def trainingdatafreeze(): Unit = {
    // rawDataset.trainingData.persist(property.storageLevel)
    rawDataset.trainingData.count()
  }

  def getRawDataset = rawDataset

  def getTestingData: RDD[RawDataLabeled[T, U]] = rawDataset.testingData

  def getMemoryUtil = memoryUtil

  def getForestManager = forestManager

  def getWorkingData(numTrees: Int = parameter.value.getMaxNumTrees,
                     macroIteration: Int = 0,
                     skipPreparation: Boolean = false) = {
    val timePreparationSTART = System.currentTimeMillis()

    if (skipPreparation) {
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), splitterManager.get))
      previousWorkingData = workingData

      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(forestManager.get.splitterManager.getSplitter(macroIteration)),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

      // workingData = Some(workingData.get.mapPartitionsWithIndex{case (partitionIndex, elements) =>
      //   strategyBC.value.reGenerateBagging(numTrees, partitionIndex, elements)})

      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()
      if (previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }

      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION SKIPPED INIT (" + dataSize + "): " + (timePreparationEND - timePreparationSTART))
      workingData.get
    } else {
      previousWorkingData = workingData

      val zzz = strategyBC.value.findSplits(rawDataset.trainingData, typeInfoBC, typeInfoWorkingBC, instrumented, categoricalFeaturesInfoBC)
      splitterManager = Some(zzz._1)
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), zzz._1))
      memoryUtil = Some(zzz._2)
      val splitter = forestManager.get.splitterManager.getSplitter(macroIteration)

      // TODO the broadcast of the splitter must be unpersisted!!!
      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(splitter),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()
      if (previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }

      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION: " + (timePreparationEND - timePreparationSTART))
      workingData.get
    }
  }
}
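A rough usage sketch for the loader. The broadcast values, the RawDataset, and the concrete type parameters (Double features, Byte working representation) are assumptions for illustration, not taken from the project:

// Hypothetical wiring; all *BC broadcasts and rawDataset are assumed to be prepared elsewhere.
val loader = new ReForeStLoader[Double, Byte](sc, parameterBC, strategyBC,
  typeInfoBC, typeInfoWorkingBC, categoryInfoBC, rawDataset)

loader.trainingdatafreeze()                                    // materializes the training RDD
val prepared = loader.getWorkingData()                         // full pass: computes splits and caches the working data
val reused   = loader.getWorkingData(skipPreparation = true)   // reuses the splitter from the first pass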
Example 197
Source File: CVLogPerplexity.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA

import breeze.linalg.sum
import org.apache.spark.{SparkConf, SparkContext}
import edu.uci.eecs.spectralLDA.algorithm._
import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.linalg._

object CVLogPerplexity {
  def main(args: Array[String]) = {
    val conf: SparkConf = new SparkConf().setAppName(s"Spectral LDA")
    val sc: SparkContext = new SparkContext(conf)

    val cv = args(0).toInt
    val documentsPath = args(1)
    val k = args(2).toInt
    val alpha0 = args(3).toDouble
    val maxIterations = args(4).toInt
    val tol = args(5).toDouble
    val minWords = args(6).toInt

    val docs = sc.objectFile[(Long, breeze.linalg.SparseVector[Double])](documentsPath)
      .filter { case (_, tc) => sum(tc) >= minWords }

    for (i <- 0 until cv) {
      // Hold out 10% of the documents in each cross-validation round.
      val splits = docs.randomSplit(Array[Double](0.9, 0.1))
      computeLogLikelihood(splits, k, alpha0, maxIterations, tol)
    }

    sc.stop()
  }

  def computeLogLikelihood(splits: Array[RDD[(Long, breeze.linalg.SparseVector[Double])]],
                           k: Int,
                           alpha0: Double,
                           maxIterations: Int,
                           tol: Double): Unit = {
    val numTestTokens = splits(1)
      .map { case (_, tc) => breeze.linalg.sum(tc) }
      .reduce(_ + _)

    val tensorLDA = new TensorLDA(
      dimK = k,
      alpha0 = alpha0,
      maxIterations = maxIterations,
      tol = tol
    )
    val (beta, alpha, _, _, m1) = tensorLDA.fit(splits(0))

    // Append a background "dummy" topic built from the corpus first moment m1
    // before evaluating the held-out likelihood.
    val augBeta = breeze.linalg.DenseMatrix.zeros[Double](beta.rows, k + 1)
    val augAlpha = breeze.linalg.DenseVector.ones[Double](alpha.length + 1)
    augBeta(::, 0 until k) := beta
    val dummyTopic = m1 + 0.1 * breeze.linalg.DenseVector.ones[Double](beta.rows) / beta.rows.toDouble
    augBeta(::, k) := dummyTopic / sum(dummyTopic)
    augAlpha(0 until k) := alpha

    val tensorLDAModel = new TensorLDAModel(augBeta, augAlpha)
    val tensorLDALogL = tensorLDAModel.logLikelihood(splits(1), smoothing = 1e-6, maxIterations = 50)
    println(s"Tensor LDA log-perplexity no extra smoothing: ${-tensorLDALogL / numTestTokens}")

    // Convert the Breeze sparse vectors into MLlib vectors for the variational LDA baseline.
    val trainMapped: RDD[(Long, Vector)] = splits(0).map {
      case (id, tc) =>
        val (idx, v) = tc.activeIterator.toArray.unzip
        (id, new SparseVector(tc.length, idx, v))
    }
    val testMapped: RDD[(Long, Vector)] = splits(1).map {
      case (id, tc) =>
        val (idx, v) = tc.activeIterator.toArray.unzip
        (id, new SparseVector(tc.length, idx, v))
    }

    val ldaOptimizer = new OnlineLDAOptimizer()
      .setMiniBatchFraction(0.05)
    val lda = new LDA()
      .setOptimizer(ldaOptimizer)
      .setMaxIterations(80)
      .setK(k)
      .setDocConcentration(alpha0 / k.toDouble)
      .setBeta(1.0)

    val ldaModel: LDAModel = lda.run(trainMapped)
    val ldaLogL = ldaModel.asInstanceOf[LocalLDAModel].logLikelihood(testMapped)
    println(s"Variational Inference log-perplexity: ${-ldaLogL / numTestTokens}")
  }
}
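To make the seven positional arguments explicit, a sketch of a local invocation follows; the argument values are placeholders, and in practice the job is launched through spark-submit, which supplies the master URL (the SparkConf above does not set one):

// Hypothetical invocation; values are placeholders, not recommended settings.
CVLogPerplexity.main(Array(
  "5",                          // cv: number of 90/10 cross-validation rounds
  "hdfs:///data/word-counts",   // documentsPath: RDD[(Long, SparseVector[Double])] saved with saveAsObjectFile
  "20",                         // k: number of topics
  "5.0",                        // alpha0: sum of the Dirichlet prior
  "200",                        // maxIterations for the tensor decomposition
  "1e-6",                       // tol: convergence tolerance
  "10"                          // minWords: drop documents with fewer tokens than this
))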
Example 198
Source File: SimpleTokenizer.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.textprocessing

import java.text.BreakIterator

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable

class SimpleTokenizer(sc: SparkContext, stopwordFile: String) extends Serializable {

  private val stopwords: Set[String] = if (stopwordFile.isEmpty) {
    Set.empty[String]
  } else {
    val stopwordText = sc.textFile(stopwordFile).collect()
    stopwordText.flatMap(_.stripMargin.split("\\s+")).toSet
  }

  // Matches sequences of Unicode letters
  private val allWordRegex = "^(\\p{L}*)$".r

  // Ignore words shorter than this length.
  private val minWordLength = 3

  def getWords(text: String): IndexedSeq[String] = {
    val words = new mutable.ArrayBuffer[String]()

    // Use Java BreakIterator to tokenize text into words.
    val wb = BreakIterator.getWordInstance
    wb.setText(text)

    // current,end index start,end of each word
    var current = wb.first()
    var end = wb.next()
    while (end != BreakIterator.DONE) {
      // Convert to lowercase
      val word: String = text.substring(current, end).toLowerCase
      // Remove short words and strings that aren't only letters
      word match {
        case allWordRegex(w) if w.length >= minWordLength && !stopwords.contains(w) =>
          words += w
        case _ =>
      }

      current = end
      try {
        end = wb.next()
      } catch {
        case e: Exception =>
          // Ignore remaining text in line.
          // This is a known bug in BreakIterator (for some Java versions),
          // which fails when it sees certain characters.
          end = BreakIterator.DONE
      }
    }
    words
  }
}
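A minimal sketch of how the tokenizer might be wired into a corpus-loading step; the input path is a placeholder and the empty stopword file simply disables stopword filtering:

// Hypothetical usage: turn raw text lines into (docId, tokens) pairs.
val tokenizer = new SimpleTokenizer(sc, stopwordFile = "")   // "" means: no stopword list
val corpus: RDD[(Long, IndexedSeq[String])] =
  sc.textFile("docs.txt")
    .zipWithIndex()
    .map { case (text, docId) => (docId, tokenizer.getWords(text)) }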
Example 199
Source File: RandNLATest.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.utils

import breeze.linalg._
import breeze.linalg.qr.QR
import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Uniform}
import edu.uci.eecs.spectralLDA.testharness.Context
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.SparkContext
import org.scalatest._

class RandNLATest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "M2 sketching" should "be correct" in {
    val a1 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a2 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a3 = SparseVector(DenseVector.rand[Double](100).toArray)
    val docs = Seq((1000L, a1), (1001L, a2), (1002L, a3))
    val docsRDD = sc.parallelize(docs)

    // Random Gaussian matrix
    val g = DenseMatrix.rand[Double](100, 50, Gaussian(mu = 0.0, sigma = 1.0))

    val result = DenseMatrix.zeros[Double](100, 50)
    docsRDD
      .flatMap {
        case (id: Long, w: SparseVector[Double]) =>
          RandNLA.accumulate_M_mul_S(g, w, sum(w))
      }
      .reduceByKey(_ + _)
      .collect
      .foreach {
        case (r: Int, a: DenseVector[Double]) =>
          result(r, ::) := a.t
      }

    val m2 = docsRDD
      .map {
        case (id: Long, w: SparseVector[Double]) =>
          val l = sum(w)
          (w * w.t - diag(w)) / (l * (l - 1.0))
      }
      .reduce(_ + _)

    val expectedResult = m2 * g

    val diff: DenseMatrix[Double] = result - expectedResult
    val normDiff: Double = norm(norm(diff(::, *)).toDenseVector)
    normDiff should be <= 1e-8
  }

  "Randomised Power Iteration method" should "be approximately correct" in {
    implicit val randBasis: RandBasis =
      new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(234787)))

    val n = 100
    val k = 5

    val alpha: DenseVector[Double] = DenseVector[Double](25.0, 20.0, 15.0, 10.0, 5.0)
    val beta: DenseMatrix[Double] = DenseMatrix.rand(n, k, Uniform(0.0, 1.0))
    val norms = norm(beta(::, *)).toDenseVector
    for (j <- 0 until k) {
      beta(::, j) /= norms(j)
    }

    val a: DenseMatrix[Double] = beta * diag(alpha) * beta.t

    val sigma: DenseMatrix[Double] = DenseMatrix.rand(n, k, Gaussian(mu = 0.0, sigma = 1.0))
    val y = a * sigma
    val QR(q: DenseMatrix[Double], _) = qr.reduced(y)

    val (s: DenseVector[Double], u: DenseMatrix[Double]) = RandNLA.decomp2(a * q, q)

    val diff_a = u * diag(s) * u.t - a
    val norm_diff_a = norm(norm(diff_a(::, *)).toDenseVector)
    norm_diff_a should be <= 1e-8
  }
}
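For reference, the second test follows the usual randomized range-finder argument (a sketch of the reasoning, not code from the project): with A symmetric of rank k and Q an orthonormal basis of Y = A * Sigma,

    A ≈ Q Q^T A Q Q^T,    Q^T A Q = V diag(s) V^T,    U = Q V  =>  A ≈ U diag(s) U^T

so an eigendecomposition of the small k x k matrix Q^T A Q reproduces A up to numerical error, which is exactly what the final norm check asserts.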
Example 200
Source File: TensorLDAModelTest.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.algorithm

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector, norm}
import breeze.numerics.abs
import org.scalatest._

import org.apache.spark.SparkContext
import edu.uci.eecs.spectralLDA.testharness.Context

class TensorLDAModelTest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "Multinomial log-likelihood" should "be correct" in {
    val p = DenseVector[Double](0.2, 0.5, 0.3)
    val x1 = DenseVector[Double](20, 50, 30)
    val x2 = DenseVector[Double](40, 40, 20)

    abs(TensorLDAModel.multinomialLogLikelihood(p, x1) - (-4.697546)) should be <= 1e-6
    abs(TensorLDAModel.multinomialLogLikelihood(p, x2) - (-15.42038)) should be <= 1e-6
  }
}
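The expected constants can be reproduced by hand from the multinomial log-likelihood with the combinatorial term included (a worked check, not project code):

    log L(x; p) = log( n! / (x_1! * ... * x_k!) ) + sum_i x_i * log(p_i),    n = sum_i x_i

For p = (0.2, 0.5, 0.3) and x1 = (20, 50, 30): sum_i x_i * log(p_i) ≈ -102.9653 and the log multinomial coefficient is ≈ 98.2678, so log L ≈ -4.6975, matching the expected -4.697546; the same computation for x2 = (40, 40, 20) gives ≈ -15.42038.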