org.apache.spark.SparkContext Scala Examples
The following examples show how to use org.apache.spark.SparkContext.
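Before the project-specific examples, here is a minimal, self-contained sketch of the typical SparkContext lifecycle (not taken from any of the projects below): build a SparkConf, create the context, run a simple job, and stop the context when done.

package example

import org.apache.spark.{SparkConf, SparkContext}

object SparkContextMinimalExample {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark locally with one worker thread per core;
    // in a real deployment the master is usually supplied by spark-submit.
    val conf = new SparkConf()
      .setAppName("SparkContextMinimalExample")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // A trivial job: sum the numbers 1 to 10 in parallel.
    val sum = sc.parallelize(1 to 10).reduce(_ + _)
    println(s"sum = $sum")

    // Always stop the context so resources are released.
    sc.stop()
  }
}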
Example 1
Source File: DeltaQA.scala From spark-tools with Apache License 2.0 | 12 votes |
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff = left - right
    val error = diff * diff
    DeltaCommon(
      count = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff = diff,
      error = error,
      left = left,
      right = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {
    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id
            .copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id
            .copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] = sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))
  }
}
Example 2
Source File: Test1.scala From BigData-News with Apache License 2.0 | 12 votes |
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)
    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)

    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)
    // Prints student ID (学号), name (名字) and score (分数) for each joined record.
    rdd1.join(rdd2).foreach(x => println("学号: " + x._1 + " 名字: " + x._2._1 + " 分数: " + x._2._2))
  }
}
Example 3
Source File: CleanupUtil.scala From hazelcast-spark with Apache License 2.0 | 7 votes |
package com.hazelcast.spark.connector.util

import com.hazelcast.spark.connector.util.ConnectionUtil.closeAll
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}

object CleanupUtil {

  val jobIds: collection.mutable.Map[Int, Seq[Int]] = collection.mutable.Map[Int, Seq[Int]]()
  val cleanupJobRddName: String = "HazelcastResourceCleanupJob"

  def addCleanupListener(sc: SparkContext): Unit = {
    sc.addSparkListener(new SparkListener {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
        this.synchronized {
          jobStart.stageInfos.foreach(info => {
            info.rddInfos.foreach(rdd => {
              if (!cleanupJobRddName.equals(rdd.name)) {
                val ids: Seq[Int] = info.rddInfos.map(_.id)
                val maybeIds: Option[Seq[Int]] = jobIds.get(jobStart.jobId)
                if (maybeIds.isDefined) {
                  jobIds.put(jobStart.jobId, ids ++ maybeIds.get)
                } else {
                  jobIds.put(jobStart.jobId, ids)
                }
              }
            })
          })
        }
      }

      override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
        this.synchronized {
          if (jobIds.contains(jobEnd.jobId)) {
            try {
              val workers = sc.getConf.getInt("spark.executor.instances", sc.getExecutorStorageStatus.length)
              val rddId: Option[Seq[Int]] = jobIds.get(jobEnd.jobId)
              if (rddId.isDefined) {
                sc.parallelize(1 to workers, workers).setName(cleanupJobRddName).foreachPartition(it => closeAll(rddId.get))
              }
              jobIds -= jobEnd.jobId
            } catch {
              case e: Exception =>
            }
          }
        }
      }
    })
  }
}
Example 4
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
          s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: OperatorsDSL.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection._
import ml.sparkling.graph.api.operators.measures.{EdgeMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN._
import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor
import ml.sparkling.graph.operators.measures.edge.{AdamicAdar, CommonNeighbours}
import ml.sparkling.graph.operators.measures.vertex.{Degree, NeighborhoodConnectivity, VertexEmbeddedness}
import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering
import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality, Modularity}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning._
import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness
import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality
import ml.sparkling.graph.operators.measures.vertex.hits.Hits
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

object OperatorsDSL {

  implicit class ModularityDSL[E: ClassTag](graph: Graph[ComponentID, E]) {
    def modularity() = Modularity.compute(graph)
  }

  implicit class DSL[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) {

    def PSCAN(epsilon: Double = 0.1) =
      computeConnectedComponents(graph, epsilon)

    def LPCoarse(treatAsUndirected: Boolean = false) =
      LPCoarsening.coarse(graph, treatAsUndirected = treatAsUndirected)

    def closenessCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Closeness.compute(graph, vertexMeasureConfiguration)

    def eigenvectorCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      EigenvectorCentrality.compute(graph, vertexMeasureConfiguration)

    def hits(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Hits.compute(graph, vertexMeasureConfiguration)

    def degreeCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      Degree.compute(graph, vertexMeasureConfiguration)

    def neighborhoodConnectivity(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      NeighborhoodConnectivity.compute(graph, vertexMeasureConfiguration)

    def vertexEmbeddedness(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      VertexEmbeddedness.compute(graph, vertexMeasureConfiguration)

    def localClustering(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED] = VertexMeasureConfiguration())(implicit num: Numeric[ED]) =
      LocalClustering.compute(graph, vertexMeasureConfiguration)

    def freemanCentrality() = FreemanCentrality.compute(graph)

    def partitionBy(communityDetectionMethod: CommunityDetectionMethod[VD, ED])(implicit sc: SparkContext) =
      partitionGraphBy(graph, communityDetectionMethod)

    def partitionBy(communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext) =
      partitionGraphUsing(graph, communityDetectionMethod, numParts)

    def adamicAdar(treatAsUndirected: Boolean = false) = {
      AdamicAdar.computeWithPreprocessing(graph, treatAsUndirected)
    }

    def commonNeighbours(treatAsUndirected: Boolean = false) = {
      CommonNeighbours.computeWithPreprocessing(graph, treatAsUndirected)
    }

    def predictLinks[EV: ClassTag, EO: ClassTag](
        edgeMeasure: EdgeMeasure[EO, EV], threshold: EO, treatAsUndirected: Boolean = false)(implicit num: Numeric[EO]) = {
      BasicLinkPredictor.predictLinks(graph, edgeMeasure, threshold, treatAsUndirected)
    }
  }
}
Example 7
Source File: HBase.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

// NOTE: this listing is an excerpt; the enclosing class declaration and the
// hBaseConfiguration field it defines are omitted in the original snippet.

  def existRowKey(row: String, table: Table): Boolean = {
    val get = new Get(row.getBytes())
    val result = table.get(get)

    if (result.isEmpty) {
      warn("hbase table don't have this data, execute insert")
      return false
    }

    true
  }

  def getConfiguration = if (hBaseConfiguration == null) {
    warn("hbase setDefaultConfiguration....")
    setDefaultConfiguration
  } else hBaseConfiguration

  def setDefaultConfiguration = {
    hBaseConfiguration = HBaseConfiguration.create
    // Options needed for local testing; on a cluster they are picked up
    // automatically from the corresponding configuration files.
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1") // nameservice path
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2") // the namenodes
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // Class implementing automatic namenode failover
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1",
      "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConfiguration
  }
} // closes the enclosing class omitted above
Example 8
Source File: TFIDF.scala From AI with Apache License 2.0 | 6 votes |
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line). Each line is treated as one document;
    // zipWithIndex uses the line number as the document id.
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // feature number
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // Use the line number as the doc id and turn each line's tokens into a tf vector.
    val idAndTFVector = documents.map {
      case (seq, num) =>
        val tf = hashingTF.transform(seq)
        (num + 1, tf)
    }
    idAndTFVector.cache()

    // build idf model
    val idf = new IDF().fit(idAndTFVector.values)

    // transform tf vector to tf-idf vector
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // broadcast tf-idf vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // cal doc cosineSimilarity
    val docSims = idAndTFIDFVector.flatMap {
      case (id1, idf1) =>
        // filter the same doc id
        val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
        val sv1 = idf1.asInstanceOf[SV]
        import breeze.linalg._
        val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
        idfs.map {
          case (id2, idf2) =>
            val sv2 = idf2.asInstanceOf[SV]
            val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
            val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
            (id1, id2, cosSim)
        }
    }
    docSims.foreach(println)

    sc.stop()
  }
}
Example 9
Source File: SqlUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 6 votes |
package com.cloudera.sa.spark.unittest.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable

class SqlUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var hiveContext: HiveContext = null

  override def beforeAll(): Unit = {
    val envMap = Map[String, String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    hiveContext = new HiveContext(sc)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Test table creation and summing of counts") {
    val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
      Row("tj", 11, "green"),
      Row("andrew", 9, "green")))

    hiveContext.sql("create table person (name string, age int, color string)")

    val emptyDataFrame = hiveContext.sql("select * from person limit 0")

    val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
    personDataFrame.registerTempTable("tempPerson")

    val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")

    val localAgeSum = ageSumDataFrame.take(10)

    assert(localAgeSum(0).get(0) == 62, "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
  }
}
Example 10
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")

  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)

  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)
  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 11
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 12
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.reduceByWindow((x, y) => x + y, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 13
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_KafkaStream {
  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)

    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // Obtain the StreamingContext (restored from the checkpoint if one exists)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(brokerList: String,
      consumeTopic: String,
      checkpointDir: String,
      saveDir: String): () => StreamingContext = { () => {
    // NOTE: the body of this factory and the updateStateByKeyFunction below were
    // garbled in the original listing; they are reconstructed here following the
    // pattern of the neighbouring gihyo_6_3 examples and should be read as a sketch.
    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val kafkaParams = Map("metadata.broker.list" -> brokerList)
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set(consumeTopic))
    run(kafkaStream, saveDir)
    ssc
  }}

  def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
    System.out.println(values)
    Some(running.getOrElse(0) + values.length)
  }

  def run(stream: InputDStream[(String, String)],
      saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {

    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
Example 14
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import org.atilika.kuromoji.Token
import twitter4j.Status

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object gihyo_6_3_TwitterStream {
  def main(args: Array[String]) {
    if (args.length != 7) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args

    System.setProperty("twitter4j.oauth.consumerKey", cKey)
    System.setProperty("twitter4j.oauth.consumerSecret", cSecret)
    System.setProperty("twitter4j.oauth.accessToken", aToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret)

    val f = createStreamingContext(cDir, tagDir, wordDir)
    val ssc = StreamingContext.getOrCreate(cDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(checkpointDir: String,
      tagDir: String,
      wordDir: String): () => StreamingContext = { () => {

    val conf = new SparkConf().setAppName("gihyoSample_Application")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(Array(classOf[UserDic]))

    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val twitterStream = TwitterUtils.createStream(ssc, None)
    run(sc, twitterStream, tagDir, wordDir)
    ssc
  }}

  def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) {
    val tokenizer = sc.broadcast(UserDic.getInstance)
    val tweets = stream.map(tweet => tweet.getText())
    tweets.persist()
    val TweetText = tweets
      .flatMap(text => {
        val tokens = tokenizer.value.tokenize(text).toArray
        tokens.filter(t => {
          val token = t.asInstanceOf[Token]
          ((token.getPartOfSpeech.indexOf("名詞") > -1 &&
            token.getPartOfSpeech.indexOf("一般") > -1) ||
            token.getPartOfSpeech.indexOf("カスタム名詞") > -1) &&
            token.getSurfaceForm.length > 1 &&
            !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$")
        }).map(t => t.asInstanceOf[Token].getSurfaceForm)
      })
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    val TweetTags = tweets
      .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#")))
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    TweetText.saveAsTextFiles(wordDir)
    TweetTags.saveAsTextFiles(tagDir)
  }
}
// scalastyle:on println
Example 15
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow_efficient {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow(
        (a: Int, b: Int) => a + b,
        (a: Int, b: Int) => a - b,
        Seconds(windowLength),
        Seconds(slideInterval))
    userList.print
  }
}
Example 16
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = stream.map(x => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
    })
    userList.print
  }
}
Example 17
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 18
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValueAndWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val f = createStreamingContext(targetHost, targetHostPort, checkpointDir)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(
      targetHost: String,
      targetHostPort: Int,
      checkpointDir: String): () => StreamingContext = { () => {

    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)
    ssc
  }}

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
// scalastyle:on println
Example 19
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_updateStateByKey {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _)
    userList.print
  }

  def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = {
    Some(running.getOrElse(0) + values.size)
  }
}
Example 20
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 21
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Window {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue()
    userList.print
  }
}
Example 22
Source File: ReduceExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ReduceExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.reduce((x, y) => x + y)}""")
  }
}
// scalastyle:on println
Example 23
Source File: StatsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object StatsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("StatsExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11))
    val stats = nums.stats()

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""count: ${stats.count}""")
    println(s"""mean: ${stats.mean}""")
    println(s"""stdev: ${stats.stdev}""")
  }
}
// scalastyle:on println
Example 24
Source File: FoldExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FoldExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1, 6, 3), 3)
    nums.reduce((x, y) => x + y)

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""")
  }
}
// scalastyle:on println
Example 25
Source File: OrderExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object OrderExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("OrderExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1))

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""top3: ${nums.top(3).mkString(", ")}""")
    println(s"""takeOrdered3: ${nums.takeOrdered(3).mkString(", ")}""")
  }
}
// scalastyle:on println
Example 26
Source File: AggregateExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  private[basic_action] def run(sc: SparkContext) {
    val nums = sc.parallelize(Array.range(1, 11), 3)
    val acc = nums.aggregate(zeroValue = (0.0, 0))(
      seqOp = (partAcc, n) => (partAcc._1 + n, partAcc._2 + 1),
      combOp = (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
    )
    val avg = acc._1 / acc._2

    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sum: ${nums.fold(0)((x, y) => x + y)}""")
    println(s"""avg: $avg""")
  }
}
// scalastyle:on println
Example 27
Source File: CollectAsMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_action

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CollectAsMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CollectAsMapExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(
        ("Apple", 1), ("Orange", 1), ("Peach", 1),
        ("Orange", 1), ("PineApple", 1), ("Orange", 1)
      ), 3
    )
    val fruitsAsMap = fruits.collectAsMap()

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitsAsMap: $fruitsAsMap""")
  }
}
// scalastyle:on println
Example 28
Source File: PersistExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.persistence

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object PersistExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("PersistExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))

    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val lines = sc.textFile(inputFile)
    lines.count()
    lines.collect()

    val persistedLines = sc.textFile(inputFile).persist()
    persistedLines.collect()
    persistedLines.count()

    persistedLines.unpersist()
    persistedLines.collect()
  }
}
Example 29
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.Partitioner
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CustomPartitionerExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CustomPartitionerExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _)
    val customPartitioned = fruits.map((_, 1)).reduceByKey(
      new FirstLetterPartitioner(sc.defaultParallelism), _ + _)

    println(s"""fruits:\n ${fruits.collect().mkString(", ")}""")
    println()

    println("partitioned by default partitioner")
    defaultPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    println("partitioned by first letter partitioner")
    customPartitioned.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}

private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
    key.toString.charAt(0).hashCode % numPartitions match {
      case p if p < 0 => p + numPartitions
      case p => p
    }
  }

  override def equals(other: Any): Boolean = {
    other match {
      case p: FirstLetterPartitioner => p.numPartitions == numPartitions
      case _ => false
    }
  }
}
// scalastyle:on println
Example 30
Source File: PartitionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object PartitionExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("Partition")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val nums = sc.parallelize(Array(3, 2, 4, 1, 2, 1), 1)
    println(s"""nums:\n ${nums.collect().mkString(", ")}""")
    println()

    println("original:")
    nums.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar3 = nums.repartition(3)
    println("repartition to 3:")
    numsPar3.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
    println()

    val numsPar2 = numsPar3.coalesce(2)
    println("coalesce to 2:")
    numsPar2.glom().mapPartitionsWithIndex((p, it) =>
      it.map(n => s""" Par$p: ${n.mkString(",")}""")
    ).foreach(println)
  }
}
// scalastyle:on println
Example 31
Source File: WordCountExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.shared_variable

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object WordCountExample {

  def main(args: Array[String]) {
    if (args.length != 1) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("WordCountExample")
    val sc = new SparkContext(conf)

    run(sc, args(0))

    sc.stop()
  }

  def run(sc: SparkContext, inputFile: String) {
    val stopWordCount = sc.accumulator(0L)
    val stopWords = sc.broadcast(Set("a", "an", "for", "in", "on"))

    val lines = sc.textFile(inputFile)
    val words = lines.flatMap(_.split(" ")).filter(!_.isEmpty)
    val wordCounts = words.map(w => (w, 1)).reduceByKey(_ + _).filter { w =>
      val result = !stopWords.value.contains(w._1)
      if (!result) stopWordCount += 1L
      result
    }
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)

    println(s"""wordCounts: ${sortedWordCounts.take(10).mkString(", ")}""")
    println(s"""stopWordCounts: ${stopWordCount.value}""")
  }
}
// scalastyle:on println
Example 32
Source File: AggregateByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object AggregateByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("AggregateByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    // Acc is a small (sum, count) accumulator case class defined elsewhere in this example package.
    val fruitCountAvgs = fruits.aggregateByKey(zeroValue = Acc(0.0, 0))(
      seqOp = (partAcc, n) => partAcc += n,
      combOp = (acc1, acc2) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 33
Source File: MapValuesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapValuesExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapValuesExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(("Apple", 1), ("Orange", 4), ("Apple", 2), ("Peach", 1)))
    val plusOnes = fruits.mapValues(v => v + 1)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""plusOnes: ${plusOnes.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 34
Source File: SortByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SortByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SortByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val sortedByKeyAsc = fruits.sortByKey(ascending = false)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""sortedByKeyAsc: ${sortedByKeyAsc.collect().mkString(", ")}""")

    val nums = sc.parallelize(
      Array(("One", 1), ("Hundred", 100), ("Three", 3), ("Thousand", 1000)))
    implicit val sortByStrLen = new Ordering[String] {
      def compare(x: String, y: String): Int = x.length - y.length
    }
    val sortedByKeyLength = nums.sortByKey()

    println()
    println(s"""nums: ${nums.collect().mkString(", ")}""")
    println(s"""sortedByKeyLength: ${sortedByKeyLength.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 35
Source File: CoGroupExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CoGroupExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CoGroupExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

    val grouped = persons.map(_.swap).cogroup(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities: ${cities.collect().mkString(", ")}""")
    println()
    println(s"""grouped:\n${grouped.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 36
Source File: JoinExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object JoinExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("JoinExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val persons = sc.parallelize(Array(
      ("Adam", "San francisco"),
      ("Bob", "San francisco"),
      ("Taro", "Tokyo"),
      ("Charles", "New York")
    ))
    val cities = sc.parallelize(Array(
      ("Tokyo", "Japan"),
      ("San francisco", "America"),
      ("Beijing", "China")
    ))

    val leftJoined = persons.map(_.swap).join(cities)
    val leftOuterJoined = persons.map(_.swap).leftOuterJoin(cities)
    val rightOuterJoined = persons.map(_.swap).rightOuterJoin(cities)
    val fullOuterJoined = persons.map(_.swap).fullOuterJoin(cities)

    println(s"""persons: ${persons.collect().mkString(", ")}""")
    println(s"""cities: ${cities.collect().mkString(", ")}""")
    println()
    println(s"""leftJoined:\n${leftJoined.collect().mkString("\n")}""")
    println()
    println(s"""leftOuterJoined:\n${leftOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""rightOuterJoined:\n${rightOuterJoined.collect().mkString("\n")}""")
    println()
    println(s"""fullOuterJoined:\n${fullOuterJoined.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 37
Source File: GroupByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object GroupByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("GroupByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    val grouped = fruits.groupByKey()

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""grouped: ${grouped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 38
Source File: ReduceByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ReduceByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("ReduceByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1),
      ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.reduceByKey((x, y) => x + y)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 39
Source File: CombineByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object CombineByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("CombineByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(
      Array(("Apple", 6), ("Orange", 1), ("Apple", 2), ("Orange", 5), ("PineApple", 1)))
    // Acc is a small (sum, count) accumulator case class defined elsewhere in this example package.
    val fruitCountAvgs = fruits.combineByKey(
      createCombiner = (v: Int) => Acc(v.toDouble, 1),
      mergeValue = (partAcc: Acc, n: Int) => partAcc += n,
      mergeCombiners = (acc1: Acc, acc2: Acc) => acc1 ++= acc2
    ).mapValues(acc => acc.sum / acc.count)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCountAvgs: ${fruitCountAvgs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 40
Source File: FoldByKeyExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.pairrdd_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FoldByKeyExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FoldByKeyExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array(
      ("Apple", 1), ("Orange", 1), ("Peach", 1),
      ("Orange", 1), ("PineApple", 1), ("Orange", 1)))
    val fruitCounts = fruits.foldByKey(0)((x, y) => x + y)

    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""fruitCounts: ${fruitCounts.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 41
Source File: MapPartitionsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapPartitionsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("MapPartitionsExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val jsonLines = sc.parallelize(Array(
      """{"name": "Apple", "num": 1}""",
      """{"name": "Orange", "num": 4}""",
      """{"name": "Apple", "num": 2}""",
      """{"name": "Peach", "num": 1}"""
    ))

    val parsed = jsonLines.mapPartitions { lines =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      lines.map { line =>
        val f = mapper.readValue(line, classOf[Map[String, String]])
        (f("name"), f("num"))
      }
    }

    println(s"""json:\n${jsonLines.collect().mkString("\n")}""")
    println()
    println(s"""parsed:\n${parsed.collect().mkString("\n")}""")
  }
}
// scalastyle:on println
Example 42
Source File: FlatMapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FlatMapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("FlatMapExample")
    val sc = new SparkContext(conf)

    run(sc)

    sc.stop()
  }

  def run(sc: SparkContext) {
    val lines = sc.parallelize(Array("Apple is red", "PineApple is yellow"))
    val words = lines.flatMap(line => line.split(" "))

    println(s"""lines: ${lines.collect().mkString(", ")}""")
    println(s"""words: ${words.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 43
Source File: SetOperationsExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SetOperationsExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SetOperationsExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(Array("Grape", "Apple", "Banana", "Orange"))

    val union = fruits1.union(fruits2)
    val subtract = fruits1.subtract(fruits2)
    val intersection = fruits1.intersection(fruits2)
    val cartesian = fruits1.cartesian(fruits2)

    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""union: ${union.collect().mkString(", ")}""")
    println(s"""subtract: ${subtract.collect().mkString(", ")}""")
    println(s"""intersection: ${intersection.collect().mkString(", ")}""")
    println(s"""cartesian: ${cartesian.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 44
Source File: MapExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object MapExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("MapExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val lengths = fruits.map(fruit => fruit.length)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""lengths: ${lengths.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
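When the derived value should be kept alongside the original element, mapping to a pair is a common variant. A sketch assuming the fruits RDD above, not part of the original file:

    val withLengths = fruits.map(fruit => (fruit, fruit.length))  // e.g. ("Apple", 5)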
Example 45
Source File: ZipExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object ZipExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("ZipExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits1 = sc.parallelize(
      Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val fruits2 = sc.parallelize(
      Array("りんご", "オレンジ", "桃", "オレンジ", "パイナップル", "オレンジ"))
    val zipped = fruits1.zip(fruits2)
    println(s"""fruits1: ${fruits1.collect().mkString(", ")}""")
    println(s"""fruits2: ${fruits2.collect().mkString(", ")}""")
    println(s"""zipped: ${zipped.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
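zip requires both RDDs to have the same number of partitions and the same number of elements per partition. When only a positional index is needed, zipWithIndex has no such requirement. A sketch assuming the fruits1 RDD above, not part of the original file:

    val indexed = fruits1.zipWithIndex()  // e.g. ("Apple", 0), ("Orange", 1), ...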
Example 46
Source File: DistinctExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object DistinctExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("DistinctExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val uniques = fruits.distinct()
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""uniques: ${uniques.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
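If the goal is to count how often each distinct element appears rather than just to deduplicate, countByValue returns the tally as a local Map. A sketch assuming the fruits RDD above, not part of the original file:

    val counts: scala.collection.Map[String, Long] = fruits.countByValue()
    println(s"""counts: ${counts.mkString(", ")}""")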
Example 47
Source File: SampleExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object SampleExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("SampleExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val samples = fruits.sample(withReplacement = false, 0.5, 1)
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""samples: ${samples.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
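sample only approximates the requested fraction and returns an RDD; when an exact number of elements is needed as a local collection, takeSample can be used instead. A sketch assuming the fruits RDD above, not part of the original file:

    val exactThree = fruits.takeSample(withReplacement = false, num = 3, seed = 1)
    println(s"""exactThree: ${exactThree.mkString(", ")}""")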
Example 48
Source File: FilterExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.basic_transformation

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

// scalastyle:off println
object FilterExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName("FilterExample")
    val sc = new SparkContext(conf)
    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext) {
    val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange"))
    val startWithPs = fruits.filter(fruit => fruit.startsWith("P"))
    println(s"""fruits: ${fruits.collect().mkString(", ")}""")
    println(s"""startWithPs: ${startWithPs.collect().mkString(", ")}""")
  }
}
// scalastyle:on println
Example 49
Source File: JdbcExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

// scalastyle:off println
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      throw new IllegalArgumentException(
        "Invalid arguments: expected <jdbcUrl> <user> <password>")
    }
    val url = args(0)
    val user = args(1)
    val pass = args(2)

    val conf = new SparkConf().setAppName("JdbcExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext, url, user, pass)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext,
      url: String, user: String, pass: String): Unit = {
    val prop = new Properties()
    prop.setProperty("user", user)
    prop.setProperty("password", pass)
    val df: DataFrame = sqlContext.read.jdbc(url, "gihyo_spark.person", prop)

    df.printSchema()
    println("# Rows: " + df.count())
  }
}
// scalastyle:on println
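The same table can also be read through the generic DataFrameReader options instead of a Properties object. A sketch, not part of the original file, assuming the sqlContext and the url, user and pass values parsed in main above:

    val df2 = sqlContext.read
      .format("jdbc")
      .option("url", url)
      .option("dbtable", "gihyo_spark.person")
      .option("user", user)
      .option("password", pass)
      .load()
    df2.printSchema()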
Example 50
Source File: DataFrameNaFunctionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05 // scalastyle:off println import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object DataFrameNaFunctionExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("BasicDataFrameExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) run(sc, sqlContext) sc.stop() } def run( sc: SparkContext, sqlContext: SQLContext): Unit = { import sqlContext.implicits._ val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( ("Bob", 16, 176.5), ("Alice", null, 164.3), ("", 60, null), ("UNKNOWN", 25, Double.NaN), ("Amy", null, null), (null, null, Double.NaN) ).toDF("name", "age", "height") // drop nullDF.na.drop("any").show() nullDF.na.drop("all").show() nullDF.na.drop(Array("age")).show() nullDF.na.drop(Seq("age", "height")).show() nullDF.na.drop("any", Array("name", "age")).show() nullDF.na.drop("all", Array("age", "height")).show() // fill nullDF.na.fill(0.0, Array("name", "height")).show() nullDF.na.fill(Map( "name" -> "UNKNOWN", "height" -> 0.0 )).show() // replace nullDF.na.replace("name", Map("" -> "UNKNOWN")).show() } } // scalastyle:on println
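Note that na.fill(0.0, Array("name", "height")) only affects the numeric column: a Double fill value is silently ignored for string columns such as name. Filling mixed column types in one call requires the Map form. A sketch, not part of the original file, assuming the nullDF defined above:

    nullDF.na.fill(Map("name" -> "UNKNOWN", "age" -> 0, "height" -> 0.0)).show()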
Example 51
Source File: DatasetExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
}
Example 52
Source File: TestStreamingContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterEach, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import jp.gihyo.spark.ch06.UserDic private[spark] trait TestStreamingContext extends BeforeAndAfterEach { self: Suite => @transient var ssc: StreamingContext = _ @transient var sc: SparkContext = _ val master = "local[2]" val appN = "StreamingUnitTest" val bd = Seconds(1) override def beforeEach() { super.beforeEach() val conf = new SparkConf().setMaster(master) .setAppName(appN) .set("spark.streaming.clock", "org.apache.spark.util.ManualClock") .registerKryoClasses(Array(classOf[UserDic])) ssc = new StreamingContext(conf, bd) sc = ssc.sparkContext } override def afterEach() { try { if (ssc != null) { // stop with sc ssc.stop(true) } ssc = null; } finally { super.afterEach() } } }
Example 53
Source File: TestSparkContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext private[spark] trait TestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("SparkUnitTest") .set("spark.sql.shuffle.partitions", "2") sc = new SparkContext(conf) SQLContext.clearActive() sqlContext = new SQLContext(sc) SQLContext.setActive(sqlContext) } override def afterAll() { try { sqlContext = null SQLContext.clearActive() if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } }
Example 54
Source File: TestMain.scala From hbrdd with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkContext, SparkConf} object TestMain { private val master = "Core1" private val port = "7077" private val appName = "hbase-rdd_spark" private val data = "hdfs://Master1:8020/test/spark/hbase/testhb" def main(args: Array[String]) { val sparkConf = new SparkConf() .setMaster(s"spark://$master:$port") .setAppName(appName).setJars(List("/home/lele/coding/hbrdd/out/artifacts/hbrdd_jar/hbrdd.jar")) val sc = new SparkContext(sparkConf) val ret = sc.textFile(data).map({ line => val Array(k, col1, col2, _) = line split "\t" val content = Map("col1" -> col1, "col2" -> col2) k -> content }) println(ret.count()) sc.stop() } }
Example 55
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
Example 56
Source File: XmlFileSuite.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.nio.charset.{StandardCharsets, UnsupportedCharsetException} import org.apache.spark.SparkContext import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite final class XmlFileSuite extends AnyFunSuite with BeforeAndAfterAll { private val booksFile = "src/test/resources/books.xml" private val booksUnicodeInTagNameFile = "src/test/resources/books-unicode-in-tag-name.xml" private val booksFileTag = "book" private val booksUnicodeFileTag = "\u66F8" // scalastyle:ignore private val numBooks = 12 private val numBooksUnicodeInTagName = 3 private val fiasHouse = "src/test/resources/fias_house.xml" private val fiasRowTag = "House" private val numHouses = 37 private val utf8 = StandardCharsets.UTF_8.name private var sparkContext: SparkContext = _ override def beforeAll(): Unit = { super.beforeAll() sparkContext = new SparkContext("local[2]", "TextFileSuite") } override def afterAll(): Unit = { try { sparkContext.stop() sparkContext = null } finally { super.afterAll() } } test("read utf-8 encoded file") { val baseRDD = XmlFile.withCharset(sparkContext, booksFile, utf8, rowTag = booksFileTag) assert(baseRDD.count() === numBooks) } test("read file with unicode chars in row tag name") { val baseRDD = XmlFile.withCharset( sparkContext, booksUnicodeInTagNameFile, utf8, rowTag = booksUnicodeFileTag) assert(baseRDD.count() === numBooksUnicodeInTagName) } test("read utf-8 encoded file with empty tag") { val baseRDD = XmlFile.withCharset(sparkContext, fiasHouse, utf8, rowTag = fiasRowTag) assert(baseRDD.count() == numHouses) baseRDD.collect().foreach(x => assert(x.contains("/>"))) } test("unsupported charset") { val exception = intercept[UnsupportedCharsetException] { XmlFile.withCharset(sparkContext, booksFile, "frylock", rowTag = booksFileTag).count() } assert(exception.getMessage.contains("frylock")) } }
Example 57
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted import org.scalactic.Equality import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.{ Dataset, SparkSession } object SparkSuite { lazy val spark: SparkSession = { val session = SparkSession.builder .master("local[*]") .appName("test") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.ui.enabled", false) .config("spark.sql.shuffle.partitions", 4) .getOrCreate() session } lazy val sc: SparkContext = spark.sparkContext lazy val jsc = new JavaSparkContext(sc) def javaSparkContext() = jsc } trait SparkSuite { implicit lazy val spark: SparkSession = SparkSuite.spark implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] { private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size) def areEqual(a: RDD[X], b: Any): Boolean = b match { case s: Seq[_] => toCounts(a.collect) == toCounts(s) case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect) } } implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] { def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b) } implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] { def areEqual(a: Dataset[X], b: Any): Boolean = b match { case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd) case x => rddEq.areEqual(a.rdd, x) } } }
Example 58
Source File: TestUtils.scala From odsc-east-realish-predictions with Apache License 2.0 | 5 votes |
package com.twilio.open.odsc.realish import com.holdenkarau.spark.testing.{LocalSparkContext, SparkContextProvider} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, Suite} object TestUtils { } @SerialVersionUID(1L) case class UserPersonality(uuid: String, name: String, tags: Array[String]) extends Serializable @SerialVersionUID(1L) case class Author(uuid: String, name: String, age: Int) extends Serializable @SerialVersionUID(1L) case class LibraryBook(uuid: String, name: String, author: Author) extends Serializable case class MockKafkaDataFrame(key: Array[Byte], value: Array[Byte]) trait SharedSparkSql extends BeforeAndAfterAll with SparkContextProvider { self: Suite => @transient var _sparkSql: SparkSession = _ @transient private var _sc: SparkContext = _ override def sc: SparkContext = _sc def conf: SparkConf def sparkSql: SparkSession = _sparkSql override def beforeAll() { _sparkSql = SparkSession.builder().config(conf).getOrCreate() _sc = _sparkSql.sparkContext setup(_sc) super.beforeAll() } override def afterAll() { try { _sparkSql.close() _sparkSql = null LocalSparkContext.stop(_sc) _sc = null } finally { super.afterAll() } } }
Example 60
Source File: HyperLogLog.scala From spark-hyperloglog with Apache License 2.0 | 5 votes |
package com.mozilla.spark.sql.hyperloglog.test import com.mozilla.spark.sql.hyperloglog.aggregates._ import com.mozilla.spark.sql.hyperloglog.functions._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{FlatSpec, Matchers} class HyperLogLogTest extends FlatSpec with Matchers{ "Algebird's HyperLogLog" can "be used from Spark" in { val sparkConf = new SparkConf().setAppName("HyperLogLog") sparkConf.setMaster(sparkConf.get("spark.master", "local[1]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val hllMerge = new HyperLogLogMerge sqlContext.udf.register("hll_merge", hllMerge) sqlContext.udf.register("hll_create", hllCreate _) sqlContext.udf.register("hll_cardinality", hllCardinality _) val frame = sc.parallelize(List("a", "b", "c", "c"), 4).toDF("id") val count = frame .select(expr("hll_create(id, 12) as hll")) .groupBy() .agg(expr("hll_cardinality(hll_merge(hll)) as count")) .collect() count(0)(0) should be (3) } }
Example 61
Source File: ImageLoaderUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import java.awt.image.BufferedImage import java.io.{InputStream, ByteArrayInputStream} import java.net.URI import java.util.zip.GZIPInputStream import javax.imageio.ImageIO import keystoneml.loaders.VOCLoader._ import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.tar.TarArchiveInputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.utils._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag object ImageLoaderUtils extends Logging { def loadFiles[L, I <: AbstractLabeledImage[L] : ClassTag]( filePathsRDD: RDD[URI], labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, // TODO(etrain): We can probably do this with implicits. namePrefix: Option[String] = None): RDD[I] = { filePathsRDD.flatMap(fileUri => loadFile(fileUri, labelsMap, imageBuilder, namePrefix)) } private def loadFile[L, I <: AbstractLabeledImage[L]]( fileUri: URI, labelsMap: String => L, imageBuilder: (Image, L, Option[String]) => I, namePrefix: Option[String]): Iterator[I] = { val filePath = new Path(fileUri) val conf = new Configuration(true) val fs = FileSystem.get(filePath.toUri(), conf) val fStream = fs.open(filePath) val tarStream = new ArchiveStreamFactory().createArchiveInputStream( "tar", fStream).asInstanceOf[TarArchiveInputStream] var entry = tarStream.getNextTarEntry() val imgs = new ArrayBuffer[I] while (entry != null) { if (!entry.isDirectory && (namePrefix.isEmpty || entry.getName.startsWith(namePrefix.get))) { var offset = 0 var ret = 0 val content = new Array[Byte](entry.getSize().toInt) while (ret >= 0 && offset != entry.getSize()) { ret = tarStream.read(content, offset, content.length - offset) if (ret >= 0) { offset += ret } } val bais = new ByteArrayInputStream(content) val image = ImageUtils.loadImage(bais).map { img => imageBuilder(img, labelsMap(entry.getName), Some(entry.getName)) } imgs ++= image } entry = tarStream.getNextTarEntry() } imgs.iterator } }
Example 62
Source File: TimitFeaturesDataLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import breeze.linalg.DenseVector
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable

// NOTE: this snippet lost its enclosing object declaration and the private
// helpers it calls (parseSparseLabels, createLabelsRDD); only the public entry
// point survived, so the wrapper below is a reconstruction and the helpers are
// assumed to exist in the original source.
object TimitFeaturesDataLoader {

  def apply(sc: SparkContext,
      trainDataLocation: String,
      trainLabelsLocation: String,
      testDataLocation: String,
      testLabelsLocation: String,
      numParts: Int = 512): TimitFeaturesData = {
    val trainData = CsvDataLoader(sc, trainDataLocation, numParts)
    val trainLabels = createLabelsRDD(parseSparseLabels(trainLabelsLocation), trainData)

    val testData = CsvDataLoader(sc, testDataLocation, numParts)
    val testLabels = createLabelsRDD(parseSparseLabels(testLabelsLocation), testData)

    TimitFeaturesData(LabeledData(trainLabels.zip(trainData)),
      LabeledData(testLabels.zip(testData)))
  }
}
Example 63
Source File: ImageNetLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import keystoneml.utils.LabeledImage

// NOTE: this snippet lost its enclosing object declaration; it is reconstructed
// here so the apply method has a home.
object ImageNetLoader {

  def apply(sc: SparkContext, dataPath: String, labelsPath: String): RDD[LabeledImage] = {
    val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath)

    val labelsMapFile = scala.io.Source.fromFile(labelsPath)
    val labelsMap = labelsMapFile.getLines().map(x => x.toString).toArray.map { line =>
      val parts = line.split(" ")
      (parts(0), parts(1).toInt)
    }.toMap

    def labelsMapF(fname: String): Int = {
      labelsMap(fname.split('/')(0))
    }

    ImageLoaderUtils.loadFiles(filePathsRDD, labelsMapF, LabeledImage.apply)
  }
}
Example 64
Source File: VOCLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import keystoneml.pipelines.Logging
import keystoneml.utils.MultiLabeledImage

case class VOCDataPath(imagesDirName: String, namePrefix: String, numParts: Option[Int])
case class VOCLabelPath(labelsFileName: String)

// NOTE: this snippet lost its enclosing object declaration; it is reconstructed
// here so the apply method has a home.
object VOCLoader {

  def apply(sc: SparkContext,
      dataPath: VOCDataPath,
      labelsPath: VOCLabelPath): RDD[MultiLabeledImage] = {
    val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath.imagesDirName, dataPath.numParts)

    val labelsMapFile = scala.io.Source.fromFile(labelsPath.labelsFileName)
    val labelsMap: Map[String, Array[Int]] = labelsMapFile
      .getLines()
      .drop(1)
      .map(x => x.toString)
      .map { line =>
        val parts = line.split(",")
        (parts(4).replace("\"", ""), parts(1).toInt - 1)
      }
      .toArray
      .groupBy(_._1)
      .mapValues(_.map(_._2))
      .map(identity)
    labelsMapFile.close()

    ImageLoaderUtils.loadFiles(filePathsRDD, labelsMap, MultiLabeledImage.apply, Some(dataPath.namePrefix))
  }
}
Example 65
Source File: LinearPixels.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.images.cifar import breeze.linalg.DenseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.CifarLoader import keystoneml.nodes.images.{GrayScaler, ImageExtractor, ImageVectorizer, LabelExtractor} import keystoneml.nodes.learning.LinearMapEstimator import keystoneml.nodes.util.{Cacher, ClassLabelIndicatorsFromIntLabels, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.utils.Image import keystoneml.workflow.Pipeline object LinearPixels extends Logging { val appName = "LinearPixels" case class LinearPixelsConfig(trainLocation: String = "", testLocation: String = "") def run(sc: SparkContext, config: LinearPixelsConfig): Pipeline[Image, Int] = { val numClasses = 10 // Load and cache the training data. val trainData = CifarLoader(sc, config.trainLocation).cache() val trainImages = ImageExtractor(trainData) val labelExtractor = LabelExtractor andThen ClassLabelIndicatorsFromIntLabels(numClasses) andThen new Cacher[DenseVector[Double]] val trainLabels = labelExtractor(trainData) // A featurizer maps input images into vectors. For this pipeline, we'll also convert the image to grayscale. // We then estimate our model by calling a linear solver on our data. val predictionPipeline = GrayScaler andThen ImageVectorizer andThen (new LinearMapEstimator, trainImages, trainLabels) andThen MaxClassifier // Calculate training error. val evaluator = new MulticlassClassifierEvaluator(numClasses) val trainEval = evaluator.evaluate(predictionPipeline(trainImages), LabelExtractor(trainData)) // Do testing. val testData = CifarLoader(sc, config.testLocation) val testImages = ImageExtractor(testData) val testLabels = labelExtractor(testData) val testEval = evaluator.evaluate(predictionPipeline(testImages), LabelExtractor(testData)) logInfo(s"Training accuracy: \n${trainEval.totalAccuracy}") logInfo(s"Test accuracy: \n${testEval.totalAccuracy}") predictionPipeline } def parse(args: Array[String]): LinearPixelsConfig = new OptionParser[LinearPixelsConfig](appName) { head(appName, "0.1") help("help") text("prints this usage text") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } }.parse(args, LinearPixelsConfig()).get def main(args: Array[String]) = { val appConfig = parse(args) val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) run(sc, appConfig) sc.stop() } }
Example 66
Source File: AmazonReviewsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.BinaryClassifierEvaluator
import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData}
import keystoneml.nodes.learning.LogisticRegressionEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.CommonSparseFeatures
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object AmazonReviewsPipeline extends Logging {
  val appName = "AmazonReviewsPipeline"

  def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = {
    val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData
    val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache())

    val training = trainData.data
    val labels = trainData.labels

    // Build the classifier estimator
    val predictor = Trim andThen
      LowerCase() andThen
      Tokenizer() andThen
      NGramsFeaturizer(1 to conf.nGrams) andThen
      TermFrequency(x => 1) andThen
      (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen
      (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters),
        training, labels)

    // Evaluate the classifier
    val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData
    val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache())
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = BinaryClassifierEvaluator.evaluate(testResults.get.map(_ > 0), testLabels.map(_ > 0))

    logInfo("\n" + eval.summary())
    predictor
  }

  case class AmazonReviewsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    threshold: Double = 3.5,
    nGrams: Int = 2,
    commonFeatures: Int = 100000,
    numIters: Int = 20,
    numParts: Int = 512)

  def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) }
    opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) }
    opt[Double]("threshold") action { (x,c) => c.copy(threshold=x) }
    opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) }
    opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) }
    opt[Int]("numIters") action { (x,c) => c.copy(numIters=x) }
    opt[Int]("numParts") action { (x,c) => c.copy(numParts=x) }
  }.parse(args, AmazonReviewsConfig()).get

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.
    val spark = SparkSession.builder.config(conf).getOrCreate()

    val appConfig = parse(args)
    run(spark, appConfig)
    spark.stop()
  }
}
Example 67
Source File: NewsgroupsPipeline.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.text import breeze.linalg.SparseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.NewsgroupsDataLoader import keystoneml.nodes.learning.NaiveBayesEstimator import keystoneml.nodes.nlp._ import keystoneml.nodes.stats.TermFrequency import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.workflow.Pipeline object NewsgroupsPipeline extends Logging { val appName = "NewsgroupsPipeline" def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = { val trainData = NewsgroupsDataLoader(sc, conf.trainLocation) val numClasses = NewsgroupsDataLoader.classes.length // Build the classifier estimator logInfo("Training classifier") val predictor = Trim andThen LowerCase() andThen Tokenizer() andThen NGramsFeaturizer(1 to conf.nGrams) andThen TermFrequency(x => 1) andThen (CommonSparseFeatures[Seq[String]](conf.commonFeatures), trainData.data) andThen (NaiveBayesEstimator[SparseVector[Double]](numClasses), trainData.data, trainData.labels) andThen MaxClassifier // Evaluate the classifier logInfo("Evaluating classifier") val testData = NewsgroupsDataLoader(sc, conf.testLocation) val testLabels = testData.labels val testResults = predictor(testData.data) val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels) logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes)) predictor } case class NewsgroupsConfig( trainLocation: String = "", testLocation: String = "", nGrams: Int = 2, commonFeatures: Int = 100000) def parse(args: Array[String]): NewsgroupsConfig = new OptionParser[NewsgroupsConfig](appName) { head(appName, "0.1") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) } opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) } }.parse(args, NewsgroupsConfig()).get def main(args: Array[String]) = { val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) val appConfig = parse(args) run(sc, appConfig) sc.stop() } }
Example 68
Source File: MeanAveragePrecisionSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseVector import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class MeanAveragePrecisionSuite extends FunSuite with PipelineContext { test("random map test") { sc = new SparkContext("local", "test") // Build some random test data with 4 classes 0,1,2,3 val actual = List(Array(0, 3), Array(2), Array(1, 2), Array(0)) val actualRdd = sc.parallelize(actual) val predicted = List( DenseVector(0.1, -0.05, 0.12, 0.5), DenseVector(-0.23, -0.45, 0.23, 0.1), DenseVector(-0.34, -0.32, -0.66, 1.52), DenseVector(-0.1, -0.2, 0.5, 0.8)) val predictedRdd = sc.parallelize(predicted) val map = new MeanAveragePrecisionEvaluator(4).evaluate(predictedRdd, actualRdd) // Expected values from running this in MATLAB val expected = DenseVector(1.0, 0.3333, 0.5, 0.3333) assert(Stats.aboutEq(map, expected, 1e-4)) } }
Example 69
Source File: MulticlassClassifierEvaluatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseMatrix import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class MulticlassClassifierEvaluatorSuite extends FunSuite with PipelineContext { test("Multiclass keystoneml.evaluation metrics") { sc = new SparkContext("local", "test") val confusionMatrix = new DenseMatrix(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1)) val labels = Array(0.0, 1.0, 2.0) val predictionAndLabels = sc.parallelize( Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val evaluator = new MulticlassClassifierEvaluator(3) val metrics = evaluator.evaluate(predictionAndLabels.map(_._1.toInt), predictionAndLabels.map(_._2.toInt) ) val delta = 0.0000001 val precision0 = 2.0 / (2 + 1) val precision1 = 3.0 / (3 + 1) val precision2 = 1.0 / (1 + 1) val recall0 = 2.0 / (2 + 2) val recall1 = 3.0 / (3 + 1) val recall2 = 1.0 / (1 + 0) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.classMetrics(0).precision - precision0) < delta) assert(math.abs(metrics.classMetrics(1).precision - precision1) < delta) assert(math.abs(metrics.classMetrics(2).precision - precision2) < delta) assert(math.abs(metrics.classMetrics(0).recall - recall0) < delta) assert(math.abs(metrics.classMetrics(1).recall - recall1) < delta) assert(math.abs(metrics.classMetrics(2).recall - recall2) < delta) assert(math.abs(metrics.classMetrics(0).fScore() - f1measure0) < delta) assert(math.abs(metrics.classMetrics(1).fScore() - f1measure1) < delta) assert(math.abs(metrics.classMetrics(2).fScore() - f1measure2) < delta) assert(math.abs(metrics.classMetrics(0).fScore(2.0) - f2measure0) < delta) assert(math.abs(metrics.classMetrics(1).fScore(2.0) - f2measure1) < delta) assert(math.abs(metrics.classMetrics(2).fScore(2.0) - f2measure2) < delta) assert(math.abs(metrics.microRecall - (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta) assert(math.abs(metrics.microRecall - metrics.microPrecision) < delta) assert(math.abs(metrics.microRecall - metrics.microFScore()) < delta) assert(math.abs(metrics.macroPrecision - (precision0 + precision1 + precision2) / 3.0) < delta) assert(math.abs(metrics.macroRecall - (recall0 + recall1 + recall2) / 3.0) < delta) assert(math.abs(metrics.macroFScore() - (f1measure0 + f1measure1 + f1measure2) / 3.0) < delta) assert(math.abs(metrics.macroFScore(2.0) - (f2measure0 + f2measure1 + f2measure2) / 3.0) < delta) } }
Example 70
Source File: BinaryClassifierEvaluatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class BinaryClassifierEvaluatorSuite extends FunSuite with PipelineContext { test("Multiclass keystoneml.evaluation metrics") { sc = new SparkContext("local", "test") val predictionAndLabels = sc.parallelize( Seq.fill(6)((true, true)) ++ Seq.fill(2)((false, true)) ++ Seq.fill(1)((true, false)) ++ Seq.fill(3)((false, false)), 2) val metrics = BinaryClassifierEvaluator.evaluate(predictionAndLabels.map(_._1), predictionAndLabels.map(_._2)) assert(metrics.tp === 6) assert(metrics.fp === 1) assert(metrics.tn === 3) assert(metrics.fn === 2) assert(Stats.aboutEq(metrics.precision, 6.0/7.0)) assert(Stats.aboutEq(metrics.recall, 6.0/8.0)) assert(Stats.aboutEq(metrics.accuracy, 9.0/12.0)) assert(Stats.aboutEq(metrics.specificity, 3.0/4.0)) assert(Stats.aboutEq(metrics.fScore(), 2.0 * 6.0 / (2.0 * 6.0 + 2.0 + 1.0))) } }
Example 71
Source File: TestUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import java.io.{FileReader, ByteArrayInputStream} import breeze.linalg.DenseMatrix import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Rand} import edu.berkeley.cs.amplab.mlmatrix.RowPartitionedMatrix import org.apache.commons.io.IOUtils import org.apache.commons.math3.random.MersenneTwister import org.apache.spark.SparkContext import scala.io.Source import scala.util.Random def genChannelMajorArrayVectorizedImage(x: Int, y: Int, z: Int): ChannelMajorArrayVectorizedImage = { ChannelMajorArrayVectorizedImage(genData(x, y, z), ImageMetadata(x,y,z)) } def genRowColumnMajorByteArrayVectorizedImage(x: Int, y: Int, z: Int): RowColumnMajorByteArrayVectorizedImage = { RowColumnMajorByteArrayVectorizedImage(genData(x,y,z).map(_.toByte), ImageMetadata(x,y,z)) } def createRandomMatrix( sc: SparkContext, numRows: Int, numCols: Int, numParts: Int, seed: Int = 42): RowPartitionedMatrix = { val rowsPerPart = numRows / numParts val matrixParts = sc.parallelize(1 to numParts, numParts).mapPartitionsWithIndex { (index, part) => val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed+index))) Iterator(DenseMatrix.rand(rowsPerPart, numCols, Gaussian(0.0, 1.0)(randBasis))) } RowPartitionedMatrix.fromMatrix(matrixParts.cache()) } def createLocalRandomMatrix(numRows: Int, numCols: Int, seed: Int = 42): DenseMatrix[Double] = { val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed))) DenseMatrix.rand(numRows, numCols, Gaussian(0.0, 1.0)(randBasis)) } }
Example 72
Source File: MatrixUtilsSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import org.scalatest.FunSuite import breeze.linalg._ import breeze.stats._ import org.apache.spark.SparkContext import keystoneml.pipelines._ import keystoneml.workflow.PipelineContext class MatrixUtilsSuite extends FunSuite with PipelineContext { test("computeMean works correctly") { val numRows = 1000 val numCols = 32 val numParts = 4 sc = new SparkContext("local", "test") val in = DenseMatrix.rand(numRows, numCols) val inArr = MatrixUtils.matrixToRowArray(in) val rdd = sc.parallelize(inArr, numParts).mapPartitions { iter => Iterator.single(MatrixUtils.rowsToMatrix(iter)) } val expected = mean(in(::, *)).t val actual = MatrixUtils.computeMean(rdd) assert(Stats.aboutEq(expected, actual, 1e-6)) } }
Example 73
Source File: EstimatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.pipelines.Logging class EstimatorSuite extends FunSuite with PipelineContext with Logging { test("Estimator fit RDD") { sc = new SparkContext("local", "test") val intEstimator = new Estimator[Int, Int] { def fit(data: RDD[Int]): Transformer[Int, Int] = { val first = data.first() Transformer(x => x + first) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(trainData) assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 32, 58 + 32, 61 + 32)) } test("Estimator fit Pipeline Data") { sc = new SparkContext("local", "test") val transformer = Transformer[Int, Int](_ * 2) val intEstimator = new Estimator[Int, Int] { def fit(data: RDD[Int]): Transformer[Int, Int] = { val first = data.first() Transformer(x => x + first) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(transformer(trainData)) assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 64, 58 + 64, 61 + 64)) } }
Example 74
Source File: LabelEstimatorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.pipelines.Logging class LabelEstimatorSuite extends FunSuite with PipelineContext with Logging { test("LabelEstimator fit RDD") { sc = new SparkContext("local", "test") val intEstimator = new LabelEstimator[Int, Int, String] { def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = { val first = data.first() val label = labels.first().hashCode Transformer(x => x + first + label) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432")) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(trainData, trainLabels) val offset = 32 + "sjkfdl".hashCode assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset)) } test("LabelEstimator fit pipeline data") { sc = new SparkContext("local", "test") val dataTransformer = Transformer[Int, Int](_ * 2) val labelTransformer = Transformer[String, String](_ + "hi") val intEstimator = new LabelEstimator[Int, Int, String] { def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = { val first = data.first() val label = labels.first().hashCode Transformer(x => x + first + label) } } val trainData = sc.parallelize(Seq(32, 94, 12)) val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432")) val testData = sc.parallelize(Seq(42, 58, 61)) val pipeline = intEstimator.withData(dataTransformer(trainData), labelTransformer(trainLabels)) val offset = 64 + "sjkfdlhi".hashCode assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset)) } }
Example 75
Source File: KMeansPlusPlusSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines._ import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class KMeansPlusPlusSuite extends FunSuite with PipelineContext with Logging { test("K-Means++ Single Center") { sc = new SparkContext("local", "test") val k = 1 val data = sc.parallelize(Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0) )) val center = DenseVector[Double](1.0, 3.0, 4.0).asDenseMatrix val kMeans = KMeansPlusPlusEstimator(k, maxIterations = 1).fit(data) assert(Stats.aboutEq(kMeans.means, center)) val kMeans10 = KMeansPlusPlusEstimator(k, maxIterations = 10).fit(data) assert(Stats.aboutEq(kMeans.means, center)) val out = kMeans.apply(data).collect() } test("K-Means++ Two Centers") { sc = new SparkContext("local", "test") val k = 2 val data = sc.parallelize(Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0), DenseVector[Double](1.0, 1.0, 0.0) )) val centers = Set( DenseVector[Double](1.0, 2.0, 0.0), DenseVector[Double](1.0, 3.0, 6.0) ) val kMeans = KMeansPlusPlusEstimator(k, maxIterations = 10).fit(data) val fitCenters = MatrixUtils.matrixToRowArray(kMeans.means).toSet assert(fitCenters === centers ) val kMeans5 = KMeansPlusPlusEstimator(k, maxIterations = 5).fit(data) val fitCenters5 = MatrixUtils.matrixToRowArray(kMeans5.means).toSet assert(fitCenters5 === centers ) val out = kMeans.apply(data).collect() } test("K-Means Transformer") { sc = new SparkContext("local", "test") val data = Array( DenseVector[Double](1.0, 2.0, 6.0), DenseVector[Double](1.0, 3.0, 0.0), DenseVector[Double](1.0, 4.0, 6.0), DenseVector[Double](1.0, 1.0, 0.0) ) val centers = MatrixUtils.rowsToMatrix(Array( DenseVector[Double](1.0, 2.0, 0.0), DenseVector[Double](1.0, 3.0, 6.0) )) val clusterOne = DenseVector[Double](1.0, 0.0) val clusterTwo = DenseVector[Double](0.0, 1.0) val assignments = Seq(clusterTwo, clusterOne, clusterTwo, clusterOne) val kMeans = KMeansModel(centers) // Test Single Apply assert(kMeans.apply(DenseVector[Double](1.0, 3.0, 0.0)) === clusterOne) assert(kMeans.apply(DenseVector[Double](1.0, 1.0, 0.0)) === clusterOne) assert(kMeans.apply(DenseVector[Double](1.0, 2.0, 6.0)) === clusterTwo) assert(kMeans.apply(DenseVector[Double](1.0, 4.0, 6.0)) === clusterTwo) // Test Matrix Apply assert(kMeans.apply(MatrixUtils.rowsToMatrix(data)) === MatrixUtils.rowsToMatrix(assignments)) // Test RDD Apply assert(kMeans.apply(sc.parallelize(data)).collect().toSeq === assignments) } }
Example 76
Source File: KernelModelSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext import keystoneml.utils.{MatrixUtils, Stats} class KernelModelSuite extends FunSuite with PipelineContext { test("KernelModel XOR test") { sc = new SparkContext("local", "test") val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) val xRDD = sc.parallelize(x, 2) val yRDD = sc.parallelize(y, 2) val xTestRDD = sc.parallelize(xTest, 2) val gaussian = new GaussianKernelGenerator(10) // Set block size to number of data points so no blocking happens val clf = new KernelRidgeRegression(gaussian, 0, 4, 2) val kernelModel = clf.fit(xRDD, yRDD) val yHat = kernelModel(xTestRDD).collect() // Fit should be good val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) delta :*= delta println("SUM OF DELTA1 " + sum(delta)) assert(Stats.aboutEq(sum(delta), 0, 1e-4)) } test("KernelModel XOR blocked test") { sc = new SparkContext("local", "test") val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) val xRDD = sc.parallelize(x, 2) val yRDD = sc.parallelize(y, 2) val xTestRDD = sc.parallelize(xTest, 2) val gaussian = new GaussianKernelGenerator(10) // Set block size to half number of data points so blocking happens val clf = new KernelRidgeRegression(gaussian, 0, 2, 2) val kernelModel = clf.fit(xRDD, yRDD) val yHat = kernelModel(xTestRDD).collect() // Fit should be good val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) delta :*= delta assert(Stats.aboutEq(sum(delta), 0, 1e-4)) } }
Example 77
Source File: BlockLinearMapperSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg.{DenseVector, DenseMatrix} import breeze.stats.distributions.Rand import keystoneml.workflow.PipelineContext import scala.collection.mutable.ArrayBuffer import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines._ import keystoneml.utils.Stats class BlockLinearMapperSuite extends FunSuite with PipelineContext with Logging { test("BlockLinearMapper transformation") { sc = new SparkContext("local", "test") val inDims = 1000 val outDims = 100 val numChunks = 5 val numPerChunk = inDims/numChunks val mat = DenseMatrix.rand(inDims, outDims, Rand.gaussian) val vec = DenseVector.rand(inDims, Rand.gaussian) val intercept = DenseVector.rand(outDims, Rand.gaussian) val splitVec = (0 until numChunks).map(i => vec((numPerChunk*i) until (numPerChunk*i + numPerChunk))) val splitMat = (0 until numChunks).map(i => mat((numPerChunk*i) until (numPerChunk*i + numPerChunk), ::)) val linearMapper = new LinearMapper[DenseVector[Double]](mat, Some(intercept)) val blockLinearMapper = new BlockLinearMapper(splitMat, numPerChunk, Some(intercept)) val linearOut = linearMapper(vec) // Test with intercept assert(Stats.aboutEq(blockLinearMapper(vec), linearOut, 1e-4)) // Test the apply and evaluate call val blmOuts = new ArrayBuffer[RDD[DenseVector[Double]]] val splitVecRDDs = splitVec.map { vec => sc.parallelize(Seq(vec), 1) } blockLinearMapper.applyAndEvaluate(splitVecRDDs, (predictedValues: RDD[DenseVector[Double]]) => { blmOuts += predictedValues () } ) // The last blmOut should match the linear mapper's output assert(Stats.aboutEq(blmOuts.last.collect()(0), linearOut, 1e-4)) } }
Example 78
Source File: LinearMapperSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import edu.berkeley.cs.amplab.mlmatrix.RowPartitionedMatrix import keystoneml.nodes.stats.StandardScaler import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.{TestUtils, MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class LinearMapperSuite extends FunSuite with PipelineContext with Logging { test("Solve and apply a linear system") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 128, 5, 4) val x = DenseVector(5.0, 4.0, 3.0, 2.0, -1.0).toDenseMatrix val b = A.mapPartitions(part => part * x.t) val Aary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val bary = b.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val mapper = new LinearMapEstimator().fit(Aary, bary) assert(Stats.aboutEq(mapper.x, x.t), "Coefficients from the solve must match the hand-created model.") val point = DenseVector(2.0, -3.0, 2.0, 3.0, 5.0) assert(Stats.aboutEq(mapper(sc.parallelize(Seq(point))).first()(0), 5.0), "Linear model applied to a point should be 5.0") val bt = mapper(Aary) assert(Stats.aboutEq(bt.collect()(0), bary.collect()(0)), "Linear model applied to input should be the same as training points.") } test("LocalLeastSquaresEstimator doesn't crash") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 50, 400, 4) val x = DenseVector(5.0, 4.0, 3.0, 2.0, -1.0).toDenseMatrix val b = A.mapPartitions(part => DenseMatrix.rand(part.rows, 3)) val Aary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val bary = b.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val mapper = new LocalLeastSquaresEstimator(1e-2).fit(Aary, bary) assert(mapper.x.rows === 400) assert(mapper.x.cols === 3) } test("Solve a dense linear system (fit intercept) using local least squares") { sc = new SparkContext("local", "test") // Create the data. val A = TestUtils.createRandomMatrix(sc, 128, 5, 4) val x = DenseMatrix((5.0, 4.0, 3.0, 2.0, -1.0), (3.0, -1.0, 2.0, -2.0, 1.0)) val dataMean = DenseVector(1.0, 0.0, 1.0, 2.0, 0.0) val extraBias = DenseVector(3.0, 4.0) val initialAary = A.rdd.flatMap(part => MatrixUtils.matrixToRowArray(part.mat).toIterator) val meanScaler = new StandardScaler(normalizeStdDev = false).fit(initialAary) val Aary = meanScaler.apply(initialAary).map(_ + dataMean) val bary = Aary.map(a => (x * (a - dataMean)) + extraBias) val mapper = new LocalLeastSquaresEstimator(0).fit(Aary, bary) val trueResult = MatrixUtils.rowsToMatrix(bary.collect()) val solverResult = MatrixUtils.rowsToMatrix(mapper(Aary).collect()) assert(Stats.aboutEq(trueResult, solverResult, 1e-5), "Results from the solve must match the hand-created model.") assert(Stats.aboutEq(mapper.x, x.t, 1e-6), "Model weights from the solve must match the hand-created model.") assert(Stats.aboutEq(mapper.bOpt.get, extraBias, 1e-6), "Learned intercept must match the hand-created model.") assert(Stats.aboutEq(mapper.featureScaler.get.mean, dataMean, 1e-6), "Learned intercept must match the hand-created model.") } }
Example 79
Source File: LinearDiscriminantAnalysisSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats.distributions.{Multinomial, Uniform, Gaussian} import keystoneml.nodes.stats.StandardScaler import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.{TestUtils, MatrixUtils, Stats} import keystoneml.workflow.PipelineContext class LinearDiscriminantAnalysisSuite extends FunSuite with PipelineContext with Logging { test("Solve Linear Discriminant Analysis on the Iris Dataset") { sc = new SparkContext("local", "test") // Uses the Iris flower dataset val irisData = sc.parallelize(TestUtils.loadFile("iris.data")) val trainData = irisData.map(_.split(",").dropRight(1).map(_.toDouble)).map(new DenseVector(_)) val features = new StandardScaler().fit(trainData).apply(trainData) val labels = irisData.map(_ match { case x if x.endsWith("Iris-setosa") => 1 case x if x.endsWith("Iris-versicolor") => 2 case x if x.endsWith("Iris-virginica") => 3 }) val lda = new LinearDiscriminantAnalysis(2) val out = lda.fit(features, labels) // Correct output taken from http://sebastianraschka.com/Articles/2014_python_lda.html#introduction logInfo(s"\n${out.x}") val majorVector = DenseVector(-0.1498, -0.1482, 0.8511, 0.4808) val minorVector = DenseVector(0.0095, 0.3272, -0.5748, 0.75) // Note that because eigenvectors can be reversed and still valid, we allow either direction assert(Stats.aboutEq(out.x(::, 0), majorVector, 1E-4) || Stats.aboutEq(out.x(::, 0), majorVector * -1.0, 1E-4)) assert(Stats.aboutEq(out.x(::, 1), minorVector, 1E-4) || Stats.aboutEq(out.x(::, 1), minorVector * -1.0, 1E-4)) } test("Check LDA output for a diagonal covariance") { sc = new SparkContext("local", "test") val matRows = 1000 val matCols = 10 val dimRed = 5 // Generate a random Gaussian matrix. val gau = new Gaussian(0.0, 1.0) val randMatrix = new DenseMatrix(matRows, matCols, gau.sample(matRows*matCols).toArray) // Parallelize and estimate the LDA. val data = sc.parallelize(MatrixUtils.matrixToRowArray(randMatrix)) val labels = data.map(x => Multinomial(DenseVector(0.2, 0.2, 0.2, 0.2, 0.2)).draw(): Int) val lda = new LinearDiscriminantAnalysis(dimRed).fit(data, labels) // Apply LDA to the input data. val redData = lda(data) val redMat = MatrixUtils.rowsToMatrix(redData.collect) // Compute its covariance. val redCov = cov(redMat) log.info(s"Covar\n$redCov") // The covariance of the dimensionality reduced matrix should be diagonal. for ( x <- 0 until dimRed; y <- 0 until dimRed if x != y ) { assert(Stats.aboutEq(redCov(x,y), 0.0, 1e-6), s"LDA Matrix should be 0 off-diagonal. $x,$y = ${redCov(x,y)}") } } }
Example 80
Source File: TermFrequencySuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.misc import keystoneml.nodes.stats.TermFrequency import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class TermFrequencySuite extends FunSuite with PipelineContext { test("term frequency of simple strings") { sc = new SparkContext("local", "test") val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) val out = TermFrequency().apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> 2, "b" -> 4, "c" -> 1)) } test("term frequency of varying types") { sc = new SparkContext("local", "test") val in = Seq(Seq("b", "a", "c", ("b", "b"), ("b", "b"), 12, 12, "a", "b", 12)) val out = TermFrequency().apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> 2, "b" -> 2, "c" -> 1, ("b", "b") -> 2, 12 -> 3)) } test("log term frequency") { sc = new SparkContext("local", "test") val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) val out = TermFrequency(x => math.log(x + 1)).apply(sc.parallelize(in)).first().toMap assert(out === Map("a" -> math.log(3), "b" -> math.log(5), "c" -> math.log(2))) } }
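The TermFrequency node exercised above boils down to counting occurrences per token and optionally rescaling each count. A minimal sketch of the same computation on a plain Scala collection; the helper name is illustrative and not part of KeystoneML:

// Count occurrences of each token, then apply an optional rescaling function.
def termFrequency[T](tokens: Seq[T], f: Double => Double = identity): Map[T, Double] =
  tokens.groupBy(identity).map { case (token, occurrences) =>
    token -> f(occurrences.size.toDouble)
  }

termFrequency(Seq("b", "a", "c", "b", "b", "a", "b"))
// Map(b -> 4.0, a -> 2.0, c -> 1.0)

termFrequency(Seq("b", "a", "c", "b", "b", "a", "b"), x => math.log(x + 1))
// log-scaled counts, matching the third test above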
Example 81
Source File: SparseFeatureVectorizerSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.misc import keystoneml.nodes.util.{SparseFeatureVectorizer, AllSparseFeatures, CommonSparseFeatures} import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.workflow.PipelineContext class SparseFeatureVectorizerSuite extends FunSuite with PipelineContext with Logging { test("sparse feature vectorization") { sc = new SparkContext("local", "test") val featureVectorizer = new SparseFeatureVectorizer(Map("First" -> 0, "Second" -> 1, "Third" -> 2)) val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) val vector = featureVectorizer.apply(sc.parallelize(Seq(test))).first() assert(vector.size == 3) assert(vector(0) == 1) assert(vector(1) == 0) assert(vector(2) == 4) } test("all sparse feature selection") { sc = new SparkContext("local", "test") val train = sc.parallelize(List(Seq(("First", 0.0), ("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.0)))) val featureVectorizer = AllSparseFeatures().fit(train.map(x => x)) // The selected features should now be "First", "Second", and "Third" val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray assert(out === Array(1.0, 0.0, 4.0)) } test("common sparse feature selection") { sc = new SparkContext("local", "test") val train = sc.parallelize(List( Seq(("First", 0.0), ("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.8)), Seq(("Third", 7.0), ("Fourth", 5.0)), Seq(("Fifth", 5.0), ("Second", 7.3)) )) val featureVectorizer = CommonSparseFeatures(2).fit(train.map(x => x)) // The selected features should now be "Second", and "Third" val test = Seq(("Third", 4.0), ("Seventh", 8.0), ("Second", 1.3), ("Fourth", 6.0), ("First", 1.0)) val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray assert(out === Array(1.3, 4.0)) } }
Example 82
Source File: LinearRectifierSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats import breeze.linalg.DenseMatrix import breeze.stats.distributions.Rand import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines._ import keystoneml.utils.{TestUtils, MatrixUtils} import keystoneml.workflow.PipelineContext class LinearRectifierSuite extends FunSuite with PipelineContext with Logging { test("Test MaxVal") { sc = new SparkContext("local", "test") val matrixParts = TestUtils.createRandomMatrix(sc, 128, 16, 4).rdd.map(_.mat) val x = matrixParts.flatMap(y => MatrixUtils.matrixToRowArray(y)) val y = x.map(r => r.forall(_ >= 0.0)) val valmaxNode = LinearRectifier() val maxy = valmaxNode.apply(x).map(r => r.forall(_ >= 0.0)) //The random matrix should *not* all be >= 0 assert(!y.reduce {(a,b) => a | b}) //The valmax'ed random matrix *should* all be >= 0. assert(maxy.reduce {(a,b) => a | b}) } }
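LinearRectifier clamps each entry of a vector from below, which the test above checks by asserting that all outputs are nonnegative. A rough single-vector equivalent in breeze, assuming a threshold of 0.0 as the test implies; the helper name is made up:

import breeze.linalg.DenseVector

// Element-wise max(x, threshold), i.e. a ReLU when threshold = 0.0.
def rectify(v: DenseVector[Double], threshold: Double = 0.0): DenseVector[Double] =
  v.map(x => math.max(x, threshold))

rectify(DenseVector(-1.5, 0.0, 2.5))  // DenseVector(0.0, 0.0, 2.5)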
Example 83
Source File: PaddedFFTSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats import breeze.linalg._ import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class PaddedFFTSuite extends FunSuite with PipelineContext with Logging { test("Test PaddedFFT node") { sc = new SparkContext("local", "test") // Set up a test matrix. val ones = DenseVector.zeros[Double](100) val twos = DenseVector.zeros[Double](100) ones(0) = 1.0 twos(2) = 1.0 val x = sc.parallelize(Seq(twos, ones)) val fftd = PaddedFFT().apply(x).collect() val twosout = fftd(0) val onesout = fftd(1) // Proof by agreement w/ R: Re(fft(c(0, 0, 1, rep(0, 125)))) assert(twosout.length === 64) assert(Stats.aboutEq(twosout(0), 1.0)) assert(Stats.aboutEq(twosout(16), 0.0)) assert(Stats.aboutEq(twosout(32), -1.0)) assert(Stats.aboutEq(twosout(48), 0.0)) // Proof by agreement w/ R: Re(fft(c(1, rep(0, 127)))) assert(Stats.aboutEq(onesout, DenseVector.ones[Double](64))) } }
Example 84
Source File: CoreNLPFeatureExtractorSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.workflow.PipelineContext class CoreNLPFeatureExtractorSuite extends FunSuite with PipelineContext with Logging { test("lemmatization") { sc = new SparkContext("local", "test") val text = "jumping snakes lakes oceans hunted" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure at least very simple cases were lemmatized assert(tokens.contains("jump")) assert(tokens.contains("snake")) assert(tokens.contains("lake")) assert(tokens.contains("ocean")) assert(tokens.contains("hunt")) // Assert the unlemmatized tokens are no longer there assert(!tokens.contains("jumping")) assert(!tokens.contains("snakes")) assert(!tokens.contains("oceans")) assert(!tokens.contains("lakes")) assert(!tokens.contains("hunted")) } test("entity extraction") { sc = new SparkContext("local", "test") val text = "John likes cake and he lives in Florida" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure at least very simple entities were identified and extracted assert(tokens.contains("PERSON")) assert(tokens.contains("LOCATION")) // Assert the original tokens are no longer there assert(!tokens.contains("John")) assert(!tokens.contains("Florida")) } test("1-2-3-grams") { sc = new SparkContext("local", "test") val text = "a b c d" val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet // Make sure expected unigrams appear assert(tokens.contains("a")) assert(tokens.contains("b")) assert(tokens.contains("c")) assert(tokens.contains("d")) // Make sure expected bigrams appear assert(tokens.contains("a b")) assert(tokens.contains("b c")) assert(tokens.contains("c d")) // Make sure expected 3-grams appear assert(tokens.contains("a b c")) assert(tokens.contains("b c d")) } }
Example 85
Source File: StringUtilsSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class StringUtilsSuite extends FunSuite with PipelineContext { val stringToManip = Array(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! ") test("trim") { sc = new SparkContext("local", "test") val out = Trim.apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq("The quick BROWN fo.X", "! !.,)JumpeD. ovER the LAZy DOG.. !")) } test("lower case") { sc = new SparkContext("local", "test") val out = LowerCase().apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq(" the quick brown fo.x ", " ! !.,)jumped. over the lazy dog.. ! ")) } test("tokenizer") { sc = new SparkContext("local", "test") val out = Tokenizer().apply(sc.parallelize(stringToManip, 1)).collect().toSeq assert(out === Seq(Seq("", "The", "quick", "BROWN", "fo", "X"), Seq("", "JumpeD", "ovER", "the", "LAZy", "DOG"))) } }
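The Trim, LowerCase, and Tokenizer nodes above are thin wrappers over per-record string operations. A rough sketch with plain RDD transformations, assuming a live SparkContext named sc; the split regex is only an approximation of the tokenizer the expected output suggests:

val strings = sc.parallelize(Seq(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! "), 1)

val trimmed   = strings.map(_.trim)                          // Trim
val lowered   = strings.map(_.toLowerCase)                   // LowerCase
val tokenized = strings.map(_.split("[^a-zA-Z0-9]+").toSeq)  // approximate Tokenizer

tokenized.collect().foreach(println)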
Example 86
Source File: TopKClassifierSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class TopKClassifierSuite extends FunSuite with PipelineContext { test("top k classifier, k <= vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(2).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3)) assert(TopKClassifier(4).apply(DenseVector(Double.MinValue, Double.MaxValue, 12.0, 11.0, 10.0)) === Array(1, 2, 3, 4)) assert(TopKClassifier(3).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } test("top k classifier, k > vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(5).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3, 0, 2)) assert(TopKClassifier(2).apply(DenseVector(Double.MinValue)) === Array(0)) assert(TopKClassifier(20).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } }
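TopKClassifier returns the indices of the k largest entries of a vector, largest first, capped at the vector length. A small sketch of that selection on a breeze DenseVector; the helper name is illustrative:

import breeze.linalg.DenseVector

// Indices of the k largest values in descending order of value.
def topK(v: DenseVector[Double], k: Int): Array[Int] =
  v.toArray.zipWithIndex.sortBy { case (value, _) => -value }.take(k).map(_._2)

topK(DenseVector(-10.0, 42.4, -43.0, 23.0), 2)   // Array(1, 3)
topK(DenseVector(3.0, -23.2, 2.99), 20)          // Array(0, 2, 1), k capped at length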
Example 87
Source File: VOCLoaderSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.TestUtils import keystoneml.workflow.PipelineContext class VOCLoaderSuite extends FunSuite with PipelineContext { test("load a sample of VOC data") { sc = new SparkContext("local", "test") val dataPath = TestUtils.getTestResourceFileName("images/voc") val labelsPath = TestUtils.getTestResourceFileName("images/voclabels.csv") val imgs = VOCLoader(sc, VOCDataPath(dataPath, "VOCdevkit/VOC2007/JPEGImages/", Some(1)), VOCLabelPath(labelsPath)).collect() // We should have 10 images assert(imgs.length === 10) // There should be one file whose name ends with "000104.jpg" val personMonitor = imgs.filter(_.filename.get.endsWith("000104.jpg")) assert(personMonitor.length === 1) // It should have two labels, 14 and 19. assert(personMonitor(0).label.contains(14) && personMonitor(0).label.contains(19)) // There should be 13 labels total and 9 should be distinct. assert(imgs.map(_.label).flatten.length === 13) assert(imgs.map(_.label).flatten.distinct.length === 9) } }
Example 88
Source File: ImageNetLoaderSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.TestUtils import keystoneml.workflow.PipelineContext class ImageNetLoaderSuite extends FunSuite with PipelineContext { test("load a sample of imagenet data") { sc = new SparkContext("local", "test") val dataPath = TestUtils.getTestResourceFileName("images/imagenet") val labelsPath = TestUtils.getTestResourceFileName("images/imagenet-test-labels") val imgs = ImageNetLoader.apply(sc, dataPath, labelsPath).collect() // We should have 5 images assert(imgs.length === 5) // The images should all have label 12 assert(imgs.map(_.label).distinct.length === 1) assert(imgs.map(_.label).distinct.head === 12) // The image filenames should begin with n15075141 assert(imgs.forall(_.filename.get.startsWith("n15075141")), "Image filenames should be correct") } }
Example 89
Source File: StupidBackoffSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.nlp import keystoneml.nodes.nlp._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext import scala.collection.JavaConverters._ class StupidBackoffSuite extends FunSuite with PipelineContext { val data = Seq("Winter is coming", "Finals are coming", "Summer is coming really soon") def featurizer(orders: Seq[Int], mode: NGramsCountsMode.Value = NGramsCountsMode.Default) = { def feat(data: RDD[String]) = { NGramsCounts[String](mode).apply( (Tokenizer() andThen NGramsFeaturizer[String](orders)).apply(data).get) } feat _ } def requireNGramColocation[T, V]( ngrams: RDD[(NGram[T], V)], indexer: BackoffIndexer[T, NGram[T]]) = { ngrams.mapPartitions { part => val map = new java.util.HashMap[NGram[T], V]().asScala part.foreach { case (ngramId, count) => map.put(ngramId, count) } map.keySet.foreach { ngramId => var currNGram = ngramId while (indexer.ngramOrder(currNGram) > 2) { val context = indexer.removeCurrentWord(currNGram) require(map.contains(context), s"ngram $currNGram is not co-located with its context $context within same partition") currNGram = context } } Iterator.empty }.count() } test("end-to-end InitialBigramPartitioner") { sc = new SparkContext("local[4]", "StupidBackoffSuite") val corpus = sc.parallelize(data, 3) val ngrams = featurizer(2 to 5, NGramsCountsMode.NoAdd)(corpus) val unigrams = featurizer(1 to 1)(corpus) .collectAsMap() .map { case (key, value) => key.words(0) -> value } val stupidBackoff = StupidBackoffEstimator[String](unigrams).fit(ngrams) requireNGramColocation[String, Double](stupidBackoff.scoresRDD, new NGramIndexerImpl) } test("Stupid Backoff calculates correct scores") { sc = new SparkContext("local[4]", "StupidBackoffSuite") val corpus = sc.parallelize(data, 3) val ngrams = featurizer(2 to 5, NGramsCountsMode.NoAdd)(corpus) val unigrams = featurizer(1 to 1)(corpus) .collectAsMap() .map { case (key, value) => key.words(0) -> value } val lm = StupidBackoffEstimator[String](unigrams).fit(ngrams) assert(lm.score(new NGram(Seq("is", "coming"))) === 2.0 / 2.0) assert(lm.score(new NGram(Seq("is", "coming", "really"))) === 1.0 / 2.0) assert(lm.score(new NGram(Seq("is", "unseen-coming"))) === 0, "not equal to expected: bacoffed once & curr word unseen, so should be zero") assert(lm.score(new NGram(Seq("is-unseen", "coming"))) === lm.alpha * 3.0 / lm.numTokens, "not equal to expected: backoffed once, should be alpha * currWordCount / numTokens") } }
Example 90
Source File: HiSpeedRead.scala From spark-db2 with Apache License 2.0 | 5 votes |
import com.ibm.spark.ibmdataserver.Constants import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object HiSpeedRead { def main(args: Array[String]) { val DB2_CONNECTION_URL = "jdbc:db2://localhost:50700/sample:traceFile=C:\\1.txt;" val conf = new SparkConf().setMaster("local[2]").setAppName("read test") val sparkContext = new SparkContext(conf) val sqlContext = new SQLContext(sparkContext) Class.forName("com.ibm.db2.jcc.DB2Driver") val jdbcRdr = sqlContext.read.format("com.ibm.spark.ibmdataserver") .option("url", DB2_CONNECTION_URL) // .option(Constants.TABLE, tableName) .option("user", "pallavipr") .option("password", "9manjari") .option("dbtable", "employee") .load() jdbcRdr.show() } }
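The read above goes through IBM's dedicated DataSource. The same shape also works against Spark SQL's generic JDBC source; a hedged sketch in which the URL, driver class, credentials, and table name are placeholders, and the JDBC driver jar is assumed to be on the classpath:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("jdbc read"))
val sqlContext = new SQLContext(sc)

val df = sqlContext.read.format("jdbc")
  .option("url", "jdbc:db2://localhost:50700/sample")   // placeholder connection URL
  .option("driver", "com.ibm.db2.jcc.DB2Driver")        // placeholder driver class
  .option("dbtable", "employee")                        // placeholder table
  .option("user", "someUser")
  .option("password", "somePassword")
  .load()

df.show()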
Example 91
Source File: MultiZippedPartitionRDD.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.reflect.ClassTag private[spark] class MultiZippedPartitionsRDD[A: ClassTag, V: ClassTag]( sc: SparkContext, var f: (List[Iterator[A]]) => Iterator[V], var rddList: List[RDD[A]], preservesPartitioning: Boolean = false) extends ZippedPartitionsBaseRDD[V](sc, rddList, preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions val iterList = rddList.zipWithIndex.map{ case (rdd: RDD[A], index: Int) => rdd.iterator(partitions(index), context) } f(iterList) } override def clearDependencies() { super.clearDependencies() rddList = null f = null } }
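MultiZippedPartitionsRDD generalizes partition-wise zipping to an arbitrary list of RDDs. For the common two-RDD case the public RDD.zipPartitions API already covers this; a small sketch assuming a live SparkContext named sc:

// Both RDDs must have the same number of partitions.
val a = sc.parallelize(1 to 8, numSlices = 4)
val b = sc.parallelize(101 to 108, numSlices = 4)

// Combine corresponding partitions with one function over their iterators.
val sums = a.zipPartitions(b) { (left, right) =>
  left.zip(right).map { case (x, y) => x + y }
}

sums.collect()   // Array(102, 104, ..., 116)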
Example 92
Source File: ConcurrentHiveSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.sql.hive.test.TestHiveContext class ConcurrentHiveSuite extends SparkFunSuite with BeforeAndAfterAll { ignore("multiple instances not supported") { test("Multiple Hive Instances") { (1 to 10).map { i => val conf = new SparkConf() conf.set("spark.ui.enabled", "false") val ts = new TestHiveContext(new SparkContext("local", s"TestSQLContext$i", conf)) ts.sparkSession.sql("SHOW TABLES").collect() ts.sparkSession.sql("SELECT * FROM src").collect() ts.sparkSession.sql("SHOW TABLES").collect() } } } }
Example 93
Source File: HiveContextCompatibilitySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.scalatest.BeforeAndAfterEach import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach { override protected val enableAutoThreadAudit = false private var sc: SparkContext = null private var hc: HiveContext = null override def beforeAll(): Unit = { super.beforeAll() sc = SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("test")) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true).foreach { case (k, v) => sc.hadoopConfiguration.set(k, v) } hc = new HiveContext(sc) } override def afterEach(): Unit = { try { hc.sharedState.cacheManager.clearCache() hc.sessionState.catalog.reset() } finally { super.afterEach() } } override def afterAll(): Unit = { try { sc = null hc = null } finally { super.afterAll() } } test("basic operations") { val _hc = hc import _hc.implicits._ val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x") val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c") .select($"a", $"b") .filter($"a" > 10 && $"b" > 6 && $"c") val df3 = df1.join(df2, "a") val res = df3.collect() val expected = Seq((18, 18, 8)).toDF("a", "x", "b").collect() assert(res.toSeq == expected.toSeq) df3.createOrReplaceTempView("mai_table") val df4 = hc.table("mai_table") val res2 = df4.collect() assert(res2.toSeq == expected.toSeq) } test("basic DDLs") { val _hc = hc import _hc.implicits._ val databases = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases.toSeq == Seq("default")) hc.sql("CREATE DATABASE mee_db") hc.sql("USE mee_db") val databases2 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases2.toSet == Set("default", "mee_db")) val df = (1 to 10).map { i => ("bob" + i.toString, i) }.toDF("name", "age") df.createOrReplaceTempView("mee_table") hc.sql("CREATE TABLE moo_table (name string, age int)") hc.sql("INSERT INTO moo_table SELECT * FROM mee_table") assert( hc.sql("SELECT * FROM moo_table order by name").collect().toSeq == df.collect().toSeq.sortBy(_.getString(0))) val tables = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables.toSet == Set("moo_table", "mee_table")) hc.sql("DROP TABLE moo_table") hc.sql("DROP TABLE mee_table") val tables2 = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) assert(tables2.isEmpty) hc.sql("USE default") hc.sql("DROP DATABASE mee_db CASCADE") val databases3 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) assert(databases3.toSeq == Seq("default")) } }
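The suite above obtains its context through SparkContext.getOrCreate, which returns the already-running context if there is one and otherwise builds a new one from the given conf. A minimal sketch of that idiom:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("local").setAppName("test")

// The first call creates the context; later calls return the same instance
// and ignore the conf, because only one SparkContext may be active per JVM.
val sc1 = SparkContext.getOrCreate(conf)
val sc2 = SparkContext.getOrCreate(conf)
assert(sc1 eq sc2)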
Example 94
Source File: ThriftServerTab.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 95
Source File: SparkSQLEnv.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.util.Utils def stop() { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() sparkContext = null sqlContext = null } } }
Example 96
Source File: XSQLTestSparkSession.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.TestSparkSession import org.apache.spark.sql.xsql.XSQLSessionStateBuilder class XSQLTestSparkSession(sc: SparkContext) extends TestSparkSession(sc) { self => def this(sparkConf: SparkConf) { this( new SparkContext( "local[2]", "test-sql-context", sparkConf.set("spark.sql.testkey", "true").set(CATALOG_IMPLEMENTATION, "xsql"))) } @transient override lazy val sessionState: SessionState = { new XSQLSessionStateBuilder(this, None).build() } }
Example 97
Source File: SparkPlanner.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileSourceStrategy} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy import org.apache.spark.sql.internal.SQLConf class SparkPlanner( val sparkContext: SparkContext, val conf: SQLConf, val experimentalMethods: ExperimentalMethods) extends SparkStrategies { def numPartitions: Int = conf.numShufflePartitions override def strategies: Seq[Strategy] = experimentalMethods.extraStrategies ++ extraPlanningStrategies ++ ( PythonEvals :: DataSourceV2Strategy :: FileSourceStrategy :: DataSourceStrategy(conf) :: SpecialLimits :: Aggregation :: Window :: JoinSelection :: InMemoryScans :: BasicOperators :: Nil) def pruneFilterProject( projectList: Seq[NamedExpression], filterPredicates: Seq[Expression], prunePushedDownFilters: Seq[Expression] => Seq[Expression], scanBuilder: Seq[Attribute] => SparkPlan): SparkPlan = { val projectSet = AttributeSet(projectList.flatMap(_.references)) val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) val filterCondition: Option[Expression] = prunePushedDownFilters(filterPredicates).reduceLeftOption(catalyst.expressions.And) // Right now we still use a projection even if the only evaluation is applying an alias // to a column. Since this is a no-op, it could be avoided. However, using this // optimization with the current implementation would change the output schema. // TODO: Decouple final output schema from expression evaluation so this copy can be // avoided safely. if (AttributeSet(projectList.map(_.toAttribute)) == projectSet && filterSet.subsetOf(projectSet)) { // When it is possible to just use column pruning to get the right projection and // when the columns of this projection are enough to evaluate all filter conditions, // just do a scan followed by a filter, with no extra project. val scan = scanBuilder(projectList.asInstanceOf[Seq[Attribute]]) filterCondition.map(FilterExec(_, scan)).getOrElse(scan) } else { val scan = scanBuilder((projectSet ++ filterSet).toSeq) ProjectExec(projectList, filterCondition.map(FilterExec(_, scan)).getOrElse(scan)) } } }
Example 98
Source File: DataSourceRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.v2.reader.InputPartition class DataSourceRDDPartition[T : ClassTag](val index: Int, val inputPartition: InputPartition[T]) extends Partition with Serializable class DataSourceRDD[T: ClassTag]( sc: SparkContext, @transient private val inputPartitions: Seq[InputPartition[T]]) extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { inputPartitions.zipWithIndex.map { case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val reader = split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition .createPartitionReader() context.addTaskCompletionListener[Unit](_ => reader.close()) val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { if (!valuePrepared) { valuePrepared = reader.next() } valuePrepared } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } valuePrepared = false reader.get() } } new InterruptibleIterator(context, iter) } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition.preferredLocations() } }
Example 99
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
Example 100
Source File: SQLExecution.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} object SQLExecution { val EXECUTION_ID_KEY = "spark.sql.execution.id" private val _nextExecutionId = new AtomicLong(0) private def nextExecutionId: Long = _nextExecutionId.getAndIncrement private val executionIdToQueryExecution = new ConcurrentHashMap[Long, QueryExecution]() def getQueryExecution(executionId: Long): QueryExecution = { executionIdToQueryExecution.get(executionId) } private val testing = sys.props.contains("spark.testing") private[sql] def checkSQLExecutionId(sparkSession: SparkSession): Unit = { val sc = sparkSession.sparkContext // only throw an exception during tests. a missing execution ID should not fail a job. if (testing && sc.getLocalProperty(EXECUTION_ID_KEY) == null) { // Attention testers: when a test fails with this exception, it means that the action that // started execution of a query didn't call withNewExecutionId. The execution ID should be // set by calling withNewExecutionId in the action that begins execution, like // Dataset.collect or DataFrameWriter.insertInto. throw new IllegalStateException("Execution ID should be set") } } def withSQLConfPropagated[T](sparkSession: SparkSession)(body: => T): T = { val sc = sparkSession.sparkContext // Set all the specified SQL configs to local properties, so that they can be available at // the executor side. val allConfigs = sparkSession.sessionState.conf.getAllConfs val originalLocalProps = allConfigs.collect { case (key, value) if key.startsWith("spark") => val originalValue = sc.getLocalProperty(key) sc.setLocalProperty(key, value) (key, originalValue) } try { body } finally { for ((key, value) <- originalLocalProps) { sc.setLocalProperty(key, value) } } } }
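withSQLConfPropagated above works by stashing key/value pairs in the SparkContext's thread-local properties, which travel with each submitted job so executors can read them, and then restoring the previous values. A stripped-down sketch of that save/set/restore pattern, assuming a live SparkContext named sc; the helper name is made up:

def withLocalProperty[T](sc: org.apache.spark.SparkContext, key: String, value: String)(body: => T): T = {
  val previous = sc.getLocalProperty(key)   // null if the property was never set
  sc.setLocalProperty(key, value)
  try {
    body
  } finally {
    sc.setLocalProperty(key, previous)      // setting back to null clears the property
  }
}

withLocalProperty(sc, "spark.job.description", "ad-hoc query") {
  sc.parallelize(1 to 100).count()
}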
Example 101
Source File: ContinuousShuffleReadRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import java.util.UUID import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.rpc.RpcAddress import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.NextIterator case class ContinuousShuffleReadPartition( index: Int, endpointName: String, queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long) extends Partition { // Initialized only on the executor, and only once even as we call compute() multiple times. lazy val (reader: ContinuousShuffleReader, endpoint) = { val env = SparkEnv.get.rpcEnv val receiver = new RPCContinuousShuffleReader( queueSize, numShuffleWriters, epochIntervalMs, env) val endpoint = env.setupEndpoint(endpointName, receiver) TaskContext.get().addTaskCompletionListener[Unit] { ctx => env.stop(endpoint) } (receiver, endpoint) } } class ContinuousShuffleReadRDD( sc: SparkContext, numPartitions: Int, queueSize: Int = 1024, numShuffleWriters: Int = 1, epochIntervalMs: Long = 1000, val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}")) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { (0 until numPartitions).map { partIndex => ContinuousShuffleReadPartition( partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { split.asInstanceOf[ContinuousShuffleReadPartition].reader.read() } }
Example 102
Source File: TestSQLContext.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.{SessionState, SessionStateBuilder, SQLConf, WithTestConf} val overrideConfs: Map[String, String] = Map( // Fewer shuffle partitions to speed up testing. SQLConf.SHUFFLE_PARTITIONS.key -> "5") } private[sql] class TestSQLSessionStateBuilder( session: SparkSession, state: Option[SessionState]) extends SessionStateBuilder(session, state) with WithTestConf { override def overrideConfs: Map[String, String] = TestSQLContext.overrideConfs override def newBuilder: NewBuilder = new TestSQLSessionStateBuilder(_, _) }
Example 103
Source File: ExecutorNumListener.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import java.text.SimpleDateFormat import java.util import java.util.Date import java.util.concurrent.atomic.AtomicBoolean import com.fasterxml.jackson.annotation.JsonIgnore import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{ SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved } import org.apache.spark.util.kvstore.KVIndex class ExecutorNumListener extends SparkListener with Logging { lazy val kvstore = SparkContext.getActive.get.statusStore.store var initialized: AtomicBoolean = new AtomicBoolean(false) var lastPointTime: Long = new Date().getTime var recentEventTime: Long = new Date().getTime private val liveExecutors = new util.HashSet[String]() def initialize(): Unit = { SparkContext.getActive.map(_.ui).flatten.foreach { case ui => ui.attachTab(new ExecutorNumTab(ui)) ui.addStaticHandler("static", "/static/special") } } def maybeAddPoint(time: Long): Unit = { if (!initialized.get) { initialize() initialized.compareAndSet(false, true) } if (time - lastPointTime > 20 * 1000) { addPoint(recentEventTime) addPoint(time) lastPointTime = time } recentEventTime = time } def addPoint(time: Long): Unit = { val executorNum = liveExecutors.size kvstore.write(new ExecutorNumWrapper(new ExecutorNum( s"own ${executorNum} executors at ${new SimpleDateFormat("HH:mm:ss").format(new Date(time))}", IndexedSeq(time, executorNum)))) } override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { liveExecutors.add(event.executorId) maybeAddPoint(event.time) } override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { liveExecutors.remove(event.executorId) maybeAddPoint(event.time) } } private[spark] class ExecutorNumWrapper(val point: ExecutorNum) { @JsonIgnore @KVIndex def id: Long = point.value(0) } private[spark] class ExecutorNum(val name: String, val value: IndexedSeq[Long])
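ExecutorNumListener reacts to executor add and remove events delivered on the listener bus. Registering such a listener is a single call to SparkContext.addSparkListener; a hedged sketch in which the listener below only prints and is not the listener from the example:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("listener demo"))

sc.addSparkListener(new SparkListener {
  override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit =
    println(s"executor added: ${event.executorId} at ${event.time}")

  override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit =
    println(s"executor removed: ${event.executorId} at ${event.time}")
})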
Example 104
Source File: ExtendableHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ParserDialect import org.apache.spark.sql.catalyst.analysis.{Analyzer, _} import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.execution.{CacheManager, ExtractPythonUDFs} import org.apache.spark.sql.extension._ import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog @transient override protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, conf) { override val extendedResolutionRules = resolutionRules(this) ++ (catalog.ParquetConversions :: catalog.CreateTables :: catalog.PreInsertionCasts :: ExtractPythonUDFs :: ResolveHiveWindowFunction :: PreInsertCastAndRename :: Nil) override val extendedCheckRules = ExtendableHiveContext.this.extendedCheckRules(this) } @transient override protected[sql] lazy val optimizer: Optimizer = OptimizerFactory.produce( earlyBatches = optimizerEarlyBatches, mainBatchRules = optimizerMainBatchRules, postBatches = optimizerPostBatches ) @transient override protected[sql] val planner: SparkPlanner with HiveStrategies = new SparkPlanner with HiveStrategies with ExtendedPlanner { def baseStrategies(hiveContext: HiveContext): Seq[Strategy] = Seq( DataSourceStrategy, HiveCommandStrategy(self), HiveDDLStrategy, DDLStrategy, TakeOrderedAndProject, InMemoryScans, HiveTableScans, DataSinks, Scripts, Aggregation, LeftSemiJoin, EquiJoinSelection, BasicOperators, BroadcastNestedLoop, CartesianProduct, DefaultJoin ) override def strategies: Seq[Strategy] = self.strategies(this) ++ experimental.extraStrategies ++ baseStrategies(self) override val hiveContext = self } }
Example 105
Source File: SapHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.sql.{CommonSapSQLContext, SQLContext} import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} class SapHiveContext( @transient sparkContext: SparkContext, cacheManager: CacheManager, listener: SQLListener, @transient execHive: ClientWrapper, @transient metaHive: ClientInterface, isRootContext: Boolean) extends ExtendableHiveContext( sparkContext, cacheManager, listener, execHive, metaHive, isRootContext) with CommonSapSQLContext { def this(sc: SparkContext) = this(sc, new CacheManager, SQLContext.createListenerAndUI(sc), null, null, true) override def newSession(): HiveContext = new SapHiveContext( sparkContext = this.sparkContext, cacheManager = this.cacheManager, listener = this.listener, executionHive.newSession(), metadataHive.newSession(), isRootContext = false) }
Example 106
Source File: ExtendableSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extension import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ParserDialect import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.ExtractPythonUDFs import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog @transient override protected[sql] val planner = // HiveStrategies defines its own strategies, we should be back to SparkPlanner strategies new SparkPlanner with ExtendedPlanner { def baseStrategies: Seq[Strategy] = DataSourceStrategy :: DDLStrategy :: TakeOrderedAndProject :: Aggregation :: LeftSemiJoin :: EquiJoinSelection :: InMemoryScans :: BasicOperators :: BroadcastNestedLoop :: CartesianProduct :: DefaultJoin :: Nil override def strategies: Seq[Strategy] = self.strategies(this) ++ experimental.extraStrategies ++ baseStrategies } }
Example 107
Source File: BasicCurrencyConversionFunction.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.currency._ import org.apache.spark.sql.util.ValidatingPropertyMap._ import scala.util.Try protected object BasicCurrencyConversionConfig { private def updateRatesMapByTable(ratesTable: String, sqlContext: SQLContext): Unit = { val ratesTableData = sqlContext.sql(s"SELECT * FROM $ratesTable").collect() ratesTableData.foreach { row => val from = row.getString(0) val to = row.getString(1) val date = row.getString(2).replaceAll("-", "").toInt val rate = Try(row.getDecimal(3)).recover { case ex: ClassCastException => new java.math.BigDecimal(row.getDouble(3)) }.get ratesMap.put((from, to), date, rate) } } }
Example 108
Source File: SQLRunner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{Logging, SparkContext} import scala.annotation.tailrec protected[cli] case class CLIOptions( sqlFiles: List[String] = Nil, output: Option[String] = None) def main(args: Array[String]): Unit = { def fail(msg: String = USAGE): Unit = { logError(msg) System.exit(1) } val opts = parseOpts(args.toList) val outputStream: OutputStream = opts.output match { case Some(filename) => new FileOutputStream(new File(filename)) case None => System.out } opts.sqlFiles .map((string: String) => new FileInputStream(new File(string))) .foreach(sql(_, outputStream)) } }
Example 109
Source File: GlobalSapSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import com.sap.spark.util.TestUtils import com.sap.spark.{GlobalSparkContext, WithSQLContext} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast} import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ import org.scalatest.Suite import scala.io.Source trait GlobalSapSQLContext extends GlobalSparkContext with WithSQLContext { self: Suite => override implicit def sqlContext: SQLContext = GlobalSapSQLContext._sqlc override protected def setUpSQLContext(): Unit = GlobalSapSQLContext.init(sc) override protected def tearDownSQLContext(): Unit = GlobalSapSQLContext.reset() def getDataFrameFromSourceFile(sparkSchema: StructType, path: File): DataFrame = { val conversions = sparkSchema.toSeq.zipWithIndex.map({ case (field, index) => Cast(BoundReference(index, StringType, nullable = true), field.dataType) }) val data = Source.fromFile(path) .getLines() .map({ line => val stringRow = InternalRow.fromSeq(line.split(",", -1).map(UTF8String.fromString)) Row.fromSeq(conversions.map({ c => c.eval(stringRow) })) }) val rdd = sc.parallelize(data.toSeq, numberOfSparkWorkers) sqlContext.createDataFrame(rdd, sparkSchema) } } object GlobalSapSQLContext { private var _sqlc: SQLContext = _ private def init(sc: SparkContext): Unit = if (_sqlc == null) { _sqlc = TestUtils.newSQLContext(sc) } private def reset(): Unit = { if (_sqlc != null) { _sqlc.catalog.unregisterAllTables() } } }
Example 110
Source File: WithSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import com.sap.spark.util.TestUtils._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} trait WithSparkContext extends BeforeAndAfterAll { self: Suite => override def beforeAll(): Unit = { try { super.beforeAll() setUpSparkContext() } catch { case ex: Throwable => tearDownSparkContext() throw ex } } override def afterAll(): Unit = { try { super.afterAll() } finally { tearDownSparkContext() } } conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") conf.set("spark.shuffle.spill", "false") conf.set("spark.shuffle.compress", "false") conf.set("spark.ui.enabled", "false") conf.set("spark.ui.showConsoleProgress", "false") } def sc: SparkContext protected def setUpSparkContext(): Unit protected def tearDownSparkContext(): Unit }
Example 111
Source File: GlobalSparkContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} } } object GlobalSparkContext { @transient private var _sc: SparkContext = _ def init(sparkMaster: String, sparkConf: SparkConf): Unit = { if (_sc == null) { this.synchronized { if (_sc == null) { _sc = new SparkContext(sparkMaster, "test", sparkConf) } } } } def reset(): Unit = { if (_sc != null) { _sc.cancelAllJobs() } } def close(): Unit = { if (_sc != null) { _sc.stop() _sc = null } } }
Example 112
Source File: TestUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.util import java.util.Locale import scala.io.Source import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{Row, SQLContext, SapSQLContext} import org.apache.spark.sql.hive.SapHiveContext import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table} import org.apache.spark.sql.types.StructType import org.mockito.Matchers._ import org.mockito.Mockito._ import scala.tools.nsc.io.Directory import scala.util.{Failure, Success} def parsePTestFile(fileName: String): List[(String, String, String)] = { val filePath = getFileFromClassPath(fileName) val fileContents = Source.fromFile(filePath).getLines .map(p => p.stripMargin.trim) .filter(p => !p.isEmpty && !p.startsWith("//")) // filter empty rows and comments .mkString("\n") val p = new PTestFileParser // strip semicolons from query and parsed p(fileContents) match { case Success(lines) => lines.map { case (query, parsed, expect) => (stripSemicolon(query).trim, stripSemicolon(parsed).trim, expect.trim) } case Failure(ex) => throw ex } } private def stripSemicolon(sql: String): String = if (sql.endsWith(";")) { sql.substring(0, sql.length-1) } else { sql } def withTempDirectory[A](f: Directory => A): A = { val dir = Directory.makeTemp() try { f(dir) } finally { dir.deleteIfExists() } } }
Example 113
Source File: SQLRunnerSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream} import org.apache.spark.SparkContext import org.apache.spark.sql.{GlobalSapSQLContext, SQLContext} import org.scalatest.{BeforeAndAfterEach, FunSuite, ShouldMatchers} // good call val goodOpts = SQLRunner.parseOpts(List("a.sql", "b.sql", "-o", "output.csv")) goodOpts.sqlFiles should be(List("a.sql", "b.sql")) goodOpts.output should be(Some("output.csv")) // bad call val badOpts = SQLRunner.parseOpts(List()) badOpts.sqlFiles should be(List()) badOpts.output should be(None) // ugly call val uglyOpts = SQLRunner.parseOpts(List("a.sql", "-o", "output.csv", "b.sql")) uglyOpts.sqlFiles should be(List("a.sql", "b.sql")) uglyOpts.output should be(Some("output.csv")) } def runSQLTest(input: String, expectedOutput: String): Unit = { val inputStream: InputStream = new ByteArrayInputStream(input.getBytes()) val outputStream = new ByteArrayOutputStream() SQLRunner.sql(inputStream, outputStream) val output = outputStream.toString output should be(expectedOutput) } test("can run dummy query") { val input = "SELECT 1;" val output = "1\n" runSQLTest(input, output) } test("can run multiple dummy queries") { val input = """ |SELECT 1;SELECT 2; |SELECT 3; """.stripMargin val output = "1\n2\n3\n" runSQLTest(input, output) } test("can run a basic example with tables") { val input = """ |SELECT * FROM DEMO_TABLE; |SELECT * FROM DEMO_TABLE LIMIT 1; |DROP TABLE DEMO_TABLE; """.stripMargin val output = "1,a\n2,b\n3,c\n1,a\n" runSQLTest(input, output) } test("can run an example with comments") { val input = """ |SELECT * FROM DEMO_TABLE; -- this is the first query |SELECT * FROM DEMO_TABLE LIMIT 1; |-- now let's drop a table |DROP TABLE DEMO_TABLE; """.stripMargin val output = "1,a\n2,b\n3,c\n1,a\n" runSQLTest(input, output) } }
Example 114
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 115
Source File: Preparator.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PPreparator import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.sql.DataFrame import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.sql.SQLContext import org.apache.spark.mllib.linalg.Vectors class PreparedData( val rows: DataFrame, val dsp: DataSourceParams, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ if (trainingData.dsp.useStandardScaler) { val training = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features") val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(trainingData.dsp.standardScalerWithStd).setWithMean(trainingData.dsp.standardScalerWithMean) val scalerModel = scaler.fit(training) val scaledData = scalerModel.transform(training) val s1 = scaledData.select("label","censor","scaledFeatures").withColumnRenamed("scaledFeatures","features") //Prepare old StandardScaler val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(withMean = trainingData.dsp.standardScalerWithMean, withStd = trainingData.dsp.standardScalerWithStd) val oldSSModel = oldScaler.fit(trainingData.rows.map(x=>(Vectors.dense(x._3)))) new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel) } else { new PreparedData(rows = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features"), dsp = trainingData.dsp, ssModel = null) } } }
Example 116
Source File: DataSource.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PDataSource import org.apache.predictionio.controller.EmptyEvaluationInfo import org.apache.predictionio.controller.EmptyActualResult import org.apache.predictionio.controller.Params import org.apache.predictionio.data.store.PEventStore import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger case class DataSourceParams( val appName: String, val useStandardScaler: Boolean, val standardScalerWithStd: Boolean, val standardScalerWithMean: Boolean ) extends Params class DataSource(val dsp: DataSourceParams) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, EmptyActualResult] { @transient lazy val logger = Logger[this.type] override def readTraining(sc: SparkContext): TrainingData = { println("Gathering data from event server.") val rowsRDD: RDD[(Double, Double, Array[Double])] = PEventStore.find( appName = dsp.appName, entityType = Some("row"), startTime = None, eventNames = Some(List("$set")))(sc).map { event => try { (event.properties.get[Double]("label"), event.properties.get[Double]("censor"), event.properties.get[Array[Double]]("features")) } catch { case e: Exception => { logger.error(s"Failed to convert event ${event} of. Exception: ${e}.") throw e } } } new TrainingData(rowsRDD, dsp) } } class TrainingData( val rows: RDD[(Double, Double, Array[Double])], val dsp: DataSourceParams ) extends Serializable
Example 117
Source File: SRAlgorithm.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.linalg.{Vectors,DenseVector} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.regression.{AFTSurvivalRegression,AFTSurvivalRegressionModel} case class AlgorithmParams( val quantileProbabilities: Array[Double], val fitIntercept: Boolean, val maxIter: Int, val convTolerance: Double ) extends Params class SRModel( val aAFTSRModel: AFTSurvivalRegressionModel, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel, val useStandardScaler: Boolean ) extends Serializable {} class SRAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, SRModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): SRModel = { println("Training SR model.") val aft = new AFTSurvivalRegression().setQuantileProbabilities(ap.quantileProbabilities).setQuantilesCol("quantiles").setFitIntercept(ap.fitIntercept).setMaxIter(ap.maxIter).setTol(ap.convTolerance) val model = aft.fit(data.rows) new SRModel(aAFTSRModel = model, ssModel=data.ssModel, useStandardScaler = data.dsp.useStandardScaler) } def predict(model: SRModel, query: Query): PredictedResult = { val qryRow0 = Vectors.dense(query.features) val qryRow = if (model.useStandardScaler) { model.ssModel.transform(qryRow0) } else { qryRow0 } val score = model.aAFTSRModel.predict(qryRow) val quantilesVec = model.aAFTSRModel.predictQuantiles(qryRow) PredictedResult(coefficients = model.aAFTSRModel.coefficients.toArray, intercept = model.aAFTSRModel.intercept, scale = model.aAFTSRModel.scale, prediction = score, quantiles = quantilesVec.toArray) } }
Example 118
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 119
Source File: LocalSparkContext.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package test.util import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.BeforeAndAfterEach import org.scalatest._ trait LocalSparkContext extends BeforeAndAfterEach { self: Suite => @transient private var _sc: SparkContext = _ val _sparkConf = new SparkConf(false) .set("spark.ui.showConsoleProgress", "false") def sc: SparkContext = _sc override def beforeEach() { _sc = new SparkContext("local[4]", "test", _sparkConf) super.beforeEach() } override def afterEach() { resetSparkContext() super.afterEach() } def resetSparkContext(): Unit = { LocalSparkContext.stop(_sc) _sc = null } } object LocalSparkContext { def stop(sc: SparkContext) { if (sc != null) { sc.stop() } // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") } def withSpark[T](sc: SparkContext)(f: SparkContext => T): T = { try { f(sc) } finally { stop(sc) } } }
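A hedged sketch of how the trait is intended to be mixed into a ScalaTest suite; the suite name and test body are made up for illustration:

import org.scalatest.FunSuite

class WordCountSpec extends FunSuite with LocalSparkContext {
  test("counts words with a fresh local context per test") {
    // `sc` is created in beforeEach and stopped in afterEach by the trait.
    val counts = sc.parallelize(Seq("a", "b", "a")).map((_, 1)).reduceByKey(_ + _).collectAsMap()
    assert(counts("a") == 2)
  }
}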
Example 121
Source File: ThriftRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.thrift import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.SparkContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.{TBase, TFieldIdEnum, TSerializer} class ThriftRandomExtractor extends Extractor { var count: Int = 1 var thriftType: Class[_] = null var serializer: TSerializer = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val className = userConfig.getConfigString("className") match { case Some(s) => s case None => throw new IllegalArgumentException("className must be set in the config") } thriftType = Class.forName(className) serializer = new TSerializer(new TBinaryProtocol.Factory()) count = userConfig.getConfigInt("count").getOrElse(1) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val thriftObject = ThriftRandomGenerator.next(thriftType).asInstanceOf[TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]] serializer.serialize(thriftObject) }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 122
Source File: Configuration.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop import java.io.{ ObjectInputStream, ObjectOutputStream } import org.apache.hadoop.conf import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration } import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.hammerlab.hadoop.kryo.WritableSerializer import org.hammerlab.kryo._ class Configuration(@transient var value: HadoopConfiguration) extends Serializable { private def writeObject(out: ObjectOutputStream): Unit = { value.write(out) } private def readObject(in: ObjectInputStream): Unit = { value = new HadoopConfiguration(false) value.readFields(in) } } object Configuration extends Registrar { def apply(loadDefaults: Boolean = true): Configuration = new HadoopConfiguration(loadDefaults) def apply(conf: HadoopConfiguration): Configuration = new Configuration(conf) implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration = apply(conf) implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration = conf.value implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration = confBroadcast.value implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration = sc.hadoopConfiguration implicit class Ops(val conf: HadoopConfiguration) extends AnyVal { def serializable: Configuration = conf } register( cls[conf.Configuration] → new WritableSerializer[conf.Configuration], cls[Configuration] → serializeAs[Configuration, conf.Configuration] ) }
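A short usage sketch relying only on the implicits defined above; it assumes an existing SparkContext named sc:

import org.hammerlab.hadoop.Configuration

val serializableConf: Configuration = sc.hadoopConfiguration   // wrapped via the implicit conversions above
val confBroadcast = sc.broadcast(serializableConf)              // safe to broadcast: writeObject/readObject handle serialization

sc.parallelize(1 to 2).foreach { _ =>
  // Unwrapped back to a plain Hadoop Configuration on the executor.
  val hadoopConf: org.apache.hadoop.conf.Configuration = confBroadcast.value
  // ... use hadoopConf to open a FileSystem, etc.
}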
Example 123
Source File: Histogram.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark.accumulator import org.apache.spark.SparkContext import org.apache.spark.util.AccumulatorV2 import scala.collection.immutable.SortedMap import scala.collection.mutable case class Histogram[T: Ordering](var map: mutable.Map[T, Long] = mutable.Map.empty[T, Long]) extends AccumulatorV2[T, SortedMap[T, Long]] { override def isZero: Boolean = map.isEmpty override def copy(): AccumulatorV2[T, SortedMap[T, Long]] = Histogram(map.clone()) override def reset(): Unit = map = mutable.Map.empty[T, Long] override def add(k: T): Unit = map.update( k, map.getOrElse(k, 0L) + 1 ) override def merge(other: AccumulatorV2[T, SortedMap[T, Long]]): Unit = for { (k, v) ← other.value } { map.update(k, map.getOrElse(k, 0L) + v) } override def value: SortedMap[T, Long] = SortedMap(map.toSeq: _*) } object Histogram { def apply[T: Ordering](name: String)(implicit sc: SparkContext): Histogram[T] = { val a = Histogram[T]() sc.register(a, name) a } }
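A minimal driver-side sketch, assuming an existing SparkContext named sc:

import org.apache.spark.SparkContext
import org.hammerlab.spark.accumulator.Histogram

implicit val context: SparkContext = sc            // picked up by Histogram.apply for registration
val wordLengths = Histogram[Int]("word-lengths")   // registers the accumulator under that name

sc.parallelize(Seq("spark", "rdd", "scala")).foreach(w => wordLengths.add(w.length))
println(wordLengths.value)                         // SortedMap(3 -> 1, 5 -> 2)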
Example 124
Source File: Context.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.{ SparkConf, SparkContext } import org.hammerlab.hadoop.Configuration case class Context(@transient sc: SparkContext) extends Configuration(sc.hadoopConfiguration) object Context { implicit def makeContext(sc: SparkContext): Context = Context(sc) implicit def deriveContext(implicit sc: SparkContext): Context = Context(sc) implicit def umakeContext(context: Context): SparkContext = context.sc def apply()(implicit conf: SparkConf): Context = Context( new SparkContext( conf ) ) }
Example 125
Source File: ContextTest.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import hammerlab.Suite import org.apache.spark.SparkContext class ContextTest extends Suite with ConfSuite { implicit val sc = new SparkContext(conf) def withContext(implicit ctx: Context) = {} test("derive") { // exercise implicit conversion and derivation from SparkContext withContext(sc) withContext } override protected def afterAll(): Unit = { sc.stop() super.afterAll() } }
Example 126
Source File: ContextSuite.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkContext import org.hammerlab.test.Suite trait ConfSuite extends SparkConfBase { implicit lazy val conf = makeSparkConf sparkConf( "spark.master" → s"local[4]", "spark.app.name" → getClass.getName, "spark.driver.host" → "localhost", "spark.kryo.classesToRegister" → "org.apache.spark.internal.io.FileCommitProtocol$TaskCommitMessage" ) } abstract class ContextSuite extends Suite with ConfSuite with SelfRegistrar { private var _sc: Context = _ implicit lazy val sc = { _sc = Context() _sc } implicit def sparkContext: SparkContext = sc override def afterAll(): Unit = { // Do this before the super delegation, which will remove the temporary event-log dir if (_sc != null) _sc.stop() super.afterAll() } }
Example 127
Source File: KeyPartitionerTest.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark import org.apache.spark.SparkContext import org.hammerlab.test.Suite class KeyPartitionerTest extends Suite { test("basic calls") { KeyPartitioner(456).getPartition(123) should be(123) KeyPartitioner(456).getPartition(123 → "abc") should be(123) intercept[UnexpectedKey] { KeyPartitioner(456).getPartition("abc") }.key should be("abc") } test("partitioner") { val partitioner = Partitioner[Either[Int, String]]( 2, { case Left(n) ⇒ 0 case Right(str) ⇒ 1 } ) partitioner.getPartition(Left(222)) should be(0) partitioner.getPartition(Right("abc")) should be(1) intercept[UnexpectedKey] { partitioner.getPartition(333) }.key should be(333) intercept[UnexpectedKey] { partitioner.getPartition("ddd") }.key should be("ddd") } }
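KeyPartitioner itself is not shown in this snippet; a hedged reconstruction of the behaviour implied by the assertions (Int keys and (Int, _) tuples map to their own index, anything else raises UnexpectedKey) might look like the following. This is inferred from the test only, not copied from the library's source:

import org.apache.spark.Partitioner

case class UnexpectedKey(key: Any) extends IllegalArgumentException(s"unexpected key: $key")

case class KeyPartitioner(numPartitions: Int) extends Partitioner {
  override def getPartition(key: Any): Int = key match {
    case i: Int      => i
    case (i: Int, _) => i
    case other       => throw UnexpectedKey(other)
  }
}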
Example 128
Source File: Sessionize.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter06 import java.io._ import java.time.ZoneOffset import java.time.LocalDateTime import java.time.format.DateTimeFormatter import org.apache.spark.{SparkConf,SparkContext} import org.apache.spark.storage.StorageLevel object Sessionize extends App { val sc = new SparkContext("local[8]", "Sessionize", new SparkConf()) val checkoutPattern = ".*>checkout.*".r.pattern // a basic page view structure case class PageView(ts: String, path: String) extends Serializable with Ordered[PageView] { override def toString: String = { s"($ts #$path)" } def compare(other: PageView) = ts compare other.ts } // represent a session case class Session[A <: PageView](id: String, visits: Seq[A]) extends Serializable { override def toString: String = { val vsts = visits.mkString("[", ",", "]") s"($id -> $vsts)" } } def toEpochSeconds(str: String) = { LocalDateTime.parse(str, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")).toEpochSecond(ZoneOffset.UTC) } val sessions = sc.textFile("data/clickstream") .map(line => {val parts = line.split("\t"); (parts(4), new PageView(parts(0), parts(20)))}) .groupByKey.map(x => { new Session(x._1, x._2.toSeq.sorted) } ) .cache // sessions.take(100).foreach(println) def findAllCheckoutSessions(s: Session[PageView]) = { s.visits.tails.filter { _ match { case PageView(ts1, "mycompanycom>homepage") :: PageView(ts2, page) :: tail if (page != "mycompanycom>homepage" ) => true; case _ => false } } .foldLeft(Seq[Session[PageView]]()) { case (r, x) => { x.find(y => checkoutPattern.matcher(y.path).matches) match { case Some(checkout) if (toEpochSeconds(checkout.ts) > toEpochSeconds(x.head.ts) + 60) => r.:+(new Session(s.id, x.slice(0, x.indexOf(checkout)))) case _ => r } } } } val prodLandingSessions = sessions.flatMap(findAllCheckoutSessions) prodLandingSessions.collect.foreach(println) sc.stop() }
Example 129
Source File: CustomPartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class CustomPartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner") { //given val numberOfExecutors = 2 val data = spark .parallelize(List( UserTransaction("a", 100), UserTransaction("b", 101), UserTransaction("a", 202), UserTransaction("b", 1), UserTransaction("c", 55) ) ).keyBy(_.userId) .partitionBy(new Partitioner { override def numPartitions: Int = numberOfExecutors override def getPartition(key: Any): Int = { key.hashCode % numberOfExecutors } }) println(data.partitions.length) //when val res = data.mapPartitions[Long](iter => iter.map(_._2).map(_.amount) ).collect().toList //then res should contain theSameElementsAs List(55, 100, 202, 101, 1) } }
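One caveat with the anonymous partitioner above: hashCode can be negative for String keys, and a negative remainder is not a valid partition id. A hedged variant that guards against this:

import org.apache.spark.Partitioner

// Same idea as the anonymous partitioner in the test, but with a non-negative modulus
// so the result always falls in [0, numPartitions).
class NonNegativeHashPartitioner(override val numPartitions: Int) extends Partitioner {
  override def getPartition(key: Any): Int = {
    val mod = key.hashCode % numPartitions
    if (mod < 0) mod + numPartitions else mod
  }
}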
Example 130
Source File: ExecutionPlanForJoins.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class ExecutionPlanForJoins extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner while join") { //given val transactions = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car"))) val persons = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny"))) //when val personsDataPartitioner = transactions.partitioner match { case Some(p) => p case None => new HashPartitioner(persons.partitions.length) } val res = persons.join(transactions, personsDataPartitioner).collect().toList res should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag"))) } test("can broadcast small data set to every executor and join in-memory") { //given val smallDataSet = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car"))) val hugeDataSet = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny"))) //when broadcast small rdd to all executors val smallInMemoryDataSet = spark.broadcast(smallDataSet.collectAsMap()) //then join will not need to do shuffle val res = hugeDataSet.mapPartitions(iter => { iter.flatMap { case (k, v1) => smallInMemoryDataSet.value.get(k) match { case None => Seq.empty case Some(v2) => Seq((k, (v1, v2))) } } }) res.collect().toList should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag"))) } }
Example 131
Source File: IntegrationTesting.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_6 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class IntegrationTesting extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Integration testing of already unit-tested logic") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) //when val aggregatedTransactionsForUserId = data.filter(BonusVerifier.qualifyForBonus) //then aggregatedTransactionsForUserId.collect().toList should contain theSameElementsAs List( UserTransaction("A", 100001) ) } }
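BonusVerifier.qualifyForBonus comes from elsewhere in the project and is not shown here; a hypothetical stand-in consistent with the assertion (only the 100001 transaction qualifies) could be:

import com.tomekl007.UserTransaction

object BonusVerifier {
  private val threshold = 100000   // hypothetical cut-off, inferred from the test data only
  def qualifyForBonus(t: UserTransaction): Boolean = t.amount > threshold
}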
Example 132
Source File: MockingDataSources.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_6 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.{FunSuite, Ignore} class MockingDataSources extends FunSuite { val spark = SparkSession.builder().master("local[2]").getOrCreate() ignore("loading data on prod from hive") { UserDataLogic.loadAndGetAmount(spark, HiveDataLoader.loadUserTransactions) } test("mock loading data from hive"){ //given import spark.sqlContext.implicits._ val df = spark.sparkContext .makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) .toDF() //when val res = UserDataLogic.loadAndGetAmount(spark, _ => df) //then res.show() } } object UserDataLogic { def loadAndGetAmount(sparkSession: SparkSession, provider: SparkSession => DataFrame): DataFrame = { val df = provider(sparkSession) df.select(df("amount")) } } object HiveDataLoader { def loadUserTransactions(sparkSession: SparkSession): DataFrame = { sparkSession.sql("select * from transactions") } }
Example 133
Source File: InheritanceRdd.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_2 import com.example.{MultipliedRDD, Record} import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class InheritanceRdd extends FunSuite { val spark: SparkContext = SparkSession .builder().master("local[2]").getOrCreate().sparkContext test("use extended RDD") { //given val rdd = spark.makeRDD(List(Record(1, "d1"))) val extendedRdd = new MultipliedRDD(rdd, 10) extendedRdd.collect().toList should contain theSameElementsAs List( Record(10, "d1") ) } }
Example 134
Source File: ImmutableRDD.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_2 import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ImmutableRDD extends FunSuite { val spark: SparkContext = SparkSession .builder().master("local[2]").getOrCreate().sparkContext test("RDD should be immutable") { //given val data = spark.makeRDD(0 to 5) //when val res = data.map(_ * 2) //then res.collect().toList should contain theSameElementsAs List( 0, 2, 4, 6, 8, 10 ) data.collect().toList should contain theSameElementsAs List( 0, 1, 2, 3, 4, 5 ) } }
Example 135
Source File: SaveJSON.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.Matchers._ import org.scalatest.{BeforeAndAfterEach, FunSuite} import scala.reflect.io.Path class SaveJSON extends FunSuite with BeforeAndAfterEach { val spark = SparkSession.builder().master("local[2]").getOrCreate() private val FileName = "transactions.json" override def afterEach() { val path = Path(FileName) path.deleteRecursively() } test("should save and load in JSON") { //given import spark.sqlContext.implicits._ val rdd = spark.sparkContext .makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) .toDF() //when rdd.coalesce(1).write.format("json").save(FileName) val fromFile = spark.read.json(FileName) fromFile.show() assert(fromFile.count() == 2) } }
Example 136
Source File: SavePlainText.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import java.io.File import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.{BeforeAndAfterEach, FunSuite} import org.scalatest.Matchers._ import scala.reflect.io.Path class SavePlainText extends FunSuite with BeforeAndAfterEach{ val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val FileName = "transactions.txt" override def afterEach() { val path = Path (FileName) path.deleteRecursively() } test("should save and load in plain text") { //given val rdd = spark.makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) //when rdd.coalesce(1).saveAsTextFile(FileName) val fromFile = spark.textFile(FileName) fromFile.collect().toList should contain theSameElementsAs List( "UserTransaction(a,100)", "UserTransaction(b,200)" //note - this is string! ) } }
Example 137
Source File: ReUseWithCheckpoint.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.scalatest.FunSuite class ReUseWithCheckpoint extends FunSuite { private val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val checkpointEnabled = true private val storageLevel = StorageLevel.MEMORY_AND_DISK test("should use checkpoint for re-usability of RDD") { //given val sortedRDD = spark.makeRDD(List(1, 2, 5, 77, 888)) if (storageLevel != StorageLevel.NONE) { sortedRDD.persist(storageLevel) } if (checkpointEnabled) { sortedRDD.sparkContext.setCheckpointDir("hdfs://tmp/checkpoint") sortedRDD.checkpoint() } //when performALotOfExpensiveComputations(sortedRDD) //then sortedRDD.collect().toList } def performALotOfExpensiveComputations(sortedRDD: RDD[Int]): Unit = { //.... sortedRDD.count() //failure sortedRDD.collect() } }
Example 138
Source File: CreatingGraph.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class CreatingGraph extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should load graph from a file") { //given val path = getClass.getResource("/graph.g").getPath //when val graph = GraphBuilder.loadFromFile(spark, path) //then graph.triplets.foreach(println(_)) assert(graph.triplets.count() == 4) } }
Example 139
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class VertexAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Vertex API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapVertices((_, att) => att.toUpperCase()) res.vertices.collect().toList } }
Example 140
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class EdgeAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Edge API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapEdges(e => e.attr.toUpperCase) println(res.edges.collect().toList) } }
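As a follow-up, the vertex and edge transforms from the two tests above can be combined against the same graph; a sketch that would sit inside the test body, reusing the graph built there:

// Uppercase edge attributes via mapTriplets (which also exposes src/dst attributes),
// then uppercase vertex attributes as in the previous test.
val upperEdges = graph.mapTriplets(triplet => triplet.attr.toUpperCase)
val upperBoth = upperEdges.mapVertices((_, attr) => attr.toUpperCase)
upperBoth.triplets.collect().foreach(println)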
Example 141
Source File: ReduceAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ReduceAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use reduce API") { //given val input = spark.makeRDD(List( UserTransaction("A", 10), UserTransaction("B", 1), UserTransaction("A", 101) )) //when val result = input .map(_.amount) .reduce((a, b) => if (a > b) a else b) //then assert(result == 101) } test("should use reduceByKey API") { //given val input = spark.makeRDD( List( UserTransaction("A", 10), UserTransaction("B", 1), UserTransaction("A", 101) ) ) //when val result = input .keyBy(_.userId) .reduceByKey((firstTransaction, secondTransaction) => TransactionChecker.higherTransactionAmount(firstTransaction, secondTransaction)) .collect() .toList //then result should contain theSameElementsAs List(("B", UserTransaction("B", 1)), ("A", UserTransaction("A", 101))) } } object TransactionChecker { def higherTransactionAmount(firstTransaction: UserTransaction, secondTransaction: UserTransaction): UserTransaction = { if (firstTransaction.amount > secondTransaction.amount) firstTransaction else secondTransaction } }
Example 142
Source File: TriggerComputationsReusingRDD.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class TriggerComputationsReusingRDD extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions without reuse") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) //then every call to action means that we are going up to the RDD chain //if we are loading data from external file-system (I.E.: HDFS), every action means //that we need to load it from FS. val start = System.currentTimeMillis() println(rdd.collect().toList) println(rdd.count()) println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) val result = System.currentTimeMillis() - start println(s"time taken (no-cache): $result") } test("should trigger computations using actions with reuse") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) .cache() //then every call to action means that we are going up to the RDD chain //if we are loading data from external file-system (I.E.: HDFS), every action means //that we need to load it from FS. val start = System.currentTimeMillis() println(rdd.collect().toList) println(rdd.count()) println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) val result = System.currentTimeMillis() - start println(s"time taken(cache): $result") } }
Example 143
Source File: TriggerComputations.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class TriggerComputations extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.amount) //then println(rdd.collect().toList) println(rdd.count()) //and all count* println(rdd.first()) rdd.foreach(println(_)) rdd.foreachPartition(t => t.foreach(println(_))) println(rdd.max()) println(rdd.min()) println(rdd.takeOrdered(1).toList) println(rdd.takeSample(false, 2).toList) //all reduce will be covered in separate video } }
Example 144
Source File: DeferComputations.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.InputRecord import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class DeferComputations extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should defer computations") { //given val input = spark.makeRDD( List(InputRecord(userId = "A"), InputRecord(userId = "B"))) //when apply transformation val rdd = input .filter(_.userId.contains("A")) .keyBy(_.userId) .map(_._2.userId.toLowerCase) //.... built processing graph lazy if (shouldExecutePartOfCode()) { //rdd.saveAsTextFile("") || rdd.collect().toList } else { //condition changed - don't need to evaluate DAG } } private def shouldExecutePartOfCode(): Boolean = { //domain logic that decide if we still need to calculate true } }
Example 145
Source File: GroupByKey.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_1 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class GroupByKey extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should trigger computations using actions") { //given val input = spark.makeRDD( List( UserTransaction(userId = "A", amount = 1001), UserTransaction(userId = "A", amount = 100), UserTransaction(userId = "A", amount = 102), UserTransaction(userId = "A", amount = 1), UserTransaction(userId = "B", amount = 13))) //when apply transformation val rdd = input .groupBy(_.userId) .map(x => (x._1,x._2.toList)) .collect() .toList //then rdd should contain theSameElementsAs List( ("B", List(UserTransaction("B", 13))), ("A", List( UserTransaction("A", 1001), UserTransaction("A", 100), UserTransaction("A", 102), UserTransaction("A", 1)) ) ) } }
Example 146
Source File: UsePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext} import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class UsePartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use different partitioners") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when, then val partitioner = keyed.partitioner assert(partitioner.isEmpty) val hashPartitioner = keyed.partitionBy(new HashPartitioner(100)) println(hashPartitioner) assert(hashPartitioner.partitioner.isDefined) val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed)) println(rangePartitioner) assert(rangePartitioner.partitioner.isDefined) } }
Example 147
Source File: AggregateByKey.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class AggregateByKey extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use aggregateByKey instead of groupBy to reduce shuffle") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) val amountForUser = mutable.ArrayBuffer.empty[Long] val addAmount = (responseTimes: mutable.ArrayBuffer[Long], transaction: UserTransaction) => responseTimes += transaction.amount val mergeAmounts = (p1: mutable.ArrayBuffer[Long], p2: mutable.ArrayBuffer[Long]) => p1 ++= p2 //when val aggregatedTransactionsForUserId = keyed .aggregateByKey(amountForUser)(addAmount, mergeAmounts) //then aggregatedTransactionsForUserId.collect().toList should contain theSameElementsAs List( ("A", ArrayBuffer(100, 100001)), ("B", ArrayBuffer(4,10)), ("C", ArrayBuffer(10))) } }
Example 148
Source File: TransformationsOnPairs.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class TransformationsOnPairs extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use transformation on k/v pair") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when val counted = keyed.countByKey() // keyed.combineByKey() // keyed.aggregateByKey() // keyed.foldByKey() // keyed.groupByKey() //then counted should contain theSameElementsAs Map("B" -> 2, "A" -> 2, "C" -> 1) } }
Example 149
Source File: ActionsOnPairs.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.scalatest.Matchers._ class ActionsOnPairs extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use action to see k/v data format after collect") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.userId) //when val res = keyed.collect().toList //then res should contain theSameElementsAs List( ("A",UserTransaction("A",100)), ("B",UserTransaction("B",4)), ("A",UserTransaction("A",100001)), ("B",UserTransaction("B",10)), ("C",UserTransaction("C",10)) )//note duplicated key } }
Example 150
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext} import org.scalatest.FunSuite class CustomRangePartitionerTest extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom range partitioner") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.amount) //when, then val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000)))) //then partitioned.collect().toList } } class CustomRangePartitioner(ranges: List[(Int,Int)]) extends Partitioner{ override def numPartitions: Int = ranges.size override def getPartition(key: Any): Int = { if(!key.isInstanceOf[Int]){ throw new IllegalArgumentException("partitioner works only for Int type") } val keyInt = key.asInstanceOf[Int] val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2) println(s"for key: $key return $index") index } }
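One detail worth noting in CustomRangePartitioner: lastIndexWhere returns -1 when a key falls outside every range, which Spark would reject as a partition id. A hedged variant that fails fast instead:

import org.apache.spark.Partitioner

class SafeRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size
  override def getPartition(key: Any): Int = key match {
    case keyInt: Int =>
      val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
      if (index >= 0) index
      else throw new IllegalArgumentException(s"no range covers key $keyInt")
    case _ =>
      throw new IllegalArgumentException("partitioner works only for Int type")
  }
}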
Example 151
Source File: BulkTableWriter.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.bdp.spark.writer import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import com.datastax.spark.connector._ import com.datastax.spark.connector.writer._ import java.nio.file.{Path, Files} import scala.language.implicitConversions object BulkTableWriter{ implicit def toBulkTableWriter[T](rdd: RDD[T]): BulkTableWriter[T] = new BulkTableWriter(rdd) } class BulkTableWriter[T](rdd: RDD[T]) { def bulkSaveToCassandra(keyspaceName: String, tableName: String, columns: ColumnSelector = AllColumns, writeConf: BulkWriteConf = BulkWriteConf()): Unit = { throw new UnsupportedOperationException } } case class BulkWriteConf(outputDirectory: Option[Path] = None, deleteSource: Boolean = true, bufferSizeInMB: Int = 64)
Example 152
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 153
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
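The combineByKey call above computes a per-key average of star ratings. The same pattern on a plain RDD makes the three functions concrete; a standalone sketch, assuming an existing SparkContext named sc:

val ratings = sc.parallelize(Seq(("free", 4), ("free", 2), ("paid", 5)))
val averages = ratings.combineByKey(
    (v: Int) => (v, 1),                                            // createCombiner: first value seen for a key
    (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),         // mergeValue: fold another value into (sum, count)
    (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)   // mergeCombiners: merge partial (sum, count) pairs
  )
  .map { case (k, (sum, count)) => (k, sum / count.toFloat) }
averages.collect()   // Array((free,3.0), (paid,5.0))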
Example 154
Source File: L5-7MultipleSocketStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearMultiApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: TripByYearMultiApp <appname> <hostname> <base_port> <num_of_sockets>") System.exit(1) } val Seq(appName, hostname, basePort, nSockets) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) val uniStream = ssc.union(streams) uniStream .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 155
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.mqtt.MQTTUtils object YearlyDistributionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>") System.exit(1) } val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => rec.split(",")) .map(rec => (rec(1).split(" ")(0), 1)) .updateStateByKey(statefulCount) .map(pair => (pair._2, pair._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("YearlyDistribution") ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
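The statefulCount function passed to updateStateByKey keeps a running total per key across batches; its behaviour is easy to check in isolation (a sketch, not tied to the streaming context):

val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0))

statefulCount(Seq(1, 1, 1), None)   // Some(3): first batch seen for this key
statefulCount(Seq(2), Some(3))      // Some(5): a later batch adds to the accumulated state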
Example 156
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 157
Source File: L5-6SocketStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: TripByYearApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 158
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 159
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 160
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 161
Source File: L5-18Http.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HttpApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: HttpApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) .flatMap(rec => (parse(rec) \ "stationBeanList").children) .filter(rec => { implicit val formats = DefaultFormats (rec \ "statusKey").extract[Integer] != 1 }) .map(rec => rec.filterField { case JField("id", _) => true case JField("stationName", _) => true case JField("statusValue", _) => true case _ => false }) .map(rec => { implicit val formats = DefaultFormats (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) }) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 162
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 163
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 164
Source File: L7-4UI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object SocialSearchApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: SocialSearchApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "/tmp/historical") val countSearch = new AtomicLong(0) val countSocial = new AtomicLong(0) val ssc = new StreamingContext(conf, Seconds(1)) val titleStream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .filter(_(3) match { case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => (rec(3), rec(4))) .cache() val searchStream = titleStream.filter(_._1 match { case "other-google" | "other-bing" | "other-yahoo" => true case _ => false }) .map(rec => rec._2) val socialStream = titleStream.filter(_._1 match { case "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => rec._2) val exclusiveSearch = searchStream.transformWith(socialStream, (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) .foreachRDD(rdd => { countSearch.addAndGet(rdd.count()) println("Exclusive count search engines: " + countSearch) }) val exclusiveSocial = socialStream.transformWith(searchStream, (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) .foreachRDD(rdd => { countSocial.addAndGet(rdd.count()) println("Exclusive count social media: " + countSocial) }) ssc.start() ssc.awaitTermination() } }
Example 165
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 166
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerAppKryo { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[ProtonFlux])) val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val projected = voyager1.map(rec => { val attrs = rec.split("\\s+") new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28)) }) val filtered = projected.filter(pflux => pflux.isSolarStorm) val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false)) yearlyBreakdown.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 167
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 168
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 169
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 170
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 171
Source File: Index.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.main import spatialspark.index.IndexConf import spatialspark.index.STIndex import org.apache.spark.{SparkConf, SparkContext} object Index { val usage = """ create spatial index on dataset Usage: index --input input path --geom geometry index (default 0) --output output path --conf configuration (dimX:dimY:ratio) --help """ def main(args: Array[String]) { if (args.length == 0) println(usage) val arglist = args.toList type OptionMap = Map[Symbol, Any] def nextOption(map: OptionMap, list: List[String]): OptionMap = { list match { case Nil => map case "--help" :: tail => println(usage) sys.exit(0) case "--input" :: value :: tail => nextOption(map ++ Map('input -> value), tail) case "--geom" :: value :: tail => nextOption(map ++ Map('geom -> value.toInt), tail) case "--output" :: value :: tail => nextOption(map ++ Map('output -> value), tail) case "--conf" :: value :: tail => nextOption(map = map ++ Map('conf -> value), list = tail) case option :: tail => println("Unknown option " + option) sys.exit(1) } } val options = nextOption(Map(), arglist) val conf = new SparkConf().setAppName("Build Index") conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryo.registrator", "spatialspark.util.KyroRegistrator") val sc = new SparkContext(conf) val inputFile = options.getOrElse('input, "").asInstanceOf[String] val outputFile = options.getOrElse('output, "").asInstanceOf[String] val methodConf = options.getOrElse('conf, "").asInstanceOf[String] val SEPARATOR = "\t" val geometryIndex = options.getOrElse('geom, 0).asInstanceOf[Int] val dimX = methodConf.split(":").apply(0).toInt val dimY = methodConf.split(":").apply(1).toInt val ratio = methodConf.split(":").apply(2).toDouble val indexConf = new IndexConf(inputFile, outputFile, SEPARATOR, geometryIndex, dimX, dimY, ratio) val timerBegin = System.currentTimeMillis() STIndex.build(sc, indexConf) val timerEnd = System.currentTimeMillis() println("index time: " + (timerEnd - timerBegin) + " ms") } }
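Note on the --conf flag above: its value is a dimX:dimY:ratio triple that nextOption stores as a plain string and that main later splits on ':'. The sketch below shows how a hypothetical argument vector is folded into the option map; every path and number in it is a made-up placeholder.

val sampleArgs = List(
  "--input", "hdfs:///data/points.tsv",
  "--geom", "0",
  "--output", "hdfs:///data/points_index",
  "--conf", "32:32:0.3")
// nextOption(Map(), sampleArgs) yields:
//   Map('input -> "hdfs:///data/points.tsv", 'geom -> 0,
//       'output -> "hdfs:///data/points_index", 'conf -> "32:32:0.3")
// and "32:32:0.3" is then split into dimX = 32, dimY = 32, ratio = 0.3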
Example 172
Source File: RangeQuery.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.query import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.STRtree import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD class RangeQuery extends Serializable { } object RangeQuery { def apply(sc: SparkContext, geometryWithId: RDD[(Long, Geometry)], filterGeometry: Geometry, operator: SpatialOperator, radius: Double = 0): RDD[(Long, Geometry)] = { if (operator == SpatialOperator.Contains) geometryWithId.filter(_._2.contains(filterGeometry)) else if (operator == SpatialOperator.Within) geometryWithId.filter(_._2.within(filterGeometry)) else if (operator == SpatialOperator.WithinD) geometryWithId.filter(_._2.isWithinDistance(filterGeometry, radius)) else { // TODO: raise an error for unsupported operators sc.emptyRDD } } }
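A minimal usage sketch for the filter-based range query above, assuming an existing SparkContext named sc; the WKT-built point and polygon geometries are invented placeholders.

import com.vividsolutions.jts.geom.Geometry
import com.vividsolutions.jts.io.WKTReader
import org.apache.spark.rdd.RDD
import spatialspark.operator.SpatialOperator

val wkt = new WKTReader()
val geoms: RDD[(Long, Geometry)] = sc.parallelize(Seq(
  (0L, wkt.read("POINT (1 1)")),
  (1L, wkt.read("POINT (10 10)"))))
val window = wkt.read("POLYGON ((0 0, 5 0, 5 5, 0 5, 0 0))")
// keep only the geometries that lie within the query window
val hits = RangeQuery(sc, geoms, window, SpatialOperator.Within)
hits.collect().foreach(println)   // expected: only id 0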
Example 173
Source File: FixedGridPartition.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.partition.fgp import spatialspark.util.MBR import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD class FixedGridPartition extends Serializable { } object FixedGridPartition { def apply(sc: SparkContext, extent: MBR, gridDimX: Int, gridDimY: Int): Array[MBR] = { val xSize = (extent.xmax - extent.xmin) / gridDimX.toDouble val ySize = (extent.ymax - extent.ymin) / gridDimY.toDouble val results = for (i <- Array.range(0, gridDimX); j <- Array.range(0, gridDimY)) yield new MBR(i * xSize + extent.xmin, j * ySize + extent.ymin, (i + 1) * xSize + extent.xmin, (j + 1) * ySize + extent.ymin) results } def genTileRDD(sc: SparkContext, extent: MBR, gridDimX: Int, gridDimY: Int): RDD[MBR] = { val xSize = (extent.xmax - extent.xmin) / gridDimX.toDouble val ySize = (extent.ymax - extent.ymin) / gridDimY.toDouble val results = for (i <- Array.range(0, gridDimX); j <- Array.range(0, gridDimY)) yield new MBR(i * xSize + extent.xmin, j * ySize + extent.ymin, (i + 1) * xSize + extent.xmin, (j + 1) * ySize + extent.ymin) sc.parallelize(wrapRefArray(results)) } }
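A quick sketch of the two entry points, assuming a SparkContext named sc; MBR is constructed as (xmin, ymin, xmax, ymax), matching the cell construction above, and the extent values are placeholders.

import spatialspark.util.MBR

// overall extent of the data, as (xmin, ymin, xmax, ymax)
val extent = new MBR(0.0, 0.0, 100.0, 100.0)
// 4 x 4 grid materialized on the driver as an Array[MBR]
val cells = FixedGridPartition(sc, extent, 4, 4)
// the same grid, distributed as an RDD[MBR]
val cellRdd = FixedGridPartition.genTileRDD(sc, extent, 4, 4)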
Example 174
Source File: BroadcastSpatialJoin.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.join import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.{ItemBoundable, ItemDistance, STRtree} import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator.SpatialOperator object BroadcastSpatialJoin { def queryRtree(rtree: => Broadcast[STRtree], leftId: Long, geom: Geometry, predicate: SpatialOperator, radius: Double): Array[(Long, Long)] = { val queryEnv = geom.getEnvelopeInternal //queryEnv.expandBy(radius) lazy val candidates = rtree.value.query(queryEnv).toArray //.asInstanceOf[Array[(Long, Geometry)]] if (predicate == SpatialOperator.Within) { candidates.filter { case (id_, geom_) => geom.within(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Contains) { candidates.filter { case (id_, geom_) => geom.contains(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.WithinD) { candidates.filter { case (id_, geom_) => geom.isWithinDistance(geom_.asInstanceOf[Geometry], radius) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Intersects) { candidates.filter { case (id_, geom_) => geom.intersects(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Overlaps) { candidates.filter { case (id_, geom_) => geom.overlaps(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.NearestD) { //if (candidates.isEmpty) // return Array.empty[(Long, Long)] //val nearestItem = candidates.map { // case (id_, geom_) => (id_.asInstanceOf[Long], geom_.asInstanceOf[Geometry].distance(geom)) //}.reduce((a, b) => if (a._2 < b._2) a else b) class dist extends ItemDistance { override def distance(itemBoundable: ItemBoundable, itemBoundable1: ItemBoundable): Double = { val geom = itemBoundable.getItem.asInstanceOf[(Long, Geometry)]._2 val geom1 = itemBoundable1.getItem.asInstanceOf[(Long, Geometry)]._2 geom.distance(geom1) } } val nearestItem = rtree.value.nearestNeighbour(queryEnv, (0l, geom), new dist) .asInstanceOf[(Long, Geometry)] Array((leftId, nearestItem._1)) } else { Array.empty[(Long, Long)] } } def apply(sc: SparkContext, leftGeometryWithId: RDD[(Long, Geometry)], rightGeometryWithId: RDD[(Long, Geometry)], joinPredicate: SpatialOperator, radius: Double = 0): RDD[(Long, Long)] = { // create R-tree on right dataset val strtree = new STRtree() val rightGeometryWithIdLocal = rightGeometryWithId.collect() rightGeometryWithIdLocal.foreach(x => { val y = x._2.getEnvelopeInternal y.expandBy(radius) strtree.insert(y, x) }) val rtreeBroadcast = sc.broadcast(strtree) leftGeometryWithId.flatMap(x => queryRtree(rtreeBroadcast, x._1, x._2, joinPredicate, radius)) } }
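A hedged usage sketch, assuming two existing RDD[(Long, Geometry)] datasets named points and polygons and a SparkContext named sc; the 0.5 radius is an arbitrary example value.

import org.apache.spark.rdd.RDD
import spatialspark.operator.SpatialOperator

// (pointId, polygonId) pairs where the point intersects the polygon;
// the R-tree over the right dataset is built on the driver and broadcast
val matched: RDD[(Long, Long)] =
  BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.Intersects)

// distance join: WithinD additionally needs the radius argument
val near = BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.WithinD, 0.5)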
Example 175
Source File: NetezzaRDD.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza import java.sql.Connection import java.util.Properties import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.{Partition, SparkContext, TaskContext} override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row] { var closed = false var finished = false var gotNext = false var nextValue: Row = null context.addTaskCompletionListener { context => close() } val part = thePart.asInstanceOf[NetezzaPartition] val conn = getConnection() val reader = new NetezzaDataReader(conn, table, columns, filters, part, schema) reader.startExternalTableDataUnload() def getNext(): Row = { if (reader.hasNext) { reader.next() } else { finished = true null.asInstanceOf[Row] } } def close() { if (closed) return try { if (null != reader) { reader.close() } } catch { case e: Exception => logWarning("Exception closing Netezza record reader", e) } try { if (null != conn) { conn.close() } logInfo("closed connection") } catch { case e: Exception => logWarning("Exception closing connection", e) } } override def hasNext: Boolean = { if (!finished) { if (!gotNext) { nextValue = getNext() if (finished) { close() } gotNext = true } } !finished } override def next(): Row = { if (!hasNext) { throw new NoSuchElementException("End of stream") } gotNext = false nextValue } } }
Example 176
Source File: IntegrationSuiteBase.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza.integration import java.sql.Connection import com.ibm.spark.netezza.NetezzaJdbcUtils import com.typesafe.config.ConfigFactory import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory trait IntegrationSuiteBase extends FunSuite with BeforeAndAfterAll with QueryTest{ private val log = LoggerFactory.getLogger(getClass) protected var sc: SparkContext = _ protected var sqlContext: SQLContext = _ protected var conn: Connection = _ protected val prop = new java.util.Properties // Configurable vals protected var configFile = "application" protected var testURL: String = _ protected var testTable: String = _ protected var user: String = _ protected var password: String = _ protected var numPartitions: Int = _ protected var sampleDbmaxNumTables: Int = _ override def beforeAll(): Unit = { super.beforeAll() sc = new SparkContext("local[*]", "IntegrationTest", new SparkConf()) sqlContext = new SQLContext(sc) val conf = ConfigFactory.load(configFile) testURL = conf.getString("test.integration.dbURL") testTable = conf.getString("test.integration.table") user = conf.getString("test.integration.user") password = conf.getString("test.integration.password") numPartitions = conf.getInt("test.integration.partition.number") sampleDbmaxNumTables = conf.getInt("test.integration.max.numtables") prop.setProperty("user", user) prop.setProperty("password", password) log.info("Attempting to get connection from" + testURL) conn = NetezzaJdbcUtils.getConnector(testURL, prop)() log.info("got connection.") } override def afterAll(): Unit = { try { sc.stop() } finally { conn.close() super.afterAll() } } def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => executeJdbcStmt(s"DROP TABLE $name") } } } }
Example 177
Source File: DataSource.scala From pio-template-fpm with Apache License 2.0 | 5 votes |
package org.template.fpm import org.apache.predictionio.controller.PDataSource import org.apache.predictionio.controller.EmptyEvaluationInfo import org.apache.predictionio.controller.EmptyActualResult import org.apache.predictionio.controller.Params import org.apache.predictionio.data.store.PEventStore import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger case class DataSourceParams(appName: String) extends Params class DataSource(val dsp: DataSourceParams) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, EmptyActualResult] { @transient lazy val logger = Logger[this.type] override def readTraining(sc: SparkContext): TrainingData = { println("Gathering data from event server.") val transactionsRDD: RDD[Array[String]] = PEventStore.find( appName = dsp.appName, entityType = Some("transaction"), startTime = None, eventNames = Some(List("$set")))(sc).map { event => try { event.properties.get[Array[String]]("items") } catch { case e: Exception => { logger.error(s"Failed to convert event ${event} of. Exception: ${e}.") throw e } } } new TrainingData(transactionsRDD) } } class TrainingData( val transactions: RDD[Array[String]] ) extends Serializable
Example 178
Source File: FPGAlgorithm.scala From pio-template-fpm with Apache License 2.0 | 5 votes |
package org.template.fpm import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.fpm.{FPGrowth,FPGrowthModel} case class AlgorithmParams( val minSupport: Double, val minConfidence: Double, val numPartitions: Int ) extends Params class FPGModel( val resultList: List[(String,Array[String],Double)] ) extends Serializable {} class FPGAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, FPGModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): FPGModel = { println("Training FPM model.") val fpg = new FPGrowth().setMinSupport(ap.minSupport).setNumPartitions(ap.numPartitions) val model = fpg.run(data.transactions.cache) val res = model.generateAssociationRules(ap.minConfidence).map(x=>(x.antecedent.mkString(" "),x.consequent,x.confidence)).collect.toList new FPGModel(resultList=res) } def predict(model: FPGModel, query: Query): PredictedResult = { val qArr = query.items.toList.sorted.mkString(" ") val result = model.resultList.filter(x=>{x._1==qArr}).sortBy(_._3).map(x=>{new ConsequentItem(x._2,x._3)}) PredictedResult(consequentItems=result.toArray) } }
Example 179
Source File: AbstractExactor.scala From TextRank with Apache License 2.0 | 5 votes |
package AbstractExactor import KeywordExactor.PropertyExtractor import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.{SparkConf, SparkContext} object AbstractExactor { // enclosing object; declaration dropped in the listing, name inferred from the source file def run(graphName: String, vectorSize: Int, sentenceList: Array[(Int, Array[String])], keySentenceNum: Int, iterator: Int, word2vecModel: Word2VecModel, df: Float): List[(String, Float)] = { // build the keyword graph val constructTextGraph = new ConstructSentenceGraph(graphName, vectorSize, sentenceList, word2vecModel) val textGraph = constructTextGraph.constructGraph // output the extracted keywords val keywordExtractor = new PropertyExtractor(textGraph, keySentenceNum) val result = keywordExtractor.textrank(iterator, df).sortBy(_._1) result } def main (args: Array[String]): Unit = { val conf = new SparkConf().setAppName("AbstractExtractor") val sc = new SparkContext(conf) val filePath = args(0) val word2VecModelPath = args(1) // val data = sc.textFile("/Users/li/workshop/MyRepository/TextRank/src/main/resources/2.txt").flatMap(_.split("。")).collect.map(x=> x.split(" ")) val data = sc.textFile(filePath).flatMap(_.split("。")).collect.map(x=> x.split(" ")) val dataIndex = data.zipWithIndex.map(x=>(x._2, x._1)) dataIndex.foreach(x=> println(x._1, x._2.mkString(""))) // val word2VecModelPath = "hdfs://61.147.114.85:9000/home/liyu/word2vec/model2/10_100_5_102017-02-08-word2VectorModel" // val word2VecModelPath = "/Users/li/workshop/DataSet/word2vec/model-10-100-20/2016-08-16-word2VectorModel/" val model = Word2VecModel.load(sc, word2VecModelPath) val da = model.findSynonyms("共产党", 2) da.foreach(x => println(x)) val result = run("jiji", 100, dataIndex, 2, 100, model, 0.9F) println(result) // convert the selected indices back to sentences val index = result.map(x=> x._1) for (elem <- index) { print(dataIndex(elem.toInt)._2.mkString("")) } } }
Example 180
Source File: GpuEnablerExample.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package com.ibm.gpuenabler import org.apache.spark.{SparkContext, SparkConf} import com.ibm.gpuenabler.CUDARDDImplicits._ object GpuEnablerExample { def main(args: Array[String]) = { val masterURL = if (args.length > 0) args(0) else "local[*]" val sparkConf = new SparkConf().setAppName("GpuEnablerExample1").setMaster(masterURL) val sc = new SparkContext(sparkConf) val ptxURL = getClass.getResource("/GpuEnablerExamples.ptx") val mapFunction = new CUDAFunction( "multiplyBy2", Array("this"), Array("this"), ptxURL) val dimensions = (size: Long, stage: Int) => stage match { case 0 => (64, 256) case 1 => (1, 1) } val reduceFunction = new CUDAFunction( "sum", Array("this"), Array("this"), ptxURL, Seq(), Some((size: Long) => 2), Some(dimensions)) val n = 10 val output = sc.parallelize(1 to n, 1) .mapExtFunc((x: Int) => 2 * x, mapFunction) .reduceExtFunc((x: Int, y: Int) => x + y, reduceFunction) println("Sum of the list is " + output) } }
Example 181
Source File: GpuEnablerCodegen.scala From GPUEnabler with Apache License 2.0 | 5 votes |
// bin/spark-submit --jars gpu-enabler/target/gpu-enabler_2.10-1.0.0.jar // --class com.ibm.gpuenabler.GpuEnablerCodegen examples/target/gpu-enabler-examples_2.10-1.0.0.jar package com.ibm.gpuenabler import org.apache.spark.{SparkContext, SparkConf} import com.ibm.gpuenabler.CUDARDDImplicits._ object GpuEnablerCodegen { def main(args: Array[String]) = { val masterURL = if (args.length > 0) args(0) else "local[*]" val sparkConf = new SparkConf().setAppName("GpuEnablerCodegen").setMaster(masterURL) sparkConf.set("spark.gpu.codegen", "true") val sc = new SparkContext(sparkConf) val n = 10 val intOut = sc.parallelize(1 to n, 1) .mapGpu((x: Int) => 2 * x) .reduceGpu((x: Int, y: Int) => x + y) println("Int sum of the list is " + intOut) val doubleOut = sc.parallelize(1 to n, 1).map(x => x.toDouble) .mapGpu((x: Double) => 2.5D * x) .reduceGpu((x: Double, y: Double) => x + y) println("Double sum of the list is " + doubleOut) } }
Example 182
Source File: Main.scala From stellar-random-walk with Apache License 2.0 | 5 votes |
package au.csiro.data61.randomwalk import au.csiro.data61.randomwalk.algorithm.{UniformRandomWalk, VCutRandomWalk} import au.csiro.data61.randomwalk.common.CommandParser.TaskName import au.csiro.data61.randomwalk.common.{CommandParser, Params, Property} import com.typesafe.config.Config import org.apache.log4j.LogManager import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.scalactic.{Every, Good, Or} import spark.jobserver.SparkJobInvalid import spark.jobserver.api._ object Main extends SparkJob { lazy val logger = LogManager.getLogger("myLogger") def main(args: Array[String]) { CommandParser.parse(args) match { case Some(params) => val conf = new SparkConf().setAppName("stellar-random-walk") val context: SparkContext = new SparkContext(conf) runJob(context, null, params) case None => sys.exit(1) } } override def validate(sc: SparkContext, runtime: JobEnvironment, config: Config): JobData Or Every[SparkJobInvalid] = { val args = config.getString("rw.input").split("\\s+") CommandParser.parse(args) match { case Some(params) => Good(params) } } }
Example 183
Source File: StorageHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import java.io.File import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.{SparkContext, SparkFiles} import org.apache.spark.sql.SparkSession object StorageHelper { def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString def load( storageSourcePath: String, spark: SparkSession, database: String, storageRef: String, withinStorage: Boolean ): RocksDBConnection = { val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef) val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage) val locator = StorageLocator(database, storageRef, spark) sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext) RocksDBConnection.getOrCreate(locator.clusterFileName) } def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = { val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath) val index = new Path(indexUri) val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) val dst = new Path(path+{if (withinStorage) "/storage/" else ""}) save(fs, index, dst) } private def save(fs: FileSystem, index: Path, dst: Path): Unit = { if (!fs.exists(dst)) fs.mkdirs(dst) fs.copyFromLocalFile(false, true, index, dst) } def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = { if (destinationScheme == "file") { copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext) } else { copyIndexToCluster(source, clusterFilePath, sparkContext) } } private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = { if (!new File(SparkFiles.get(dst.getName)).exists()) { val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration) val dstFS = dst.getFileSystem(spark.hadoopConfiguration) if (srcFS.getScheme == "file") { val src = sourcePath dstFS.copyFromLocalFile(false, true, src, dst) } else { FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration) } spark.addFile(dst.toString, recursive = true) } dst.toString } private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = { val fs = source.getFileSystem(context.hadoopConfiguration) if (!fs.exists(destination)) fs.copyFromLocalFile(false, true, source, destination) } }
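A hedged sketch of wiring the helper into an annotator-style call, assuming an active SparkSession named spark; the path, database name, and storage reference below are placeholders, not values mandated by the library.

// open (or reuse) a RocksDB connection for a previously serialized index
val connection = StorageHelper.load(
  storageSourcePath = "/tmp/glove_100d_idx",
  spark = spark,
  database = "EMBEDDINGS",
  storageRef = "glove_100d",
  withinStorage = false)

// later, persist the local index next to a model that is being saved
StorageHelper.save("/tmp/my_model", connection, spark, withinStorage = true)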
Example 184
Source File: Quickstart.scala From delta with Apache License 2.0 | 5 votes |
package example import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SparkSession, SQLContext} import io.delta.tables._ import org.apache.spark.sql.functions._ import org.apache.commons.io.FileUtils import java.io.File object Quickstart { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("Quickstart") .master("local[*]") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .getOrCreate() val file = new File("/tmp/delta-table") if (file.exists()) FileUtils.deleteDirectory(file) // Create a table println("Creating a table") val path = file.getCanonicalPath var data = spark.range(0, 5) data.write.format("delta").save(path) // Read table println("Reading the table") val df = spark.read.format("delta").load(path) df.show() // Upsert (merge) new data println("Upsert new data") val newData = spark.range(0, 20).toDF val deltaTable = DeltaTable.forPath(path) deltaTable.as("oldData") .merge( newData.as("newData"), "oldData.id = newData.id") .whenMatched .update(Map("id" -> col("newData.id"))) .whenNotMatched .insert(Map("id" -> col("newData.id"))) .execute() deltaTable.toDF.show() // Update table data println("Overwrite the table") data = spark.range(5, 10) data.write.format("delta").mode("overwrite").save(path) deltaTable.toDF.show() // Update every even value by adding 100 to it println("Update to the table (add 100 to every even value)") deltaTable.update( condition = expr("id % 2 == 0"), set = Map("id" -> expr("id + 100"))) deltaTable.toDF.show() // Delete every even value deltaTable.delete(condition = expr("id % 2 == 0")) deltaTable.toDF.show() // Read old version of the data using time travel print("Read old data using time travel") val df2 = spark.read.format("delta").option("versionAsOf", 0).load(path) df2.show() // Cleanup FileUtils.deleteDirectory(file) spark.stop() } }
Example 185
Source File: DeltaSink.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.SetTransaction import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.NullType class DeltaSink( sqlContext: SQLContext, path: Path, partitionColumns: Seq[String], outputMode: OutputMode, options: DeltaOptions) extends Sink with ImplicitMetadataOperation with DeltaLogging { private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path) private val sqlConf = sqlContext.sparkSession.sessionState.conf override protected val canOverwriteSchema: Boolean = outputMode == OutputMode.Complete() && options.canOverwriteSchema override protected val canMergeSchema: Boolean = options.canMergeSchema override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn => val sc = data.sparkSession.sparkContext val metrics = Map[String, SQLMetric]( "numAddedFiles" -> createMetric(sc, "number of files added"), "numRemovedFiles" -> createMetric(sc, "number of files removed") ) val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY) assert(queryId != null) if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) { throw DeltaErrors.streamWriteNullTypeException } // If the batch reads the same Delta table as this sink is going to write to, then this // write has dependencies. Then make sure that this commit set hasDependencies to true // by injecting a read on the whole table. This needs to be done explicitly because // MicroBatchExecution has already enforced all the data skipping (by forcing the generation // of the executed plan) even before the transaction was started. val selfScan = data.queryExecution.analyzed.collectFirst { case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true }.nonEmpty if (selfScan) { txn.readWholeTable() } // Streaming sinks can't blindly overwrite schema. 
See Schema Management design doc for details updateMetadata( txn, data, partitionColumns, configuration = Map.empty, outputMode == OutputMode.Complete()) val currentVersion = txn.txnVersion(queryId) if (currentVersion >= batchId) { logInfo(s"Skipping already complete epoch $batchId, in query $queryId") return } val deletedFiles = outputMode match { case o if o == OutputMode.Complete() => deltaLog.assertRemovable() txn.filterFiles().map(_.remove) case _ => Nil } val newFiles = txn.writeFiles(data, Some(options)) val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata) metrics("numRemovedFiles").set(deletedFiles.size) metrics("numAddedFiles").set(newFiles.size) txn.registerSQLMetrics(sqlContext.sparkSession, metrics) txn.commit(setTxn ++ newFiles ++ deletedFiles, info) // This is needed to make the SQL metrics visible in the Spark UI val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates( sqlContext.sparkContext, executionId, metrics.values.toSeq) } override def toString(): String = s"DeltaSink[$path]" }
Example 186
Source File: HadoopFileSystemLogStore.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.storage import java.io.{BufferedReader, FileNotFoundException, InputStreamReader} import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.FileAlreadyExistsException import java.util.UUID import scala.collection.JavaConverters._ import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession protected def writeWithRename( path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = { val fs = path.getFileSystem(getHadoopConfiguration) if (!fs.exists(path.getParent)) { throw new FileNotFoundException(s"No such file or directory: ${path.getParent}") } if (overwrite) { val stream = fs.create(path, true) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) } finally { stream.close() } } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } val tempPath = createTempPath(path) var streamClosed = false // This flag is to avoid double close var renameDone = false // This flag is to save the delete operation in most of cases. val stream = fs.create(tempPath) try { actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write) stream.close() streamClosed = true try { if (fs.rename(tempPath, path)) { renameDone = true } else { if (fs.exists(path)) { throw new FileAlreadyExistsException(path.toString) } else { throw new IllegalStateException(s"Cannot rename $tempPath to $path") } } } catch { case _: org.apache.hadoop.fs.FileAlreadyExistsException => throw new FileAlreadyExistsException(path.toString) } } finally { if (!streamClosed) { stream.close() } if (!renameDone) { fs.delete(tempPath, false) } } } } protected def createTempPath(path: Path): Path = { new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp") } override def invalidateCache(): Unit = {} }
Example 187
Source File: DeltaHiveTest.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.test import org.apache.spark.sql.delta.catalog.DeltaCatalog import io.delta.sql.DeltaSparkSessionExtension import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkContext, SparkFunSuite} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.SQLTestUtils trait DeltaHiveTest extends SparkFunSuite with BeforeAndAfterAll { self: SQLTestUtils => private var _session: SparkSession = _ private var _hiveContext: TestHiveContext = _ private var _sc: SparkContext = _ override def beforeAll(): Unit = { val conf = TestHive.sparkSession.sparkContext.getConf.clone() TestHive.sparkSession.stop() conf.set(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[DeltaCatalog].getName) conf.set(StaticSQLConf.SPARK_SESSION_EXTENSIONS.key, classOf[DeltaSparkSessionExtension].getName) _sc = new SparkContext("local", this.getClass.getName, conf) _hiveContext = new TestHiveContext(_sc) _session = _hiveContext.sparkSession SparkSession.setActiveSession(_session) super.beforeAll() } override protected def spark: SparkSession = _session override def afterAll(): Unit = { try { _hiveContext.reset() } finally { _sc.stop() } } }
Example 188
Source File: SPLScalaReflection.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sparklinedata import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection object SPLScalaReflection { import ScalaReflection.universe import ScalaReflection.mirror def changeSessionStateClass : Unit = { val spkSessionCSymbol = mirror.classSymbol(classOf[SparkSession]) val spkSessionModSymbol = spkSessionCSymbol.companion.asModule val spkSessionModClassMirror = mirror.reflectModule(spkSessionModSymbol) val spkSessionModule = spkSessionModClassMirror.instance val spkSessionModuleMirror = mirror.reflect(spkSessionModule) val spkSessionModuleTyp = spkSessionModuleMirror.symbol.selfType val termSessionState = spkSessionModuleTyp.decl( universe.TermName("HIVE_SESSION_STATE_CLASS_NAME")).asTerm.accessed.asTerm val sessionStateField = spkSessionModuleMirror.reflectField(termSessionState) sessionStateField.set("org.apache.spark.sql.hive.sparklinedata.SPLSessionState") } // def main(args : Array[String]) : Unit = { // changeSessionStateClass // // println(new SparkSession(new SparkContext()).sharedState.getClass) // } }
Example 189
Source File: DruidQueriesTab.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.sparklinedata.ui import org.apache.spark.sql.hive.thriftserver.sparklinedata.ui.DruidQueriesTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.sql.SPLLogging private[thriftserver] class DruidQueriesTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "druid") with SPLLogging { override val name = "Druid Query Details" val parent = getSparkUI(sparkContext) attachPage(new DruidQueriesPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[spark] object DruidQueriesTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 190
Source File: ScalingVariable.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import scala.reflect.ClassTag /** * It scales the value of the raw data according to different methodologies * @tparam T raw data type * @tparam U working data type */ trait ScalingVariable[T, U] extends Serializable { /** * It scales the data passed as argument * @param data The value to be scaled * @return The scaled data */ def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] } /** * It scales the values according to the Basic Scaling of Blaser et al. "Random rotation ensembles". * Numeric values are scaled to [0, 1] using the min and max values. * @param sc The Spark Context * @param typeInfo The type information about the raw data * @param featureNumber The number of feature in the dataset * @param input The raw dataset * @tparam T raw data type * @tparam U working data type */ class ScalingBasic[T : ClassTag, U : ClassTag](@transient private val sc: SparkContext, typeInfo: Broadcast[TypeInfo[T]], featureNumber: Int, input: RDD[RawDataLabeled[T, U]]) extends ScalingVariable[T, U] { private val scaling: Broadcast[scala.collection.Map[Int, (T, T)]] = sc.broadcast(init()) private def scaleValue(index: Int, value: T): T = { val (min, max) = scaling.value(index) val doubleValue = typeInfo.value.toDouble(value) typeInfo.value.fromDouble(Math.min(1, Math.max(0, (doubleValue - typeInfo.value.toDouble(min)) / (typeInfo.value.toDouble(max) - typeInfo.value.toDouble(min))))) } override def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] = { val densed = data.features.toDense val values = new Array[T](densed.size) var count = 0 while (count < values.length) { values(count) = scaleValue(count, densed(count)) count += 1 } RawDataLabeled(data.label, new RawDataDense(values, densed.nan)) } private def init(): scala.collection.Map[Int, (T, T)] = { input.mapPartitions(it => { val min = Array.fill(featureNumber)(typeInfo.value.maxValue) val max = Array.fill(featureNumber)(typeInfo.value.minValue) def setMinMax(index: Int, value: T): Unit = { if (typeInfo.value.isMinOrEqual(value, min(index))) { min(index) = value } if (typeInfo.value.isMinOrEqual(max(index), value)) { max(index) = value } } it.foreach(t => { t.features.foreachActive(setMinMax) }) min.zip(max).zipWithIndex.map(_.swap).toIterator }).reduceByKey((a, b) => (typeInfo.value.min(a._1, b._1), typeInfo.value.max(a._2, b._2))).collectAsMap() } }
Example 191
Source File: DataLoad.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import reforest.data.RawDataLabeled /** * An utility to load data from different file formats in raw data labeled * @tparam T raw data type * @tparam U working data type */ trait DataLoad[T, U] extends Serializable { /** * Load the data from a file * @param sc the Spark Context * @param path the file path * @param numFeatures the number of features in the dataset * @param minPartitions the minimum number of partition of the RDD * @return the loaded dataset in RawDataLabeled format */ def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] }
Example 192
Source File: ARFFUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawData, RawDataLabeled} import reforest.rf.RFCategoryInfo import reforest.util.GCInstrumented import scala.reflect.ClassTag /** * Load data in ARFF format * * @param typeInfo the type information of the raw data * @param instrumented the instrumentation of the GC * @param categoryInfo the information for the categorical features * @tparam T raw data type * @tparam U working data type */ class ARFFUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] { override def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] = { val parsed = parseARFFFile(sc, path, minPartitions) instrumented.value.gcALL parsed.map { case (label, values) => RawDataLabeled(label, RawData.dense[T, U](values, typeInfo.value.NaN)) } } private def parseARFFFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[T])] = { sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#") || line.startsWith("%") || line.startsWith("@"))) .mapPartitions(it => { val toReturn = it.map(u => parseARFFRecord(u)) instrumented.value.gc() toReturn }) } private[load] def parseARFFRecord(line: String): (Double, Array[T]) = { val items = line.split(',') val label = Math.max(items.last.toDouble, 0) val values = items.dropRight(1).filter(_.nonEmpty).map({ try { typeInfo.value.fromString } catch { case e : NumberFormatException => { println("Malformed input. Details: \n"+e.getMessage) System.exit(1) null } case e : Exception => { e.printStackTrace() System.exit(1) null } } }) (label, values) } }
Example 193
Source File: LibSVMUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawData, RawDataLabeled} import reforest.rf.RFCategoryInfo import reforest.util.GCInstrumented import scala.reflect.ClassTag /** * Forked from Apache Spark MLlib * Load data in LibSVM format * * @param typeInfo the type information of the raw data * @param instrumented the instrumentation of the GC * @param categoryInfo the information for the categorical features * @tparam T raw data type * @tparam U working data type */ class LibSVMUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] { override def loadFile(sc: SparkContext, path: String, numFeatures: Int, minPartitions: Int): RDD[RawDataLabeled[T, U]] = { val parsed = parseLibSVMFile(sc, path, minPartitions) instrumented.value.gcALL parsed.map { case (label, indices, values) => RawDataLabeled(label, RawData.sparse[T, U](numFeatures, indices, values, typeInfo.value.NaN).compressed) } } private def parseLibSVMFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[Int], Array[T])] = { sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#"))) .mapPartitions(it => { val toReturn = it.map(u => parseLibSVMRecord(u)) instrumented.value.gc() toReturn }) } private[load] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[T]) = { val items = line.split(' ') val label = Math.max(items.head.toDouble, 0) val (indices, values) = items.tail.filter(_.nonEmpty).flatMap { item => try { val indexAndValue = item.split(':') val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based val value = typeInfo.value.fromString(indexAndValue(1)) if (categoryInfo.value.isCategorical(index)) { Some((index, typeInfo.value.fromInt(categoryInfo.value.rawRemapping(typeInfo.value.toInt(value))))) } else { Some((index, value)) } } catch { case e : NumberFormatException => { println("Malformed input. Details: \n"+e.getMessage) System.exit(1) None } case e : Exception => { e.printStackTrace() System.exit(1) None } } }.unzip // check if indices are one-based and in ascending order var previous = -1 var i = 0 val indicesLength = indices.length while (i < indicesLength) { val current = indices(i) require(current > previous, s"indices should be one-based and in ascending order;" + " found current=$current, previous=$previous; line=\"$line\"") previous = current i += 1 } (label, indices, values) } }
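For intuition, parseLibSVMRecord maps one LibSVM-formatted line to a (label, indices, values) triple; the worked trace below uses an invented input line and assumes a Double TypeInfo with no categorical features.

// input line: "1 3:4.5 7:2.0"
//   label   = 1.0              (negative labels are clamped to 0 by Math.max)
//   indices = Array(2, 6)      (1-based column indices converted to 0-based)
//   values  = Array(4.5, 2.0)  (categorical features would additionally be remapped via rawRemapping)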
Example 194
Source File: SLCTreeGeneration.scala From reforest with Apache License 2.0 | 5 votes |
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package reforest.rf.slc import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data._ import reforest.data.tree.ForestManager import reforest.rf.feature.RFFeatureManager import reforest.rf.parameter.RFParameter import reforest.rf.{RFSkip, RFStrategy, RFTreeGeneration} import reforest.util._ class SLCTreeGeneration[T, U](@transient private val sc: SparkContext, property: Broadcast[RFParameter], typeInfo: Broadcast[TypeInfo[T]], typeInfoWorking: Broadcast[TypeInfo[U]], sampleSize: Long) extends Serializable { var fcsExecutor : Option[SLCExecutor[T, U]] = Option.empty def findBestCutSLC(dataIndex: RDD[StaticData[U]], forestManager: ForestManager[T, U], featureManager: RFFeatureManager, depthToStop : Int, instrumented: Broadcast[GCInstrumented], skip : RFSkip): ForestManager[T, U] = { if (featureManager.getActiveNodesNum <= 0) { forestManager } else { var toReturn = forestManager val splitterManagerBC = sc.broadcast(forestManager.splitterManager) if(fcsExecutor.isEmpty) { fcsExecutor = Some(SLCExecutor.build(sc, typeInfo, typeInfoWorking, property, splitterManagerBC, sampleSize)) } toReturn = fcsExecutor.get.executeSLC(toReturn, featureManager, dataIndex, depthToStop, skip) splitterManagerBC.unpersist() toReturn } } }
Example 195
Source File: CCUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.util import org.apache.commons.io.FilenameUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.{SparkConf, SparkContext} import reforest.TypeInfo import reforest.data.load.{ARFFUtil, DataLoad, LibSVMUtil} import reforest.rf.RFCategoryInfo import reforest.rf.parameter.RFParameter import scala.reflect.ClassTag object CCUtil { // enclosing object; declaration dropped in the listing, name inferred from the source file def getDataLoader[T:ClassTag, U:ClassTag](property : RFParameter, typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]): DataLoad[T, U] = { val extension = FilenameUtils.getExtension(property.dataset).toUpperCase() property.fileType match { case "LIBSVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "SVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "ARFF" => new ARFFUtil(typeInfo, instrumented, categoryInfo) case _ => new LibSVMUtil(typeInfo, instrumented, categoryInfo) } } }
Example 196
Source File: ReForeStLoader.scala From reforest with Apache License 2.0 | 5 votes |
package reforest

import org.apache.commons.math3.distribution.PoissonDistribution
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.data.{RawDataLabeled, RawDataset, StaticData}
import reforest.data.tree.ForestManager
import reforest.rf.parameter.RFParameter
import reforest.rf.split.RFSplitterManager
import reforest.rf.{RFCategoryInfo, RFDataPrepare, RFStrategy}
import reforest.util.{GCInstrumented, GCInstrumentedEmpty, MemoryUtil}

class ReForeStLoader[T, U](@transient private val sc: SparkContext,
                           parameter: Broadcast[RFParameter],
                           strategyBC: Broadcast[RFStrategy[T, U]],
                           val typeInfoBC: Broadcast[TypeInfo[T]],
                           val typeInfoWorkingBC: Broadcast[TypeInfo[U]],
                           val categoricalFeaturesInfoBC: Broadcast[RFCategoryInfo],
                           rawDataset: RawDataset[T, U]) extends Serializable {

  val instrumented: Broadcast[GCInstrumented] = sc.broadcast(new GCInstrumentedEmpty)
  val dataPrepare = new RFDataPrepare[T, U](typeInfoBC, instrumented, strategyBC, false, 1)

  private var memoryUtil: Option[MemoryUtil] = Option.empty
  private var forestManager: Option[ForestManager[T, U]] = Option.empty
  private var workingData: Option[RDD[StaticData[U]]] = Option.empty
  private var previousWorkingData: Option[RDD[StaticData[U]]] = Option.empty
  private var splitterManager: Option[RFSplitterManager[T, U]] = Option.empty

  def testdatafreeze(): Unit = {
    rawDataset.testingData.persist(parameter.value.storageLevel)
  }

  def trainingdatafreeze(): Unit = {
    // rawDataset.trainingData.persist(property.storageLevel)
    rawDataset.trainingData.count()
  }

  def getRawDataset = rawDataset

  def getTestingData: RDD[RawDataLabeled[T, U]] = rawDataset.testingData

  def getMemoryUtil = memoryUtil

  def getForestManager = forestManager

  def getWorkingData(numTrees: Int = parameter.value.getMaxNumTrees,
                     macroIteration: Int = 0,
                     skipPreparation: Boolean = false) = {
    val timePreparationSTART = System.currentTimeMillis()

    if (skipPreparation) {
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), splitterManager.get))
      previousWorkingData = workingData

      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(forestManager.get.splitterManager.getSplitter(macroIteration)),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

      // workingData = Some(workingData.get.mapPartitionsWithIndex{case (partitionIndex, elements) =>
      //   strategyBC.value.reGenerateBagging(numTrees, partitionIndex, elements)})

      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()
      if (previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }

      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION SKIPPED INIT (" + dataSize + "): " + (timePreparationEND - timePreparationSTART))
      workingData.get
    } else {
      previousWorkingData = workingData

      val zzz = strategyBC.value.findSplits(rawDataset.trainingData, typeInfoBC, typeInfoWorkingBC, instrumented, categoricalFeaturesInfoBC)
      splitterManager = Some(zzz._1)
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), zzz._1))
      memoryUtil = Some(zzz._2)
      val splitter = forestManager.get.splitterManager.getSplitter(macroIteration)

      // TODO the broadcast of the splitter must be unpersisted!!!
      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(splitter),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()
      if (previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }

      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION: " + (timePreparationEND - timePreparationSTART))
      workingData.get
    }
  }
}
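A rough usage sketch for the loader. The broadcast values, the RawDataset, and the concrete type parameters (Double features, Byte working representation) are assumptions for illustration, not taken from the project:

// Hypothetical wiring; all *BC broadcasts and rawDataset are assumed to be prepared elsewhere.
val loader = new ReForeStLoader[Double, Byte](sc, parameterBC, strategyBC,
  typeInfoBC, typeInfoWorkingBC, categoryInfoBC, rawDataset)

loader.trainingdatafreeze()                                    // materializes the training RDD
val prepared = loader.getWorkingData()                         // full pass: computes splits and caches the working data
val reused   = loader.getWorkingData(skipPreparation = true)   // reuses the splitter from the first pass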
Example 197
Source File: CVLogPerplexity.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA

import breeze.linalg.sum
import org.apache.spark.{SparkConf, SparkContext}
import edu.uci.eecs.spectralLDA.algorithm._
import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.linalg._

object CVLogPerplexity {
  def main(args: Array[String]) = {
    val conf: SparkConf = new SparkConf().setAppName(s"Spectral LDA")
    val sc: SparkContext = new SparkContext(conf)

    val cv = args(0).toInt
    val documentsPath = args(1)
    val k = args(2).toInt
    val alpha0 = args(3).toDouble
    val maxIterations = args(4).toInt
    val tol = args(5).toDouble
    val minWords = args(6).toInt

    val docs = sc.objectFile[(Long, breeze.linalg.SparseVector[Double])](documentsPath)
      .filter { case (_, tc) => sum(tc) >= minWords }

    for (i <- 0 until cv) {
      // Hold out 10% of the documents in each cross-validation round.
      val splits = docs.randomSplit(Array[Double](0.9, 0.1))
      computeLogLikelihood(splits, k, alpha0, maxIterations, tol)
    }

    sc.stop()
  }

  def computeLogLikelihood(splits: Array[RDD[(Long, breeze.linalg.SparseVector[Double])]],
                           k: Int,
                           alpha0: Double,
                           maxIterations: Int,
                           tol: Double): Unit = {
    val numTestTokens = splits(1)
      .map { case (_, tc) => breeze.linalg.sum(tc) }
      .reduce(_ + _)

    val tensorLDA = new TensorLDA(
      dimK = k,
      alpha0 = alpha0,
      maxIterations = maxIterations,
      tol = tol
    )
    val (beta, alpha, _, _, m1) = tensorLDA.fit(splits(0))

    // Append a background "dummy" topic built from the corpus first moment m1
    // before evaluating the held-out likelihood.
    val augBeta = breeze.linalg.DenseMatrix.zeros[Double](beta.rows, k + 1)
    val augAlpha = breeze.linalg.DenseVector.ones[Double](alpha.length + 1)
    augBeta(::, 0 until k) := beta
    val dummyTopic = m1 + 0.1 * breeze.linalg.DenseVector.ones[Double](beta.rows) / beta.rows.toDouble
    augBeta(::, k) := dummyTopic / sum(dummyTopic)
    augAlpha(0 until k) := alpha

    val tensorLDAModel = new TensorLDAModel(augBeta, augAlpha)
    val tensorLDALogL = tensorLDAModel.logLikelihood(splits(1), smoothing = 1e-6, maxIterations = 50)
    println(s"Tensor LDA log-perplexity no extra smoothing: ${-tensorLDALogL / numTestTokens}")

    // Convert the Breeze sparse vectors into MLlib vectors for the variational LDA baseline.
    val trainMapped: RDD[(Long, Vector)] = splits(0).map {
      case (id, tc) =>
        val (idx, v) = tc.activeIterator.toArray.unzip
        (id, new SparseVector(tc.length, idx, v))
    }
    val testMapped: RDD[(Long, Vector)] = splits(1).map {
      case (id, tc) =>
        val (idx, v) = tc.activeIterator.toArray.unzip
        (id, new SparseVector(tc.length, idx, v))
    }

    val ldaOptimizer = new OnlineLDAOptimizer()
      .setMiniBatchFraction(0.05)
    val lda = new LDA()
      .setOptimizer(ldaOptimizer)
      .setMaxIterations(80)
      .setK(k)
      .setDocConcentration(alpha0 / k.toDouble)
      .setBeta(1.0)

    val ldaModel: LDAModel = lda.run(trainMapped)
    val ldaLogL = ldaModel.asInstanceOf[LocalLDAModel].logLikelihood(testMapped)
    println(s"Variational Inference log-perplexity: ${-ldaLogL / numTestTokens}")
  }
}
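To make the seven positional arguments explicit, a sketch of a local invocation follows; the argument values are placeholders, and in practice the job is launched through spark-submit, which supplies the master URL (the SparkConf above does not set one):

// Hypothetical invocation; values are placeholders, not recommended settings.
CVLogPerplexity.main(Array(
  "5",                          // cv: number of 90/10 cross-validation rounds
  "hdfs:///data/word-counts",   // documentsPath: RDD[(Long, SparseVector[Double])] saved with saveAsObjectFile
  "20",                         // k: number of topics
  "5.0",                        // alpha0: sum of the Dirichlet prior
  "200",                        // maxIterations for the tensor decomposition
  "1e-6",                       // tol: convergence tolerance
  "10"                          // minWords: drop documents with fewer tokens than this
))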
Example 198
Source File: SimpleTokenizer.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.textprocessing

import java.text.BreakIterator

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable

class SimpleTokenizer(sc: SparkContext, stopwordFile: String) extends Serializable {

  private val stopwords: Set[String] = if (stopwordFile.isEmpty) {
    Set.empty[String]
  } else {
    val stopwordText = sc.textFile(stopwordFile).collect()
    stopwordText.flatMap(_.stripMargin.split("\\s+")).toSet
  }

  // Matches sequences of Unicode letters
  private val allWordRegex = "^(\\p{L}*)$".r

  // Ignore words shorter than this length.
  private val minWordLength = 3

  def getWords(text: String): IndexedSeq[String] = {
    val words = new mutable.ArrayBuffer[String]()

    // Use Java BreakIterator to tokenize text into words.
    val wb = BreakIterator.getWordInstance
    wb.setText(text)

    // current,end index start,end of each word
    var current = wb.first()
    var end = wb.next()
    while (end != BreakIterator.DONE) {
      // Convert to lowercase
      val word: String = text.substring(current, end).toLowerCase
      // Remove short words and strings that aren't only letters
      word match {
        case allWordRegex(w) if w.length >= minWordLength && !stopwords.contains(w) =>
          words += w
        case _ =>
      }

      current = end
      try {
        end = wb.next()
      } catch {
        case e: Exception =>
          // Ignore remaining text in line.
          // This is a known bug in BreakIterator (for some Java versions),
          // which fails when it sees certain characters.
          end = BreakIterator.DONE
      }
    }
    words
  }
}
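A minimal sketch of how the tokenizer might be wired into a corpus-loading step; the input path is a placeholder and the empty stopword file simply disables stopword filtering:

// Hypothetical usage: turn raw text lines into (docId, tokens) pairs.
val tokenizer = new SimpleTokenizer(sc, stopwordFile = "")   // "" means: no stopword list
val corpus: RDD[(Long, IndexedSeq[String])] =
  sc.textFile("docs.txt")
    .zipWithIndex()
    .map { case (text, docId) => (docId, tokenizer.getWords(text)) }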
Example 199
Source File: RandNLATest.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.utils

import breeze.linalg._
import breeze.linalg.qr.QR
import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Uniform}
import edu.uci.eecs.spectralLDA.testharness.Context
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.SparkContext
import org.scalatest._

class RandNLATest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "M2 sketching" should "be correct" in {
    val a1 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a2 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a3 = SparseVector(DenseVector.rand[Double](100).toArray)
    val docs = Seq((1000L, a1), (1001L, a2), (1002L, a3))
    val docsRDD = sc.parallelize(docs)

    // Random Gaussian matrix
    val g = DenseMatrix.rand[Double](100, 50, Gaussian(mu = 0.0, sigma = 1.0))

    val result = DenseMatrix.zeros[Double](100, 50)
    docsRDD
      .flatMap {
        case (id: Long, w: SparseVector[Double]) =>
          RandNLA.accumulate_M_mul_S(g, w, sum(w))
      }
      .reduceByKey(_ + _)
      .collect
      .foreach {
        case (r: Int, a: DenseVector[Double]) =>
          result(r, ::) := a.t
      }

    val m2 = docsRDD
      .map {
        case (id: Long, w: SparseVector[Double]) =>
          val l = sum(w)
          (w * w.t - diag(w)) / (l * (l - 1.0))
      }
      .reduce(_ + _)

    val expectedResult = m2 * g

    val diff: DenseMatrix[Double] = result - expectedResult
    val normDiff: Double = norm(norm(diff(::, *)).toDenseVector)
    normDiff should be <= 1e-8
  }

  "Randomised Power Iteration method" should "be approximately correct" in {
    implicit val randBasis: RandBasis =
      new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(234787)))

    val n = 100
    val k = 5

    val alpha: DenseVector[Double] = DenseVector[Double](25.0, 20.0, 15.0, 10.0, 5.0)
    val beta: DenseMatrix[Double] = DenseMatrix.rand(n, k, Uniform(0.0, 1.0))
    val norms = norm(beta(::, *)).toDenseVector
    for (j <- 0 until k) {
      beta(::, j) /= norms(j)
    }

    val a: DenseMatrix[Double] = beta * diag(alpha) * beta.t

    val sigma: DenseMatrix[Double] = DenseMatrix.rand(n, k, Gaussian(mu = 0.0, sigma = 1.0))
    val y = a * sigma
    val QR(q: DenseMatrix[Double], _) = qr.reduced(y)

    val (s: DenseVector[Double], u: DenseMatrix[Double]) = RandNLA.decomp2(a * q, q)

    val diff_a = u * diag(s) * u.t - a
    val norm_diff_a = norm(norm(diff_a(::, *)).toDenseVector)
    norm_diff_a should be <= 1e-8
  }
}
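For reference, the second test follows the usual randomized range-finder argument (a sketch of the reasoning, not code from the project): with A symmetric of rank k and Q an orthonormal basis of Y = A * Sigma,

    A ≈ Q Q^T A Q Q^T,    Q^T A Q = V diag(s) V^T,    U = Q V  =>  A ≈ U diag(s) U^T

so an eigendecomposition of the small k x k matrix Q^T A Q reproduces A up to numerical error, which is exactly what the final norm check asserts.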
Example 200
Source File: TensorLDAModelTest.scala From spectrallda-tensorspark with Apache License 2.0 | 5 votes |
package edu.uci.eecs.spectralLDA.algorithm

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector, norm}
import breeze.numerics.abs
import org.scalatest._

import org.apache.spark.SparkContext
import edu.uci.eecs.spectralLDA.testharness.Context

class TensorLDAModelTest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "Multinomial log-likelihood" should "be correct" in {
    val p = DenseVector[Double](0.2, 0.5, 0.3)
    val x1 = DenseVector[Double](20, 50, 30)
    val x2 = DenseVector[Double](40, 40, 20)

    abs(TensorLDAModel.multinomialLogLikelihood(p, x1) - (-4.697546)) should be <= 1e-6
    abs(TensorLDAModel.multinomialLogLikelihood(p, x2) - (-15.42038)) should be <= 1e-6
  }
}
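The expected constants can be reproduced by hand from the multinomial log-likelihood with the combinatorial term included (a worked check, not project code):

    log L(x; p) = log( n! / (x_1! * ... * x_k!) ) + sum_i x_i * log(p_i),    n = sum_i x_i

For p = (0.2, 0.5, 0.3) and x1 = (20, 50, 30): sum_i x_i * log(p_i) ≈ -102.9653 and the log multinomial coefficient is ≈ 98.2678, so log L ≈ -4.6975, matching the expected -4.697546; the same computation for x2 = (40, 40, 20) gives ≈ -15.42038.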