org.apache.spark.rdd.RDD Scala Examples
The following examples show how to use org.apache.spark.rdd.RDD.
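Before the project-specific examples, here is a minimal, self-contained sketch of basic RDD usage. The local master, app name and sample data are illustrative assumptions, not taken from any example below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object RddBasics {
  def main(args: Array[String]): Unit = {
    // local[*] and the app name are placeholders for a quick local run
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-basics"))

    val numbers: RDD[Int] = sc.parallelize(1 to 10)

    // transformations (map) are lazy; the action reduce() triggers execution
    val sumOfSquares = numbers.map(n => n * n).reduce(_ + _)
    println(s"sum of squares: $sumOfSquares") // prints 385

    sc.stop()
  }
}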
Example 1
Source File: DeltaQA.scala From spark-tools with Apache License 2.0 | 12 votes |
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff = left - right
    val error = diff * diff
    DeltaCommon(
      count = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff = diff,
      error = error,
      left = left,
      right = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {

    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id.copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id.copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] = sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))
  }
}
Example 2
Source File: Test1.scala From BigData-News with Apache License 2.0 | 12 votes |
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)

    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)

    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)

    rdd1.join(rdd2)
      .foreach(x => println("学号: " + x._1 + " 名字: " + x._2._1 + " 分数: " + x._2._2))
  }
}
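For the sample data above, the pair-RDD join keeps only keys present in both RDDs, so the example prints three lines, one per student: (1, (东方不败, 99)), (2, (令狐冲, 98)) and (3, (林平之, 97)). The printed labels 学号, 名字 and 分数 mean student ID, name and score.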
Example 3
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
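The listing above shows only the SparkSessionSingleton helper; in the full SqlNetworkWordCount example it is used inside foreachRDD to turn each micro-batch RDD into a DataFrame and query it with SQL. A hedged sketch of that usage pattern follows, reusing the imports from the listing above; the host, port and batch interval are placeholders.

// Sketch only: reconstructs the typical usage of the singleton shown above.
case class Record(word: String)

val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(2))

val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))

words.foreachRDD { (rdd: RDD[String], time: Time) =>
  // reuse one SparkSession instead of creating a new one per micro-batch
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  import spark.implicits._

  rdd.map(w => Record(w)).toDF().createOrReplaceTempView("words")
  val wordCounts = spark.sql("select word, count(*) as total from words group by word")
  println(s"========= $time =========")
  wordCounts.show()
}

ssc.start()
ssc.awaitTermination()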
Example 4
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
}
Example 5
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.lib.TriangleCount import org.apache.spark.graphx.util.GraphGenerators import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphGeneration extends App { val conf = new SparkConf() .setAppName("Graph generation") .setMaster("local[4]") val sc = new SparkContext(conf) val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt") val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line => val field = line.split(" ") (field(0).toLong, field(1).toLong) } val edgeTupleGraph = Graph.fromEdgeTuples( rawEdges=rawEdges, defaultValue="") val gridGraph = GraphGenerators.gridGraph(sc, 5, 5) val starGraph = GraphGenerators.starGraph(sc, 11) val logNormalGraph = GraphGenerators.logNormalGraph( sc, numVertices = 20, mu=1, sigma = 3 ) logNormalGraph.outDegrees.map(_._2).collect().sorted val actorGraph = GraphLoader.edgeListFile( sc, "./ca-hollywood-2009.txt", true ).partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.edges.count() val actorComponents = actorGraph.connectedComponents().cache actorComponents.vertices.map(_._2).distinct().count val clusterSizes =actorComponents.vertices.map( v => (v._2, 1)).reduceByKey(_ + _) clusterSizes.map(_._2).max clusterSizes.map(_._2).min val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt") val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5) strongComponents.vertices.map(_._2).distinct().count val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges() val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.triangleCount() val triangles = TriangleCount.runPreCanonicalized(partitionedGraph) actorGraph.staticPageRank(10) val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001) actorPrGraph.vertices.reduce((v1, v2) => { if (v1._2 > v2._2) v1 else v2 }) actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println) actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10) actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count }
Example 6
Source File: PipePrintSampleCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipePrintSampleCorpus(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]]
  with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext => {
        val sample: Array[Tuple] = pc.corpus.takeSample(false, count)
        val table: Seq[Seq[String]] = createTupleTable(sample)
        log.info("Corpus sample of " + sample.size + " tuples: ")
        Table.printTable(table)
      }
    }
  }
}

object PipePrintSampleCorpus {
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintSampleCorpus(count)
  }
}
Example 7
Source File: PipeContextReadCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.PipeElement

import scala.reflect.ClassTag

class PipeContextReadCorpus[A: ClassTag] extends PipeElement[RDD[A], RDD[Tuple]] {

  def step(input: RDD[A])(implicit pipeContext: AbstractPipeContext): RDD[Tuple] = {
    pipeContext match {
      case pc: CorpusContext => pc.corpus
    }
  }
}

object PipeContextReadCorpus {
  def apply[A]() = new PipeContextReadCorpus()
}
Example 8
Source File: PipeAnalyseCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.visualisation.model.ReadingModel import de.unihamburg.vsis.sddf.pipe.context.ResultContext class PipeAnalyseCorpus extends PipeElementPassthrough[RDD[Tuple]] with Serializable { override val _analysable = new ReadingModel def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.tuples_=(input) pipeContext match { case pc: ResultContext => { pc.readingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseCorpus { def apply() = { new PipeAnalyseCorpus() } }
Example 9
Source File: PipeStoreInContextGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeStoreInContextGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => pc.goldstandard = input
    }
  }
}

object PipeStoreInContextGoldstandard {
  def apply() = new PipeStoreInContextGoldstandard()
}
Example 10
Source File: PipeReaderGoldstandardIdsPairs.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeReaderGoldstandardIdsPairs( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic) extends PipeElement[RDD[String], RDD[SymPair[Long]]] { override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Long]] = { inputRdd.map(line => { val parts = line.split(separator) val tupleId1 = idConverter.convert(parts(idIndex1).replaceAll("[^0-9]","")) val tupleId2 = idConverter.convert(parts(idIndex2).replaceAll("[^0-9]","")) new SymPair(tupleId1, tupleId2) }) } } object PipeReaderGoldstandardIdsPairs { def apply( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic) = { new PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter) } }
Example 11
Source File: PipeReaderGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.Pipeline import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple object PipeReaderGoldstandardPairs { def apply( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = { PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter) .append(PipeReaderGoldstandardIdToTuple()) } } object PipeReaderGoldstandardCluster { def apply( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = { PipeReaderGoldstandardIdsCluster(separator, clusterIdIndex, tupleIdIndex, idConverter) .append(PipeReaderGoldstandardIdToTuple()) } }
Example 12
Source File: PipeAnalyseGoldstandardCluster.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardClusterModel class PipeAnalyseGoldstandardCluster extends PipeElementPassthrough[RDD[Seq[Long]]] { override val _analysable = new GoldstandardClusterModel def substep(input: RDD[Seq[Long]])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.goldstandard = input pipeContext match { case pc: ResultContext => { pc.goldstandardModelCluster = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseGoldstandardCluster { def apply() = new PipeAnalyseGoldstandardCluster() }
Example 13
Source File: PipePrintSampleGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipePrintSampleGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler { def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val sample: Array[SymPair[Tuple]] = pc.goldstandard.takeSample(false, count) val table: Seq[Seq[String]] = createSymPairTable(sample) log.info("Goldstandard sample of " + sample.size + " tuples: ") Table.printTable(table) } } } } object PipePrintSampleGoldstandard { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintSampleGoldstandard(count) } }
Example 14
Source File: PipeReaderGoldstandardClusterOutput.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import java.util.regex.PatternSyntaxException import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import de.unihamburg.vsis.sddf.SddfContext.rddToRdd import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair class PipeReaderGoldstandardClusterOutput( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic) extends PipeElement[RDD[String], RDD[Seq[Long]]] { override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = { // parse tuple ids val clusterIdTupleIdRdd = inputRdd.map(line => { val parts = line.split(separator) val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]","")) val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]","")) (clusterId, tupleId) }) clusterIdTupleIdRdd.groupByKey().map(_._2.toSeq) } } object PipeReaderGoldstandardClusterOutput { def apply( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic) = { new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter) } }
Example 15
Source File: PipeAnalyseGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardModel class PipeAnalyseGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable = new GoldstandardModel def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.goldstandard = input pipeContext match { case pc: ResultContext => { pc.goldstandardModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseGoldstandard { def apply() = new PipeAnalyseGoldstandard() }
Example 16
Source File: PipePrintHeadGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipePrintHeadGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[SymPair[Tuple]]] with PipeSampler { def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val sample: Array[SymPair[Tuple]] = pc.goldstandard.take(count) val table: Seq[Seq[String]] = createSymPairTable(sample) log.info("Goldstandard sample of " + sample.size + " tuples: ") Table.printTable(table) } } } } object PipePrintHeadGoldstandard { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintHeadGoldstandard(count) } }
Example 17
Source File: PipePrintHeadTuple.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.print import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table class PipePrintHeadTuple(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler { def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { val sample: Array[Tuple] = input.take(count) val table: Seq[Seq[String]] = createTupleTable(sample) log.info("Sample of " + sample.size + " tuples: ") Table.printTable(table) } } object PipePrintHeadTuple { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintHeadTuple(count) } }
Example 18
Source File: PipeWordcount.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeWordcount() extends PipeElement[RDD[String], RDD[(String, Int)]] {

  def step(input: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[(String, Int)] = {
    // flatten the collection of word arrays
    val words = input.flatMap(line => line.split(" "))
    // initialize the counter of each word with one
    val wordsWithCounter = words.map(word => (word, 1))
    // add up all counters of the same word
    wordsWithCounter.reduceByKey(_ + _)
  }
}

// companion object for a better usability
object PipeWordcount {
  def apply() = new PipeWordcount()
}
Example 19
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.aggregator.Mean import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipeClusteringGraph extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]] with Serializable { def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = { val duplicatePairsWithSimilarity = input.map( pair => (pair._1, Mean.agrSimilarity(pair._2)) ) val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map( pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) } ) // TODO optimize: it would be nice to build the graph only by using edge triplets // but as far as I know that's not possible val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap( tuplePair => Seq(tuplePair._1, tuplePair._2) ).map(tuple => (tuple.id, tuple)) // delete all duplicate vertices val vertices = verticesNotUnique.distinct() // The edge type Boolean is just a workaround because no edge types are needed val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null) cluster(graph) } }
Example 20
Source File: PipeAnalyseClustering.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext class PipeAnalyseClustering extends PipeElementPassthrough[RDD[Set[Tuple]]] { override val _analysable = new ClusterModel def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with ResultContext => { _analysable.clusters = input _analysable.goldstandard = pc.goldstandard pc.clusterModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseClustering { def apply() = { new PipeAnalyseClustering() } }
Example 21
Source File: PipeWriterTupleCluster.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeWriterTupleCluster(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[Set[Tuple]]] { def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { val writer = new TupleWriterFile(file, separator) // TODO write tuples to hdfs in parallel and merge them afterwards val collected = input.collect() collected.foreach(set => { set.foreach(tuple => { writer.writeTuple(tuple) }) writer.blankLine() }) writer.close() } } object PipeWriterTupleCluster { def apply(file: File, separator: Char = ',') = { new PipeWriterTupleCluster(file, separator) } }
Example 22
Source File: PipeWriterTuplePairs.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeWriterTuplePairs(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { val writer = new TupleWriterFile(file, separator) val collected = input.collect() collected.foreach(pair => { writer.writeTuple(pair._1) writer.writeTuple(pair._2) writer.blankLine() }) writer.close() } } object PipeWriterTuplePairs { def apply(file: File, separator: Char = ',') = { new PipeWriterTuplePairs(file, separator) } }
Example 23
Source File: ClusterWriterCsvFile.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import java.io.FileWriter import org.apache.spark.rdd.RDD import com.opencsv.CSVWriter import de.unihamburg.vsis.sddf.reading.Tuple class ClusterWriterCsvFile(file: File, separator: Char = ',') { // create folders file.getParentFile().mkdirs() def this(path: String) = { this(new File(path)) } def this(folder: String, file: String) = { this(new File(folder, file)) } def write(clusterRdd: RDD[Set[Tuple]]): Unit = { val collectedClusters = clusterRdd.collect() val writer = new CSVWriter(new FileWriter(file), separator); // feed in your array (or convert your data to an array) collectedClusters.foreach(set => { val tupleIdSet: Set[String] = set.map(tuple => tuple.id.toString()) val tupleIdArray: Array[String] = tupleIdSet.toArray writer.writeNext(tupleIdArray) }) writer.close() } }
Example 24
Source File: TupleWriterFile.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import java.io.FileWriter import org.apache.spark.rdd.RDD import com.opencsv.CSVWriter import de.unihamburg.vsis.sddf.reading.Tuple class TupleWriterFile(file: File, separator: Char = ',') { val writer = new CSVWriter(new FileWriter(file), separator); def writeTuple[A <: Tuple](tuple: A): Unit = { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) } def close() = { writer.close() } def blankLine() = { writer.writeNext(Array()) } def writeTuple[A <: Tuple](tuples: Traversable[A]): Unit = { tuples.foreach(tuple => { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) }) } def writeTuple[A <: Tuple](tuples: RDD[A]): Unit = { val collectedTuples = tuples.collect() collectedTuples.foreach(tuple => { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) }) } }
Example 25
Source File: DummyIndexer.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

class PipeIndexerDummy extends IndexingPipe {

  override val name = "DummyIndexer"

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Tuple]] = {
    val cartesian = input.cartesian(input).map(new SymPair(_))
    // filter identities like (a,a) and symmetric duplicates like (a,b) && (b,a)
    cartesian.filter(pair => pair._1 != pair._2).distinct()
  }
}

object PipeIndexerDummy {
  def apply() = {
    new PipeIndexerDummy()
  }
}
Example 26
Source File: PipeAnalyseIndexer.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel class PipeAnalyseIndexer extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable: IndexingModel = new IndexingModel def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: CorpusContext with ResultContext => { _analysable.pairs = input _analysable.corpus = pc.corpus pc.indexingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseIndexer { def apply() = new PipeAnalyseIndexer }
Example 27
Source File: PipeIndexerSortedNeighborhood.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSortedNeighborhood
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

object PipeIndexerSortedNeighborhood {

  def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = {
    PipeBlockerSortedNeighborhood(windowSize)
      .append(SortedNeighborhoodIndexer())
  }

  // Note: in the original source this helper belongs to a declaration that was cut from
  // the listing; it is placed in the companion object here so the snippet stays well-formed.
  def calcPairCount(elementCount: Int, windowSize: Int): Int = {
    val windowCount = elementCount - windowSize + 1
    val firstWindowPairs = (windowSize * (windowSize - 1)) / 2
    val lastWindowPairs = (windowCount - 1) * (windowSize - 1)
    firstWindowPairs + lastWindowPairs
  }
}
Example 28
Source File: PipeAnalyseIndexerExtended.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.IndexingModelExtended class PipeAnalyseIndexerExtended extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable: IndexingModelExtended = new IndexingModelExtended def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with CorpusContext with ResultContext => { _analysable.pairs = input _analysable.goldstandard = pc.goldstandard _analysable.corpus = pc.corpus pc.indexingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseIndexerExtended { def apply() = new PipeAnalyseIndexerExtended }
Example 29
Source File: PipeAnalyseBlocker.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.visualisation.model.BlockingModel class PipeAnalyseBlocker extends PipeElementPassthrough[RDD[Seq[Tuple]]] { override val _analysable: BlockingModel = new BlockingModel def substep(input: RDD[Seq[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with CorpusContext with ResultContext => { _analysable.blocks = input pc.blockingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseBlocker { def apply() = new PipeAnalyseBlocker }
Example 30
Source File: PipeBlockerStandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// The class header was cut from this listing; the declaration below is reconstructed from the
// companion object and the imports (the BlockingPipe, Parameterized and Logging mixins are assumptions).
class PipeBlockerStandard(implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = input.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey
    keyBlocks.map(_._2.toSeq).filter(_.size > 1)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name

  override val name = "StandardBlocker"
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)
}

object PipeBlockerStandard {
  def apply(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerStandard()
  }
}
Example 31
Source File: PipeBlockerSortedNeighborhood.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking import org.apache.spark.mllib.rdd.RDDFunctions.fromRDD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable class PipeBlockerSortedNeighborhood(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) extends BlockingPipe with Parameterized { def step(tuples: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = { val bkvTuplePairs: RDD[(String, Tuple)] = tuples.map(t => (bkvBuilder.buildBlockingKey(t), t)) val sortedPairs = bkvTuplePairs.sortByKey().map(_._2) sortedPairs.sliding(windowSize).map(_.toSeq) } @transient override val _analysable = new AlgoAnalysable _analysable.algo = this _analysable.name = this.name override val name = "SortedNeighborhoodBlocker" override val paramMap = Map("windowSize" -> windowSize, "BlockingKeyBuilder" -> bkvBuilder) } object PipeBlockerSortedNeighborhood { def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = { new PipeBlockerSortedNeighborhood(windowSize) } }
Example 32
Source File: PipeBlockerSuffixArray.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// The class header and the suffix-array blocking step itself were cut from this listing; the
// declaration below is reconstructed from the companion object (the mixins are assumptions).
class PipeBlockerSuffixArray(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
  implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  // ... the blocking step method is not shown in this listing ...

  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
      false
    } else if (tupleCount < 2) {
      false
    } else {
      true
    }
  }
}

object PipeBlockerSuffixArray {
  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
  }
}
Example 33
Source File: SddfPipeContext.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.context import org.apache.spark.rdd.RDD import org.joda.time.Period import de.unihamburg.vsis.sddf.visualisation.ModelRouter import de.unihamburg.vsis.sddf.visualisation.logger.ModelRouterLogging class SddfPipeContext( val name: String = "Unnamed Pipeline", modelRouter: ModelRouter = ModelRouterLogging) extends AbstractPipeContext(modelRouter) with CorpusContext with GoldstandardContext with ResultContext { var runtime: Option[Period] = None var filepath: Option[String] = None val persistedRDDs = new scala.collection.mutable.HashMap[String, RDD[_]]() }
Example 34
Source File: PipeOptimizeUnpersist.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.optimize import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext class PipeOptimizeUnpersist[A](rddname: String) extends PipeElementPassthrough[RDD[A]] { def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: SddfPipeContext => { val rddOption = pc.persistedRDDs.get(rddname) if (rddOption.isDefined) { rddOption.get.unpersist() analysable.values += ("RDD unpersisted" -> rddname) } else { log.warn("Can't unpersist RDD with the name " + rddname) } } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeOptimizeUnpersist { def apply[A](rddname: String) = { new PipeOptimizeUnpersist[A](rddname) } }
Example 35
Source File: PipeOptimizePersistAndName.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.optimize import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext class PipeOptimizePersistAndName[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PipeElementPassthrough[RDD[A]] { def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: SddfPipeContext => { input.persist(newLevel) if(rddname != null){ input.name = rddname pc.persistedRDDs += (rddname -> input) analysable.values += ("name" -> rddname) } } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeOptimizePersistAndName { def apply[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) = { new PipeOptimizePersistAndName[A](rddname, newLevel) } }
Example 36
Source File: RddUtils.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.sparkextensions

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object RddUtils {

  def securlyZipRdds[A, B: ClassTag](rdd1: RDD[A], rdd2: RDD[B]): RDD[(A, B)] = {
    val rdd1Repartitioned = rdd1.repartition(1)
    val rdd2Repartitioned = rdd2.repartition(1)
    val (rdd1Balanced, rdd2Balanced) = balanceRddSizes(rdd1Repartitioned, rdd2Repartitioned)
    rdd1Balanced.zip(rdd2Balanced)
  }

  def balanceRddSizes[A, B](rdd1: RDD[A], rdd2: RDD[B]): (RDD[A], RDD[B]) = {
    val rdd1count = rdd1.count()
    val rdd2count = rdd2.count()
    val difference = math.abs(rdd1count - rdd2count).toInt
    if (rdd1count > rdd2count) {
      (removeRandomElements(rdd1, difference), rdd2)
    } else if (rdd2count > rdd1count) {
      (rdd1, removeRandomElements(rdd2, difference))
    } else {
      (rdd1, rdd2)
    }
  }

  def removeRandomElements[A](rdd: RDD[A], numberOfElements: Int): RDD[A] = {
    val sample: Array[A] = rdd.takeSample(false, numberOfElements)
    val set: Set[A] = Set(sample: _*)
    rdd.filter(x => if (set.contains(x)) false else true)
  }
}
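A short usage sketch for the helper above; the sample data and local setup are assumptions. RDD.zip normally requires both RDDs to contain the same number of elements per partition, so securlyZipRdds first repartitions both sides to a single partition and randomly trims the larger RDD before zipping.

import org.apache.spark.{SparkConf, SparkContext}

import de.unihamburg.vsis.sddf.sparkextensions.RddUtils

object RddUtilsUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("zip-demo"))

    val left = sc.parallelize(Seq("a", "b", "c", "d"))
    val right = sc.parallelize(Seq(1, 2, 3))

    // left.zip(right) would fail because the RDDs differ in length;
    // securlyZipRdds drops one random surplus element from the larger RDD first
    val zipped = RddUtils.securlyZipRdds(left, right)
    zipped.collect().foreach(println) // three (String, Int) pairs

    sc.stop()
  }
}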
Example 37
Source File: PipePrintHeadFalsePositives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintHeadFalsePositives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { input.subtract(goldstandard) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.take(count) } def logMessage(count: Int): String = { "Printing " + count + " first false positives. (duplicate pairs which were not found)" } } object PipePrintHeadFalsePositives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintHeadFalsePositives(count) } }
Example 38
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.NaiveBayesModel class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("lambda", lambda)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = NaiveBayes.train(trainingData, lambda) log.debug("Classification Model:" + model) log.debug("Classification Model labels :" + model.labels.mkString(" ")) log.debug("Classification Model pi: " + model.pi.mkString(" ")) log.debug("Classification Model theta: " + model.theta.foreach(_.mkString(" "))) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationNaiveBayes { def apply(lambda: Double = 1.0) = { new PipeClassificationNaiveBayes(lambda) } }
Example 39
Source File: PipeClassificationTrainingDataGenerator.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.compat.Platform

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator
import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeClassificationTrainingDataGenerator(
  truePositiveCount: Int = 500,
  trueNegativeCount: Int = 500)(
  implicit featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])]
  with Logging {

  override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext => {
        var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble
        var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble
        log.debug("True positive pair fraction taken from the gold standard for training purposes: "
          + truePositiveFraction)
        log.debug("True negative pair fraction taken from the corpus for training purposes: "
          + trueNegativeFraction)
        if (truePositiveFraction > 1.0) {
          truePositiveFraction = 1.0
          log.debug("True positive pair fraction limited to 1.0")
        }
        if (trueNegativeFraction > 1.0) {
          trueNegativeFraction = 1.0
          log.debug("True negative pair fraction limited to 1.0")
        }
        val result = generateTrainingData(pc.corpus, pc.goldstandard,
          truePositiveFraction, trueNegativeFraction)
        (input, result)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

  // generateTrainingData(corpus, goldstandard, truePositiveFraction, trueNegativeFraction)
  // is defined in the original class but was cut from this listing.
}

object PipeClassificationTrainingDataGenerator {

  val All = -1

  def apply(
    truePositiveCount: Int = 500,
    trueNegativeCount: Int = 500)(
    implicit featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount)
  }
}
Example 40
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable import de.unihamburg.vsis.sddf.Parameterized import org.apache.spark.mllib.classification.ClassificationModel class PipeClassificationDecisionTree( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = DecisionTree.trainClassifier(trainingData, numClasses = 2, categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins) log.debug("Decision Tree Model:" + model) log.debug("Decision Tree:" + model.toDebugString) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationDecisionTree { def apply( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) = { new PipeClassificationDecisionTree(impurity, maxDepth, maxBins) } }
Example 41
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.SVMWithSGD class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("numIterations", numIterations)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = SVMWithSGD.train(trainingData, numIterations) log.debug("Classification Model:" + model) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationSvm { def apply(numIterations: Int = 100) = { new PipeClassificationSvm(numIterations) } }
Example 42
Source File: PipePrintHeadFalseNegatives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintHeadFalseNegatives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { goldstandard.subtract(input) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.take(count) } def logMessage(count: Int): String = { "Printing " + count + " first false negatives. (duplicate pairs which are no duplicates)" } } object PipePrintHeadFalseNegatives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintHeadFalseNegatives(count) } }
Example 43
Source File: PipePrintSampleFalseNegatives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintSampleFalseNegatives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { goldstandard.subtract(input) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.takeSample(false, count) } def logMessage(count: Int): String = { "Sampling " + count + " false negatives. (duplicate pairs which are no duplicates)" } } object PipePrintSampleFalseNegatives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintSampleFalseNegatives(count) } }
Example 44
Source File: PipeAnalyseClassificationTraining.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel class PipeAnalyseClassificationTraining extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] { override val _analysable: TrainingSetModel = new TrainingSetModel def substep( input: (SymPairSim, RDD[LabeledPoint]))( implicit pipeContext: AbstractPipeContext): Unit = { _analysable.trainingsSetLabeled = input._2 pipeContext match { case pc: ResultContext => { pc.trainingSetModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseClassificationTraining { def apply() = new PipeAnalyseClassificationTraining }
Example 45
Source File: PipePrintSampleFalsePositives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintSampleFalsePositives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { input.subtract(goldstandard) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.takeSample(false, count) } def logMessage(count: Int): String = { "Sampling " + count + " false positives. (duplicate pairs which were not found)" } } object PipePrintSampleFalsePositives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintSampleFalsePositives(count) } }
Example 46
Source File: AbstractPipeClassification.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable abstract class AbstractPipeClassification() extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim] with Parameterized { override val _analysable = new AlgoAnalysable _analysable.algo = this def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = { pipeContext match { case pc: CorpusContext with GoldstandardContext => { val symPairSim = input._1 val trainingsSet = input._2 val prediction = trainModelAndClassify(trainingsSet, symPairSim) val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2)) duplicatePairs } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } }
Example 47
Source File: AbstractPipePrintFalseTuples.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.SparkContext.rddToPairRDDFunctions import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipePrintFalseTuples( count: Int)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends PipeElementPassthrough[RDD[(SymPair[Tuple], Array[Double])]] with PipeSampler { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]): RDD[SymPair[Tuple]] def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]): Array[(SymPair[Tuple], Array[Double])] def logMessage(count: Int): String def substep(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val falseTuples = selectFalseTuples(pc.goldstandard, input.map(_._1)) if (falseTuples.count > 0) { val dummyValue: RDD[(SymPair[Tuple], Int)] = falseTuples.map((_, 1)) val join: RDD[(SymPair[Tuple], (Int, Option[Array[Double]]))] = dummyValue.leftOuterJoin(input) val falsePositivesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])] = join.map(pair => { (pair._1, pair._2._2.getOrElse(Array())) }) val falseTuplesSample = filterFalseTuplesForOutput(falsePositivesWithSimilarity) val table = createSymPairSimVectorTable(falseTuplesSample) log.info(logMessage(count)) Table.printTable(table) } else { log.info(logMessage(0)) } } } } }
Example 48
Source File: ExactDuplicateFilter.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.tools import java.io.File import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.logging.Logging import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.corpus.PipeStoreInContextCorpus import de.unihamburg.vsis.sddf.reading.corpus.PipePrintSampleCorpus import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv import de.unihamburg.vsis.sddf.writing.TupleWriterFile object ExactDuplicateFilter extends App with Logging { if (args.size == 1 && (new File(args(0))).exists()) { val conf = new SparkConf().setAppName("ExactDuplicateFilter") conf.setMaster("local") val sc = new SparkContext(conf) implicit val pipeContext = new SddfPipeContext val Content: (Int, String) = (0, "content") val featureMapping: Map[Int, String] = Map(Content) implicit val featureIdNameMapper = new FeatureIdNameMapping(featureMapping) val inputFileKey = "musicbrainz" // Parse Tuples val allFields: Seq[Int] = Seq(Content._1) val allFieldsWithId: Seq[Int] = Id +: allFields val parserPipe = new PipeTupleParserCsvIdContent(allFieldsWithId) val pipe = parserPipe.append(PipeStoreInContextCorpus()).append(PipePrintSampleCorpus()) pipe.start(sc.textFile(args(0))) val result: RDD[Tuple] = parserPipe.output.get val resultCount = result.count log.info("Lines parsed: " + resultCount) val distinct = result.distinct() val distinctCount = distinct.count log.info("Distinct Lines Count: " + distinctCount) log.info("Lines removed: " + (resultCount - distinctCount)) val tupleWriter = new TupleWriterFile(new File(args(0) + ".distinct")) tupleWriter.writeTuple(distinct) } else { println("Please provide a valid file path.") } } class PipeTupleParserCsvIdContent(featureIds: Seq[Int]) extends PipeReaderTupleCsv(featureIds) { override def extractValues(line: String): Seq[String] = { val splitted = parser.parseLine(line) Seq(splitted.head, splitted.tail.mkString(",")) } }
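The heart of the example above is a plain distinct-and-count pass over an RDD. A minimal self-contained sketch of that pattern, with the input and output paths as placeholder values:

import org.apache.spark.{SparkConf, SparkContext}

object DistinctLineFilterSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DistinctLineFilterSketch").setMaster("local[*]"))
    val lines = sc.textFile("/tmp/records.csv") // placeholder input path
    val total = lines.count()
    val distinctLines = lines.distinct()
    val kept = distinctLines.count()
    println(s"Lines parsed: $total, distinct lines: $kept, removed: ${total - kept}")
    distinctLines.saveAsTextFile("/tmp/records.csv.distinct") // placeholder output path
    sc.stop()
  }
}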
Example 49
Source File: PipeGoldstandardReaderClusterTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.reading.goldstandard import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdToTuple import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdsCluster import de.unihamburg.vsis.sddf.test.util.FixtureHelper import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class PipeReaderGoldstandardClusterTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with FixtureHelper { test("test goldstandard tuple reading in cluster format") { // format clusterId, tupleId val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3")) val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val gsIds = gsReaderPipe.output.get assert(gsIds.count() === 1) val tuples: Seq[Tuple] = initializeTuples(1, 3) pc.corpus = sc.parallelize(tuples) val gsconverterPipe = new PipeReaderGoldstandardIdToTuple gsconverterPipe.start(gsIds) val gsTuple = gsconverterPipe.output.get assert(gsTuple.count() === 1) } test("test goldstandard id reading in cluster format") { // format clusterId, tupleId val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3")) val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val result = gsReaderPipe.output.get assert(result.count() === 1) } test("test goldstandard cluster reader from file") { val input = sc.textFile("src/test/resources/musicbrainz-1000.csv.dup") val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val result = gsReaderPipe.output.get assert(result.collect().size === 13) } }
Example 50
Source File: StrongestPathClusteringTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.clustering import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.clustering.PipeClusteringStrongestPath import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.FixtureHelper import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class StrongestPathClusteringTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with FixtureHelper { test("simple cluster test") { val pair1 = (createTuplePair(1, 2), Array(0.4, 0.6)) val pair2 = (createTuplePair(2, 4), Array(0.1, 0.2)) val pair3 = (createTuplePair(4, 3), Array(0.6, 0.8)) val pair4 = (createTuplePair(3, 1), Array(0.0, 0.2)) val pairs: RDD[(SymPair[Tuple], Array[Double])] = sc.parallelize(Seq(pair1, pair2, pair3, pair4)) val clusterer = new PipeClusteringStrongestPath clusterer.start(pairs) val clusterResult: Array[Set[Tuple]] = clusterer.output.get.collect() val expectedResult = Array(Set(pair1._1._1, pair1._1._2), Set(pair3._1._1, pair3._1._2)) assert(clusterResult === expectedResult) } }
Example 51
Source File: ClusterAnalyserTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.evaluation

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel

class ClusterAnalyserTest extends FunSuite with LocalSparkContext with FixtureHelper {

  test("Precision and recall test") {
    val analyser = new ClusterModel
    analyser.clusters = buildClusters()
    analyser.goldstandard = buildGoldstandard()
    assert(analyser.precision === 0.2857142857142857) // should be 2/7
    assert(analyser.recall === 0.6666666666666666) // should be 2/3
  }

  def buildClusters(): RDD[Set[Tuple]] = {
    val cluster1 = initializeTuples(0, 2).toSet
    val cluster2 = initializeTuples(3, 4).toSet
    val cluster3 = initializeTuples(5, 7).toSet
    sc.parallelize(Seq(cluster1, cluster2, cluster3))
  }

  def buildGoldstandard(): RDD[SymPair[Tuple]] = {
    val pair1 = createTuplePair(0, 1)
    val pair2 = createTuplePair(4, 7)
    val pair3 = createTuplePair(6, 7)
    sc.parallelize(Seq(pair1, pair2, pair3))
  }
}
Example 52
Source File: SparkApiTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test import org.apache.spark.rdd.RDD import org.scalatest.Finders import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.pairToInt import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.MusicbrainzSchema class SparkApiTest extends FunSuite with LocalSparkContext with MusicbrainzSchema { test("test rdd substraction") { val file1 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup") val file2 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup") val data1 = parseTuples(file1) assert(data1.count() === 10) val data2 = parseTuples(file2) assert(data2.count() === 10) val substraction = data1.subtract(data2) assert(substraction.count() === 0) } }
Example 53
Source File: PipeDecisionTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.LocalSparkContext class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{ var input: (SymPairSim, RDD[LabeledPoint]) = _ override def beforeAll() { super.beforeAll() val tuple1 = Tuple("test1","test1","test1") tuple1.id = 1 val tuple2 = Tuple("test2","test2","test2") tuple2.id = 2 val tuple3 = Tuple("hans","franz","wurst") tuple3.id = 3 val symPairSim: SymPairSim = sc.parallelize(Seq( (new SymPair(tuple1, tuple2), Array(1D,1D,0D)) ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D)) )) val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq( LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91))) )) input = (symPairSim, trainingData) } override def afterAll() { super.afterAll() } test("naive bayes classification test") { val classificationPipe = new PipeClassificationNaiveBayes() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("svm classification test") { val classificationPipe = new PipeClassificationSvm() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("decision tree classification test") { val classificationPipe = new PipeClassificationDecisionTree() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } }
Example 54
Source File: MusicbrainzSchema.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.util import org.apache.spark.rdd.RDD import org.scalatest.Suite import de.unihamburg.vsis.sddf.SddfContext.pairToInt import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv trait MusicbrainzSchema extends TestSddfPipeContext { self: Suite => val Number = (0, "number") val Title = (1, "title") val Length = (2, "length") val Artist = (3, "artist") val Album = (4, "album") val Year = (5, "year") val Language = (6, "language") val featureIdNameMapping = Map(Number, Title, Length, Artist, Album, Year, Language) implicit val featureIdNameMapper = new FeatureIdNameMapping(featureIdNameMapping) def parseTuples(input: RDD[String]) = { // Parse Tuples val allFields: Seq[Int] = Seq(Number, Title, Length, Artist, Album, Year, Language) val allFieldsWithId: Seq[Int] = Ignore +: Id +: Ignore +: allFields val pipe = PipeReaderTupleCsv(allFieldsWithId) .append(PipePreprocessorTrim(allFields: _*)) .append(PipePreprocessorRemoveRegex("[^0-9]", Number, Year, Length)) pipe.run(input) } }
Example 55
Source File: SortedNeighbourhoodBlockerTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.blocking import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import org.scalatest.Matchers import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood class SortedNeighborhoodIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with Matchers { test("testing whole Sorted Neighborhood Indexer") { val featureId = 1 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blubluba") tuple1.id = 1 val tuple2: Tuple = new TupleArray(1) tuple2.addFeature(0, "blubluba") tuple2.id = 2 val tuple3: Tuple = new TupleArray(1) tuple3.addFeature(0, "blubluba") tuple3.id = 3 val tuple4: Tuple = new TupleArray(1) tuple4.addFeature(0, "blubluba") tuple4.id = 4 val tuple5: Tuple = new TupleArray(1) tuple5.addFeature(0, "blubluba") tuple5.id = 5 val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3, tuple4, tuple5)) val indexer = PipeIndexerSortedNeighborhood(windowSize = 3) val blockingResult: RDD[SymPair[Tuple]] = indexer.run(tuples) assert(blockingResult.count === 7) val resultArray = blockingResult.collect() resultArray.foreach(println(_)) val expectedResult = Seq( new SymPair(tuple1, tuple2), new SymPair(tuple1, tuple3), new SymPair(tuple2, tuple3), new SymPair(tuple2, tuple4), new SymPair(tuple3, tuple4), new SymPair(tuple3, tuple5), new SymPair(tuple4, tuple5) ) resultArray should contain theSameElementsAs expectedResult } }
Example 56
Source File: SuffixArrayBlockingTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.blocking import org.apache.spark.rdd.RDD import org.scalatest.Finders import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.indexing.PipeIndexerSuffixArray import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSuffixArray import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class SuffixArrayIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext { test("testing suffix calculation") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blockingkeyvalue") tuple1.id = 1 val tuples: RDD[Tuple] = sc.parallelize(Seq(tuple1)) val sab = PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12) val suffixTuplePairs: Seq[(String, Tuple)] = sab.calcSuffixes(("blockingkeyvalue", tuple1)) // println(suffixTuplePairs.map(_._1).mkString("\n")) assert(suffixTuplePairs.length === 13) } test("testing filter blocks") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blockingkeyvalue") tuple1.id = 1 val tuples = sc.parallelize(Seq(tuple1)) val sab = new PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 4) val suffixTuplePair = ("bla", Seq(tuple1, tuple1, tuple1, tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair) === false) val suffixTuplePair2 = ("bla", Seq(tuple1, tuple1, tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair2) === true) val suffixTuplePair3 = ("bla", Seq(tuple1)) assert(sab.filterBlocks(suffixTuplePair3) === false) val suffixTuplePair4 = ("bla", Seq(tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair4) === true) } test("testing whole SAB") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blubluba") tuple1.id = 1 val tuple2: Tuple = new TupleArray(1) tuple2.addFeature(0, "blubluba") tuple2.id = 2 val tuple3: Tuple = new TupleArray(1) tuple3.addFeature(0, "blubluba") tuple3.id = 3 val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3)) val sab = PipeIndexerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12) val blockingResult: RDD[SymPair[Tuple]] = sab.run(tuples) // print(blockingResult.collect().map(symPair => (symPair._1.id, symPair._2.id)).mkString("\n")) assert(blockingResult.count === 3) } }
Example 57
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 58
Source File: TestFFM.scala From spark-ffm with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification._ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.DenseVector import org.apache.spark.rdd.RDD object TestFFM extends App { override def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TESTFFM").setMaster("local[4]")) if (args.length != 8) { println("testFFM <train_file> <k> <n_iters> <eta> <lambda> " + "<normal> <random>") } val data= sc.textFile(args(0)).map(_.split("\\s")).map(x => { val y = if(x(0).toInt > 0 ) 1.0 else -1.0 val nodeArray: Array[(Int, Int, Double)] = x.drop(1).map(_.split(":")).map(x => { (x(0).toInt, x(1).toInt, x(2).toDouble) }) (y, nodeArray) }).repartition(4) val splits = data.randomSplit(Array(0.7, 0.3)) val (training: RDD[(Double, Array[(Int, Int, Double)])], testing) = (splits(0), splits(1)) //sometimes the max feature/field number would be different in training/testing dataset, // so use the whole dataset to get the max feature/field number val m = data.flatMap(x=>x._2).map(_._1).collect.reduceLeft(_ max _) //+ 1 val n = data.flatMap(x=>x._2).map(_._2).collect.reduceLeft(_ max _) //+ 1 val ffm: FFMModel = FFMWithAdag.train(training, m, n, dim = (args(6).toBoolean, args(7).toBoolean, args(1).toInt), n_iters = args(2).toInt, eta = args(3).toDouble, regParam = (args(4).toDouble, args(5).toDouble), normalization = false, false, "adagrad") val scores: RDD[(Double, Double)] = testing.map(x => { val p = ffm.predict(x._2) val ret = if (p >= 0.5) 1.0 else -1.0 (ret, x._1) }) val metrics = new BinaryClassificationMetrics(scores) val auROC = metrics.areaUnderROC val auPRC = metrics.areaUnderPR val accuracy = scores.filter(x => x._1 == x._2).count().toDouble / scores.count() println(s"accuracy = $accuracy, Area under ROC = $auROC, Area under precision-recall curve = $auPRC") } }
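The evaluation step above only needs an RDD of (score, label) pairs. A standalone sketch of that part with made-up predictions, using 0.0/1.0 labels:

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.{SparkConf, SparkContext}

object MetricsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MetricsSketch").setMaster("local[2]"))
    // (score, label) pairs; the values here are synthetic
    val scoreAndLabels = sc.parallelize(Seq(
      (0.9, 1.0), (0.8, 1.0), (0.6, 0.0), (0.4, 1.0), (0.2, 0.0), (0.1, 0.0)
    ))
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    println(s"Area under ROC = ${metrics.areaUnderROC()}")
    println(s"Area under precision-recall curve = ${metrics.areaUnderPR()}")
    sc.stop()
  }
}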
Example 59
Source File: InferSchema.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.spark.excel import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ private[excel] object InferSchema { type CellType = Int private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = { // Defining a function to return the StringType constant is necessary in order to work around // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions; // see issue #128 for more details. def stringType(): DataType = { StringType } if (field == NullType) { typeSoFar } else { (typeSoFar, field) match { case (NullType, ct) => ct case (DoubleType, DoubleType) => DoubleType case (BooleanType, BooleanType) => BooleanType case (TimestampType, TimestampType) => TimestampType case (StringType, _) => stringType() case (_, _) => stringType() } } } private val numericPrecedence: IndexedSeq[DataType] = IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType) val findTightestCommonType: (DataType, DataType) => Option[DataType] = { case (t1, t2) if t1 == t2 => Some(t1) case (NullType, t1) => Some(t1) case (t1, NullType) => Some(t1) case (StringType, t2) => Some(StringType) case (t1, StringType) => Some(StringType) // Promote numeric types to the highest of the two and all numeric types to unlimited decimal case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) => val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2) Some(numericPrecedence(index)) case _ => None } }
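A small sketch of how the type-merging logic above behaves. Since InferSchema is declared private[excel], the sketch assumes it lives in the same package; the expected results follow directly from the case analysis shown above.

package com.webank.wedatasphere.spark.excel

import org.apache.spark.sql.types._

object InferSchemaSketch extends App {
  // Numeric types promote to the highest member of numericPrecedence
  println(InferSchema.findTightestCommonType(IntegerType, LongType))     // Some(LongType)
  println(InferSchema.findTightestCommonType(DoubleType, TimestampType)) // Some(TimestampType)
  // Anything merged with StringType falls back to StringType
  println(InferSchema.findTightestCommonType(StringType, DoubleType))    // Some(StringType)
  // NullType is absorbed by the other side
  println(InferSchema.findTightestCommonType(NullType, BooleanType))     // Some(BooleanType)
}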
Example 60
Source File: DatabaseInteraction.scala From reactive-machine-learning-systems with MIT License | 5 votes |
package com.reactivemachinelearning import com.couchbase.client.java.document.JsonDocument import com.couchbase.client.java.view.ViewQuery import com.couchbase.spark._ import com.reactivemachinelearning.FeatureGeneration.{IntFeature, BooleanFeature, Feature} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object DatabaseInteraction extends App { // Configure Spark val conf = new SparkConf() .setAppName("couchbaseQuickstart") .setMaster("local[*]") .set("com.couchbase.bucket.default", "") // Generate The Context val sc = new SparkContext(conf) val rawSquawks: RDD[JsonDocument] = sc.couchbaseView( ViewQuery.from("squawks", "by_squawk_id")) .map(_.id) .couchbaseGet[JsonDocument]() rawSquawks.foreach(println) def extract(rawSquawks: RDD[JsonDocument]): RDD[IntFeature] = { ??? } def transform(inputFeatures: RDD[IntFeature]): RDD[BooleanFeature] = { ??? } val trainableFeatures = transform(extract(rawSquawks)) }
Example 61
Source File: TestableQueueInputDStream.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.dstream.InputDStream import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag class TestableQueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
Example 62
Source File: StreamingUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.streaming import org.apache.spark.rdd.RDD import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.mutable.Queue class StreamingUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ @transient var sc: SparkContext = null @transient var ssc: StreamingContext = null override def beforeAll(): Unit = { val envMap = Map[String,String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) ssc = new StreamingContext(sc, Milliseconds(200)) } override def afterAll(): Unit = { sc.stop() } test("Streaming word count") { val firstBatchRDD = sc.parallelize(Seq("a", "b", "c")) val secondBatchRDD = sc.parallelize(Seq("a", "e")) val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f")) val forthBatchRDD = sc.parallelize(Seq("a", "e")) val queue = new Queue[RDD[String]] queue.+=(firstBatchRDD) queue.+=(secondBatchRDD) queue.+=(thirdBatchRDD) queue.+=(forthBatchRDD) println(queue) val startTime = System.currentTimeMillis() val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1)) //ssc.queueStream(queue) dstream.checkpoint(Seconds(100)) val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _) val streamTotals = batchTotals.updateStateByKey( (seq:Seq[Int], opt:Option[Int]) => { if (!seq.isEmpty) { val totalCountForNew = seq.reduce(_ + _) if (opt.isEmpty) { Option(totalCountForNew) } else { Option(opt.get + totalCountForNew) } } else { opt } }) streamTotals.foreachRDD(rdd => { }) ssc.checkpoint("./tmp") ssc.start() ssc.awaitTerminationOrTimeout(2000) val endTime = System.currentTimeMillis() val rddList = streamTotals.slice(new Time(startTime), new Time(endTime)) rddList(0).collect().foreach(println) assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1) rddList(1).collect().foreach(println) assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(2).collect().foreach(println) assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(3).collect().foreach(println) assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3) } }
Example 63
Source File: SparkCassRDDFunctions.scala From Spark2Cassandra with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.cassandra.rdd import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.mapper.ColumnMapper import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory } import com.datastax.spark.connector.{ AllColumns, ColumnSelector } import com.github.jparkie.spark.cassandra.SparkCassBulkWriter import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf } import org.apache.spark.rdd.RDD import scala.reflect.runtime.universe._ def bulkLoadToCass( keyspaceName: String, tableName: String, columns: ColumnSelector = AllColumns, sparkCassWriteConf: SparkCassWriteConf = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf), sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf) )(implicit connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf), rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = { val sparkCassBulkWriter = SparkCassBulkWriter( connector, keyspaceName, tableName, columns, sparkCassWriteConf, sparkCassServerConf ) internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _) } }
Example 64
Source File: PointCloudRelation.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.datasource import geotrellis.pointcloud.spark.store.hadoop._ import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions} import geotrellis.pointcloud.util.Filesystem import geotrellis.proj4.CRS import geotrellis.store.hadoop.util.HdfsUtils import geotrellis.vector.Extent import cats.implicits._ import io.pdal._ import io.circe.syntax._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} import java.io.File import scala.collection.JavaConverters._ // This class has to be serializable since it is shipped over the network. class PointCloudRelation( val sqlContext: SQLContext, path: String, options: HadoopOptions ) extends BaseRelation with TableScan with Serializable { @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext // TODO: switch between HadoopPointCloudRDD and S3PointcCloudRDD lazy val isS3: Boolean = path.startsWith("s3") override def schema: StructType = { lazy val (local, fixedPath) = if(path.startsWith("s3") || path.startsWith("hdfs")) { val tmpDir = Filesystem.createDirectory() val remotePath = new Path(path) // copy remote file into local tmp dir val localPath = new File(tmpDir, remotePath.getName) HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration) (true, localPath.toString) } else (false, path) val localPipeline = options.pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => fixedPath.asJson) .top.fold(options.pipeline)(identity) val pl = Pipeline(localPipeline.noSpaces) if (pl.validate()) pl.execute() val pointCloud = try { pl.getPointViews().next().getPointCloud(0) } finally { pl.close() if(local) println(new File(fixedPath).delete) } val rdd = HadoopPointCloudRDD(new Path(path), options) val md: (Option[Extent], Option[CRS]) = rdd .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) } .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) } val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build pointCloud.deriveSchema(metadata) } override def buildScan(): RDD[Row] = { val rdd = HadoopPointCloudRDD(new Path(path), options) rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } } } }
Example 65
Source File: PointCloudToDem.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.dem import io.pdal._ import geotrellis.layer._ import geotrellis.raster._ import geotrellis.spark._ import geotrellis.util._ import geotrellis.vector._ import org.apache.spark.rdd.RDD object PointCloudToDem { def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], tileDimensions: (Int, Int), options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = apply[M](rdd, options) { e => RasterExtent(e, tileDimensions._1, tileDimensions._2) } def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], cellSize: CellSize, options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = apply[M](rdd, options) { e => RasterExtent(e, cellSize) } def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], options: PointToGrid.Options)(createRE: Extent => RasterExtent): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = { val layoutDefinition = rdd.metadata.getComponent[LayoutDefinition] val mapTransform = layoutDefinition.mapTransform val result = rdd .collectNeighbors .mapPartitions({ partition => partition.map { case (key, neighbors) => val extent = mapTransform(key) val raster = PointToGrid.createRaster(neighbors.map(_._2._2), createRE(extent), options) (key, raster.tile) } }, preservesPartitioning = true) ContextRDD(result, layoutDefinition) } }
Example 66
Source File: BufferUnionable.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.buffer import geotrellis.layer._ import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object BufferUnionable { def apply[ K: SpatialComponent, X <: { def union(other: Any): V }, V: (? => X) : ClassTag ](rdd: RDD[(K, V)]): RDD[(K, V)] = { rdd .flatMap({ case (key, data) => val SpatialKey(col, row) = key for (deltaX <- -1 to +1; deltaY <- -1 to +1) yield { if (deltaX == 0 && deltaY == 0) (SpatialKey(col + deltaX, row + deltaY), (key, data, true)) else (SpatialKey(col + deltaX, row + deltaY), (key, data, false)) } }) .groupByKey .filter({ case (_, seq) => seq.exists { case (_, _, center) => center } }) .map({ case (sortKey, seq) => val resultKey = seq.filter({ case (_, _, center) => center }).head._1 val resultValue = seq.map({ case (_, data, _) => data }).reduce(_ union _) (resultKey, resultValue) }) } }
Example 67
Source File: HadoopPointCloudRDD.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.hadoop import geotrellis.pointcloud.spark.store.hadoop.formats._ import geotrellis.store.hadoop._ import geotrellis.vector.Extent import io.circe.Json import io.pdal._ import io.pdal.pipeline._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def apply(path: Path, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(HadoopPointCloudHeader, List[PointCloud])] = { val conf = sc.hadoopConfiguration.withInputDirectory(path, options.filesExtensions) options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _)) options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _)) PointCloudInputFormat.setPipeline(conf, options.pipeline) options.filterExtent match { case Some(filterExtent) => PointCloudInputFormat.setFilterExtent(conf, filterExtent) sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ).filter { case (header, _) => header.extent3D.map(_.toExtent.intersects(filterExtent)).getOrElse(false) } case None => sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ) } } }
Example 68
Source File: S3PointCloudRDD.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.s3 import geotrellis.pointcloud.spark.store.hadoop.formats.PointCloudInputFormat import geotrellis.spark.store.s3._ import geotrellis.store.s3.S3ClientProducer import geotrellis.vector.Extent import io.circe._ import io.pdal._ import io.pdal.pipeline._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import software.amazon.awssdk.services.s3.S3Client def apply(bucket: String, prefix: String, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(S3PointCloudHeader, List[PointCloud])] = { val conf = sc.hadoopConfiguration S3InputFormat.setBucket(conf, bucket) S3InputFormat.setPrefix(conf, prefix) S3InputFormat.setExtensions(conf, options.filesExtensions) S3InputFormat.setCreateS3Client(conf, options.getClient) options.numPartitions.foreach(S3InputFormat.setPartitionCount(conf, _)) options.partitionBytes.foreach(S3InputFormat.setPartitionBytes(conf, _)) options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _)) options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _)) PointCloudInputFormat.setPipeline(conf, options.pipeline) options.filterExtent match { case Some(filterExtent) => PointCloudInputFormat.setFilterExtent(conf, filterExtent) sc.newAPIHadoopRDD( conf, classOf[S3PointCloudInputFormat], classOf[S3PointCloudHeader], classOf[List[PointCloud]] ).filter { case (header, _) => header.extent3D.exists(_.toExtent.intersects(filterExtent)) } case None => sc.newAPIHadoopRDD( conf, classOf[S3PointCloudInputFormat], classOf[S3PointCloudHeader], classOf[List[PointCloud]] ) } } }
Example 69
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
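The MLlib calls above do not depend on Kudu; they only need an RDD[Vector]. A cut-down sketch with a few synthetic vectors standing in for the taxi-trip features:

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object KMeansLocalSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansLocalSketch").setMaster("local[*]"))
    val vectors = sc.parallelize(Seq(
      Vectors.dense(1.0, 1.1), Vectors.dense(0.9, 1.0),
      Vectors.dense(8.0, 8.2), Vectors.dense(7.9, 8.1)
    )).cache()
    // Two centers, ten iterations
    val model = KMeans.train(vectors, 2, 10)
    model.clusterCenters.foreach(c => println(" >> " + c))
    val colStats = Statistics.colStats(vectors)
    println(" > mean: " + colStats.mean)
    println(" > variance: " + colStats.variance)
    sc.stop()
  }
}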
Example 70
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr import java.net.{ConnectException, SocketException} import java.util import org.apache.solr.client.solrj.impl.CloudSolrServer import org.apache.solr.client.solrj.request.UpdateRequest import org.apache.solr.common.{SolrException, SolrInputDocument} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream object SolRSupport { def indexDStreamOfDocs(zkHost:String, collection:String, batchSize:Int, docDStream:DStream[SolrInputDocument]): Unit ={ docDStream.foreachRDD(docRdd => { indexDoc(zkHost, collection, batchSize, docRdd) }) } def indexDoc(zkHost:String, collection:String, batchSize:Int, docRdd:RDD[SolrInputDocument]): Unit = { docRdd.foreachPartition(it => { val solrServer = CloudSolRServerBuilder.build(zkHost) val batch = new util.ArrayList[SolrInputDocument]() while (it.hasNext) { val inputDoc = it.next() batch.add(inputDoc) if (batch.size() >= batchSize) sendBatchToSolr(solrServer, collection, batch) } if (!batch.isEmpty()) sendBatchToSolr(solrServer, collection, batch) }) } def sendBatchToSolr( solrServer: CloudSolrServer, collection:String, batch:util.Collection[SolrInputDocument]) { val req = new UpdateRequest() req.setParam("collection", collection) req.add(batch) try { solrServer.request(req) } catch { case e:Exception => { if (shouldRetry(e)) { try { Thread.sleep(2000) } catch { case e1: InterruptedException => { Thread.interrupted() } } try { solrServer.request(req) } catch { case e1: Exception => { if (e1.isInstanceOf[RuntimeException]) { throw e1.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e1) } } } } else { if (e.isInstanceOf[RuntimeException]) { throw e.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e) } } } } finally { batch.clear() } } def shouldRetry( exc:Exception): Boolean = { val rootCause = SolrException.getRootCause(exc) rootCause.isInstanceOf[ConnectException] || rootCause.isInstanceOf[SocketException] } }
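A hedged usage sketch for indexDoc above. It assumes a reachable SolrCloud ensemble and an existing collection; the ZooKeeper address and collection name below are placeholders.

import com.hadooparchitecturebook.taxi360.streaming.ingestion.solr.SolRSupport
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.{SparkConf, SparkContext}

object SolrIndexSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SolrIndexSketch").setMaster("local[*]"))
    val docs = sc.parallelize(1 to 100).map { i =>
      val doc = new SolrInputDocument()
      doc.addField("id", i.toString)
      doc.addField("text_t", s"document $i")
      doc
    }
    // Placeholder ZooKeeper address and collection name
    SolRSupport.indexDoc("zk1:2181/solr", "example_collection", 50, docs)
    sc.stop()
  }
}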
Example 71
Source File: HBaseSQLTableScan.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._

@DeveloperApi
case class HBaseSQLTableScan(
    relation: HBaseRelation,
    output: Seq[Attribute],
    result: RDD[Row]) extends LeafNode {

  override def outputPartitioning = {
    var ordering = List[SortOrder]()
    for (key <- relation.partitionKeys) {
      ordering = ordering :+ SortOrder(key, Ascending)
    }
    RangePartitioning(ordering.toSeq, relation.partitions.size)
  }

  override protected def doExecute(): RDD[Row] = result
}
Example 72
Source File: HBaseShuffledRDD.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.spark._ import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition} class HBaseShuffledRDD ( prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])], part: Partitioner, @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){ override def getPartitions: Array[Partition] = { if (hbPartitions==null || hbPartitions.isEmpty) { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } else { // only to be invoked by clients hbPartitions.toArray } } override def getPreferredLocations(split: Partition): Seq[String] = { if (hbPartitions==null || hbPartitions.isEmpty) { Seq.empty } else { split.asInstanceOf[HBasePartition].server.map { identity[String] }.toSeq } } }
Example 73
Source File: RDFS11.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object RDFS11 { def transitive(rdd:RDD[(String, String)]) = { var rddTuple = rdd val reverseTuple = rddTuple.map(x => (x._2, x._1)) var cur = 0L var pre = rddTuple.count var flag = true while (flag) { val joined = reverseTuple.join(rddTuple) val res = joined.map(x => x._2) rddTuple = rddTuple.union(res).distinct cur = rddTuple.count if(pre == cur) flag = false pre = cur } rddTuple } def main(args: Array[String]): Unit = { if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS11").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) var subClass = triples.filter(x => x._2.equals("rdfs:subClassOf")).map(x => (x._1, x._3)) subClass = transitive(subClass) subClass.foreach(x => println(x)) subClass.saveAsTextFile(outputPath) } }
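The same fixed-point loop, shown as a self-contained sketch on an in-memory edge list instead of a file, so the join/union/distinct iteration is easier to follow:

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object TransitiveClosureSketch {
  // Same structure as transitive() above: derive (a, c) from (a, b) and (b, c) until no new pairs appear
  def transitive(edges: RDD[(String, String)]): RDD[(String, String)] = {
    var closure = edges
    var previousCount = closure.count()
    var converged = false
    while (!converged) {
      // join (b, a) with (b, c) to derive (a, c)
      val derived = closure.map(_.swap).join(closure).map(_._2)
      closure = closure.union(derived).distinct()
      val currentCount = closure.count()
      converged = currentCount == previousCount
      previousCount = currentCount
    }
    closure
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TransitiveClosureSketch").setMaster("local[2]"))
    val subClassOf = sc.parallelize(Seq(("Car", "Vehicle"), ("Vehicle", "Thing"), ("Bike", "Vehicle")))
    // Adds ("Car", "Thing") and ("Bike", "Thing") to the original edges
    transitive(subClassOf).collect().foreach(println)
    sc.stop()
  }
}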
Example 74
Source File: RDFS5.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object RDFS5 { def transitive(rdd:RDD[(String, String)]) = { var rddTuple = rdd val reverseTuple = rddTuple.map(x => (x._2, x._1)) var cur = 0L var pre = rddTuple.count var flag = true while (flag) { val joined = reverseTuple.join(rddTuple) val res = joined.map(x => x._2) rddTuple = rddTuple.union(res).distinct cur = rddTuple.count if(pre == cur) flag = false pre = cur } rddTuple } def main(args: Array[String]): Unit = { if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS5").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) var subProp = triples.filter(x => x._2.equals("rdfs:subPropertyOf")).map(x => (x._1, x._3)) subProp = transitive(subProp) subProp.foreach(x => println(x)) subProp.saveAsTextFile(outputPath) } }
Example 75
Source File: DFConverter.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import com.twosigma.flint.rdd.OrderedRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType

object DFConverter {
  def newDataFrame(df: DataFrame): DataFrame = {
    new DataFrame(df.sparkSession, df.logicalPlan, RowEncoder(df.schema))
  }

  def toDataFrame(rdd: OrderedRDD[Long, InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    val internalRows = rdd.values
    spark.internalCreateDataFrame(internalRows, schema)
  }

  def toDataFrame(rdd: RDD[InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    spark.internalCreateDataFrame(rdd, schema)
  }
}
Example 76
Source File: WeightedLabeledPoint.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double]) {

  def generateSampleData(sc: SparkContext,
    weights: DenseVector[Double],
    intercept: Double,
    numRows: Long = 100L,
    numPartitions: Int = 4,
    errorScalar: Double = 1.0,
    seed: Long = 1L): RDD[WeightedLabeledPoint] = {
    val len = weights.length + 2
    // The last entry serves as the weight of the point and the second-to-last entry
    // serves as the noise added to the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed)
    data.map { d =>
      val fw = d.toArray
      val x = new DenseVector(fw.dropRight(2))
      WeightedLabeledPoint(
        weights.dot(x) + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5,
        x
      )
    }
  }
}
Example 77
Source File: OLSMultipleLinearRegression.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression import org.apache.spark.rdd.RDD import breeze.linalg.{ DenseMatrix, DenseVector } object OLSMultipleLinearRegression { def regression(input: RDD[WeightedLabeledPoint], intercept: Boolean = true): LinearRegressionModel = { // Try to get the number of columns val nCols = if (intercept) { input.first.features.length + 1 } else { input.first.features.length } val (xx, xy, swx, srwsl, ssrw, wsl, sw, n, lw) = input.treeAggregate(( new DenseMatrix[Double](nCols, nCols), // 1. Calculate a k-by-k matrix X^TX. new DenseVector[Double](nCols), // 2. Calculate a k-dimension vector X^Ty. new DenseVector[Double](nCols), // 3. Calculate a k-dimension vector of weighted sum of X. 0.0, // 4. Calculate the square root weighted sum of labels. 0.0, // 5. Calculate the sum of square root of weights. 0.0, // 6. Calculate the weighted sum of labels. 0.0, // 7. Calculate the sum of weights. 0: Long, // 8. Calculate the length of input. 0.0 // 9. Calculate sum of log weights ))( // U is a pair of matrix and vector and v is a WeightedLabeledPoint. seqOp = (U, v) => { // Append 1.0 at the head for calculating intercept. val x = if (intercept) { DenseVector.vertcat(DenseVector(1.0), v.features) } else { v.features } val wx = x * v.weight val sqrtW = Math sqrt v.weight // Unfortunately, breeze.linalg.DenseVector does not support tensor product. (U._1 += wx.asDenseMatrix.t * x.asDenseMatrix, U._2 += wx * v.label, U._3 += wx, U._4 + v.label * sqrtW, U._5 + sqrtW, U._6 + v.label * v.weight, U._7 + v.weight, U._8 + 1, U._9 + math.log(v.weight)) }, combOp = (U1, U2) => ( U1._1 += U2._1, U1._2 += U2._2, U1._3 += U2._3, U1._4 + U2._4, U1._5 + U2._5, U1._6 + U2._6, U1._7 + U2._7, U1._8 + U2._8, U1._9 + U2._9 ) ) LinearRegressionModel(input, intercept, n, (xx + xx.t) :/ 2.0, xy, swx, srwsl, ssrw, wsl, sw, lw) } }
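A minimal sketch of calling the regression above on a handful of hand-built WeightedLabeledPoint rows. The data is synthetic, generated from y = 2*x1 - 3*x2 + 1 with unit weights; inspecting the fit goes through LinearRegressionModel, which is defined elsewhere in the project.

import breeze.linalg.DenseVector
import com.twosigma.flint.math.stats.regression.{OLSMultipleLinearRegression, WeightedLabeledPoint}
import org.apache.spark.{SparkConf, SparkContext}

object OlsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("OlsSketch").setMaster("local[2]"))
    // Exact observations of y = 2*x1 - 3*x2 + 1, all with weight 1.0
    val points = sc.parallelize(Seq(
      WeightedLabeledPoint(label = 3.0,  weight = 1.0, features = DenseVector(1.0, 0.0)),
      WeightedLabeledPoint(label = -2.0, weight = 1.0, features = DenseVector(0.0, 1.0)),
      WeightedLabeledPoint(label = 0.0,  weight = 1.0, features = DenseVector(1.0, 1.0)),
      WeightedLabeledPoint(label = 1.0,  weight = 1.0, features = DenseVector(0.0, 0.0))
    ))
    // Fits an intercept by default (intercept = true)
    val model = OLSMultipleLinearRegression.regression(points)
    println(model)
    sc.stop()
  }
}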
Example 78
Source File: PartitionsIterator.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import grizzled.slf4j.Logger import org.apache.spark.rdd.RDD import org.apache.spark.{ Partition, TaskContext } protected[flint] object PartitionsIterator { val logger = Logger(PartitionsIterator.getClass) def apply[T]( rdd: RDD[T], partitions: Seq[Partition], context: TaskContext, preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed. ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering) } def headPartitionIndex: Int = curPart.index }
Example 79
Source File: TreeReduce.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd.function.summarize import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object TreeReduce { def apply[T: ClassTag]( rdd: RDD[T] )( f: (T, T) => T, depth: Int = 2 ): T = { require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") val reducePartition: Iterator[T] => Option[T] = iter => { if (iter.hasNext) { Some(iter.reduceLeft(f)) } else { None } } val partiallyReduced = rdd.mapPartitions(it => Iterator(reducePartition(it))) val op: (Option[T], Option[T]) => Option[T] = (c, x) => { if (c.isDefined && x.isDefined) { Some(f(c.get, x.get)) } else if (c.isDefined) { c } else if (x.isDefined) { x } else { None } } TreeAggregate(partiallyReduced)(Option.empty[T], op, op, depth).getOrElse( sys.error("Empty collection.") ) } }
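A usage sketch, assuming the caller can see this package: the result matches a plain reduce, only the combining is arranged as a tree of the given depth.

import com.twosigma.flint.rdd.function.summarize.TreeReduce
import org.apache.spark.{SparkConf, SparkContext}

object TreeReduceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TreeReduceSketch").setMaster("local[4]"))
    val numbers = sc.parallelize(1 to 1000, numSlices = 8)
    // Same result as numbers.reduce(_ + _), but partial results are combined tree-wise
    val sum = TreeReduce(numbers)(_ + _, depth = 3)
    println(sum) // 500500
    sc.stop()
  }
}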
Example 80
Source File: PythonUtils.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import com.twosigma.flint.timeseries.{ TimeSeriesRDD, TimeSeriesRDDImpl } import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ import org.apache.spark.sql.{ CatalystTypeConvertersWrapper, Row } private[rdd] case class SchemaColumnInfo(idx: Int, clazz: Class[_ <: Ordered[_]], dataType: DataType) case class TimeSeriesRDDWithSchema(rdd: TimeSeriesRDDImpl, schema: StructType) object PythonUtils { def fromUnsortedRDD( sc: SparkContext, rdd: RDD[Row], schema: StructType, keyColumn: String ): TimeSeriesRDDImpl = { val orderedRdd = OrderedRDD.fromRDD(formatRDD[Long](rdd, schema, keyColumn), KeyPartitioningType.UnSorted) TimeSeriesRDD.fromOrderedRDD(orderedRdd, schema).asInstanceOf[TimeSeriesRDDImpl] } def toOrderedRDD( rdd: RDD[Row], schema: StructType, keyColumn: String, ranges: Seq[CloseOpen[Long]] ): OrderedRDD[Long, InternalRow] = { val keyIdx = schema.fieldIndex(keyColumn) val converter = CatalystTypeConvertersWrapper.toCatalystRowConverter(schema) OrderedRDD.fromRDD(rdd.map(row => (row.getAs[Long](keyIdx), converter(row))), ranges) } }
Example 81
Source File: TimeSeriesRDDConversionSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import java.util.concurrent.TimeUnit import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ SQLContext, DataFrame, Row } import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => ExternalRow } import org.scalatest.tagobjects.Slow class TimeSeriesRDDConversionSpec extends TimeSeriesSuite { // The largest prime < 100 override val defaultPartitionParallelism = 97 // The 10000-th prime. private val defaultNumRows = 104729 private def createDataFrame(isSorted: Boolean = true)(implicit sqlContext: SQLContext): DataFrame = { val n = defaultNumRows val schema = Schema("value" -> DoubleType) val rdd: RDD[Row] = sqlContext.sparkContext.parallelize(1 to n, defaultPartitionParallelism).map { i => val data: Array[Any] = if (isSorted) { Array((i / 100).toLong, i.toDouble) } else { Array(((i + 1 - n) / 100).toLong, i.toDouble) } new ExternalRow(data, schema) } sqlContext.createDataFrame(rdd, schema) } "TimeSeriesRDD" should "convert from a sorted DataFrame correctly" taggedAs (Slow) in { implicit val _sqlContext = sqlContext (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = true, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = false))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF( createDataFrame(isSorted = false).sort("time") )( isSorted = true, TimeUnit.NANOSECONDS ) assert(tsRdd.count() == defaultNumRows) } } }
Example 82
Source File: ParallelCollectionRDD.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag

case class ParallelCollectionRDDPartition[T: ClassTag](
  override val index: Int,
  values: Seq[T]
) extends Partition

class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[ParallelCollectionRDDPartition[T]].values.iterator

  override protected def getPartitions: Array[Partition] =
    data.zipWithIndex.map {
      case (d, index) => ParallelCollectionRDDPartition(index, d)
    }.toArray
}
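A usage sketch: each inner Seq becomes exactly one partition, which is the point of this RDD compared to sc.parallelize.

import com.twosigma.flint.rdd.ParallelCollectionRDD
import org.apache.spark.{SparkConf, SparkContext}

object ParallelCollectionRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ParallelCollectionRDDSketch").setMaster("local[2]"))
    // Three inner sequences, so exactly three partitions (one of them empty)
    val rdd = new ParallelCollectionRDD(sc, Seq(Seq(1, 2, 3), Seq(4, 5), Seq.empty[Int]))
    println(rdd.getNumPartitions)     // 3
    println(rdd.collect().toList)     // List(1, 2, 3, 4, 5)
    sc.stop()
  }
}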
Example 83
Source File: OverlappedOrderedRDDSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import com.twosigma.flint.SharedSparkContext import org.apache.spark.rdd.RDD import org.scalatest.FlatSpec class OverlappedOrderedRDDSpec extends FlatSpec with SharedSparkContext { val numSlices: Int = 3 val sliceLength: Int = 4 var rdd: RDD[(Int, Int)] = _ var orderedRdd: OrderedRDD[Int, Int] = _ var overlappedOrderedRdd: OverlappedOrderedRDD[Int, Int] = _ private def window(t: Int): (Int, Int) = (t - 2, t) override def beforeAll() { super.beforeAll() val s = sliceLength rdd = sc.parallelize(0 until numSlices, numSlices).flatMap { i => (1 to s).map { j => i * s + j } }.map { x => (x, x) } orderedRdd = OrderedRDD.fromRDD(rdd, KeyPartitioningType.Sorted) overlappedOrderedRdd = OverlappedOrderedRDD(orderedRdd, window) } "The OverlappedOrderedRDD" should "be constructed from `OrderedRDD` correctly" in { assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep) val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, x) } assert(overlappedOrderedRdd.collect().deep == benchmark.deep) } it should "be able to remove overlapped rows to get an `OrderedRDD` correctly" in { assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep) assert(overlappedOrderedRdd.nonOverlapped().collect().deep == orderedRdd.collect().deep) } it should "`mapPartitionsWithIndexOverlapped` correctly" in { val mapped = overlappedOrderedRdd.mapPartitionsWithIndexOverlapped( (index, iterator) => iterator.map { case (k, v) => (k, v * 2) } ) val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, 2 * x) } assert(mapped.collect().deep == benchmark.deep) } }
Example 84
Source File: RDDKafkaWriter.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.{Callback, ProducerRecord}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// The enclosing class declaration was missing from this snippet; the header below is a
// minimal reconstruction (a thin wrapper around the RDD that gains writeToKafka) and may
// differ in detail from the project source.
class RDDKafkaWriter[T: ClassTag](@transient private val rdd: RDD[T])
  extends KafkaWriter[T] with Serializable {

  override def writeToKafka[K, V](
      producerConfig: Map[String, Object],
      transformFunc: T => ProducerRecord[K, V],
      callback: Option[Callback] = None
  ): Unit =
    rdd.foreachPartition { partition =>
      val producer = KafkaProducerCache.getProducer[K, V](producerConfig)
      partition
        .map(transformFunc)
        .foreach(record => producer.send(record, callback.orNull))
    }
}
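A hedged usage sketch, assuming the library's usual entry point (importing KafkaWriter._ exposes writeToKafka on plain RDDs, mirroring the DStream usage in the next example). The broker address, serializers and topic name are illustrative.

import com.github.benfradet.spark.kafka.writer.KafkaWriter._
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.{SparkConf, SparkContext}

object RddWriteSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-kafka-sketch"))
    // Producer settings are illustrative; point them at a real broker before running.
    val producerConfig = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.serializer" -> classOf[StringSerializer].getName,
      "value.serializer" -> classOf[StringSerializer].getName
    )
    val rdd = sc.parallelize(Seq("a", "b", "c"))
    // Each element becomes one Kafka record on the (hypothetical) "events" topic.
    rdd.writeToKafka(producerConfig, s => new ProducerRecord[String, String]("events", s))
    sc.stop()
  }
}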
Example 85
Source File: DStreamKafkaWriterSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import scala.collection.mutable import scala.concurrent.duration._ class DStreamKafkaWriterSpec extends SKRSpec { "a DStreamKafkaWriter" when { "given a dstream" should { "write its content to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s) ) val results = collect(ssc, localTopic) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { results shouldBe msgs } } "trigger a given callback for every write to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s), Some(new Callback with Serializable { override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = { SKRSpec.callbackTriggerCount.incrementAndGet() } }) ) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { SKRSpec.callbackTriggerCount.get() shouldBe msgs.size } } } } private def createDStream(seq: Seq[String]): DStream[String] = { val q = mutable.Queue.empty[RDD[String]] q.enqueue(ssc.sparkContext.makeRDD(seq)) ssc.queueStream(q) } }
Example 86
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.joda.time.DateTime import scala.concurrent.duration._ object StreamingExample extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val ssc = new StreamingContext(conf, Seconds(1)) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create DStream of Influx points val queue = new scala.collection.mutable.Queue[RDD[Point]] val queueStream: DStream[Point] = ssc.queueStream(queue) // Add single RDD with a single Influx point to the DStream queue.enqueue(ssc.sparkContext.parallelize(Seq(point1))) // Save DStream to Influx queueStream.saveToInflux() // Start Spark streaming ssc.start() ssc.awaitTermination() }
Example 87
Source File: Example.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.joda.time.DateTime import scala.concurrent.duration._ object Example extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val sc = new SparkContext(conf) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create RDD with Influx point val rdd: RDD[Point] = sc.parallelize(Seq(point1)) // Save RDD to Influx rdd.saveToInflux() // Stop Spark context sc.stop() }
Example 88
Source File: PointRDDExtensions.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.extensions import com.pygmalios.reactiveinflux.spark.config.ReactiveInfluxSparkConfig import com.pygmalios.reactiveinflux.spark.{RDDExtensions, _} import com.pygmalios.reactiveinflux.{PointNoTime, ReactiveInfluxDbName} import org.apache.spark.rdd.RDD import org.slf4j.LoggerFactory import scala.concurrent.duration.Duration private[spark] class PointRDDExtensions[+T <: PointNoTime](rdd: RDD[T]) extends RDDExtensions[T] { import PointRDDExtensions._ override def saveToInflux()(implicit reactiveInfluxDbName: ReactiveInfluxDbName, awaitAtMost: Duration): Unit = { // Process each partition separately totalBatchCount = 0 totalPointCount = 0 rdd.foreachPartition { partition => withInflux { db => val batchSize = ReactiveInfluxSparkConfig(db.config).sparkBatchSize // Write points in batches var batchCount = 0 var pointCount = 0 partition.sliding(batchSize, batchSize).foreach { batch => // Write single batch db.write(batch) // Statistics for logging batchCount += 1 pointCount += batch.size } totalBatchCount += batchCount totalPointCount += pointCount log.debug(s"Partition with $pointCount points written to Influx in $batchCount batches.") } } log.info(s"RDD with ${rdd.partitions.size} partitions and $totalPointCount points written to Influx in $totalBatchCount batches.") } } object PointRDDExtensions { private val log = LoggerFactory.getLogger(classOf[PointRDDExtensions[_]]) // This makes sense for testing purposes only private[reactiveinflux] var totalBatchCount = 0 private[reactiveinflux] var totalPointCount = 0 }
Example 89
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingExample extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999) val sequences: DStream[Array[Array[Int]]] = rawSequences .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 90
Source File: MSNBCPatternMining.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object MSNBCPatternMining extends App { val conf = new SparkConf() .setAppName("MSNBC.com data pattern mining") .setMaster("local[4]") val sc = new SparkContext(conf) val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A"))) val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5) fp.run(transactionTest) val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } // NOTE: Caching data is recommended val uniqueTransactions: RDD[Array[Int]] = transactions.map(_.distinct).cache() val fpGrowth = new FPGrowth().setMinSupport(0.01) val model = fpGrowth.run(uniqueTransactions) val count = uniqueTransactions.count() model.freqItemsets.collect().foreach { itemset => if (itemset.items.length >= 3) println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble ) } val rules = model.generateAssociationRules(confidence = 0.4) rules.collect().foreach { rule => println("[" + rule.antecedent.mkString(",") + "=>" + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0) } val frontPageConseqRules = rules.filter(_.consequent.head == 1) frontPageConseqRules.count frontPageConseqRules.filter(_.antecedent.contains(2)).count rules.filter(_.antecedent.contains(7)).count val sequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(sequences) psModel.freqSequences.map(fs => (fs.sequence.length, 1)) .reduceByKey(_ + _) .sortByKey() .collect() .foreach(fs => println(s"${fs._1}: ${fs._2}")) psModel.freqSequences .map(fs => (fs.sequence.length, fs)) .groupByKey() .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2)) .map(_.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) .collect.foreach(println) psModel.freqSequences .map(fs => (fs.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"), 1)) .reduceByKey(_ + _) .reduce( (f1, f2) => if (f1._2 > f2._2) f1 else f2 ) psModel.freqSequences.reduce( (f1, f2) => if (f1.freq > f2.freq) f1 else f2 ) psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println) psModel.freqSequences.collect().foreach { freqSequence => println( freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq ) } }
Example 91
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingAdvanced extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999) val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": ")) .map(kv => (kv(0).toInt, kv(1))) val countIds = events.map(e => (e._1, 1)) val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _) def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { Some(runningCount.getOrElse(0) + newValues.sum) } val runningCounts = countIds.updateStateByKey[Int](updateFunction _) val duration = Seconds(20) val slide = Seconds(10) val rawSequences: DStream[(Int, String)] = events .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide) val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2) .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 92
Source File: GraphFromRdd.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphFromRdd extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) friendGraph.vertices.collect.foreach(println) friendGraph.edges.map( e => e.srcId > e.dstId ).count() val mappedEdgeGraph: Graph[String, Boolean] = friendGraph.mapEdges( e => e.srcId > e.dstId ) val inDegVertexRdd: VertexRDD[Int] = friendGraph.aggregateMessages[Int]( sendMsg = ec => ec.sendToDst(1), mergeMsg = (msg1, msg2) => msg1+msg2 ) assert(inDegVertexRdd.collect.deep == friendGraph.inDegrees.collect.deep) friendGraph.staticPageRank(numIter = 10).vertices.collect.foreach(println) friendGraph.pageRank(tol = 0.0001, resetProb = 0.15) }
Example 93
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} //import org.graphframes._ object GraphFramesExample extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) // val friendGraphFrame = GraphFrame.fromGraphX(friendGraph) // // friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter( // "e1.attr = 'trusts' OR v3.attr = 'Chris'" // ).collect.foreach(println) }
Example 94
Source File: GephiApp.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import java.io.PrintWriter import com.github.maxpumperla.ml_spark.utils.Gephi.toGexf import org.apache.spark._ import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD object GephiApp extends App { val conf = new SparkConf() .setAppName("Gephi Test Writer") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val graph: Graph[String, String] = Graph(vertices, edges) val pw = new PrintWriter("./graph.gexf") pw.write(toGexf(graph)) pw.close() }
Example 95
Source File: DCollectionGenProperties.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.scalatest import org.apache.spark.rdd.RDD import org.scalatest.PropSpecLike import org.scalatest.prop.GeneratorDrivenPropertyChecks trait DCollectionGenProperties[DColl[_]] extends PropSpecLike with GeneratorDrivenPropertyChecks with DCollectionGen with KontextfreiSpec[DColl] { property("Can get arbitrary DCollections") { forAll { xs: DColl[String] => ops.count(xs) === ops.collectAsArray(xs).length } } } class DCollectionGenStreamSpec extends DCollectionGenProperties[Stream] with StreamSpec class DCollectionGenRDDSpec extends DCollectionGenProperties[RDD] with RDDSpec
Example 96
Source File: CollectingInstancesProperties.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.scalatest import org.apache.spark.rdd.RDD import org.scalatest.enablers.Collecting import org.scalatest.{Inspectors, PropSpec, PropSpecLike} import org.scalatest.prop.GeneratorDrivenPropertyChecks trait CollectingInstancesProperties[DColl[_]] extends PropSpecLike with GeneratorDrivenPropertyChecks with KontextfreiSpec[DColl] with CollectingInstances { property("There is a Collecting instance for DCollection") { forAll { (xs: List[String]) => val dcoll = ops.unit(xs) Inspectors.forAll(dcoll) { x => assert(xs.contains(x)) } } } property( "Collecting nature of DCollection returns the original size of the input sequence") { forAll { (xs: List[String]) => val dcoll = ops.unit(xs) assert( implicitly[Collecting[String, DColl[String]]] .sizeOf(dcoll) === xs.size) } } property( "Collecting nature of DCollection returns the Some loneElement if input sequence has exactly one element") { forAll { (x: String) => val dcoll = ops.unit(List(x)) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) === Some(x)) } } property( "Collecting nature of DCollection returns the None as loneElement if input sequence as more than one element") { forAll { (xs: List[String]) => whenever(xs.size > 1) { val dcoll = ops.unit(xs) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) .isEmpty) } } } property( "Collecting nature of DCollection returns the None as loneElement if input sequence is empty") { val dcoll = ops.unit(List.empty[String]) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) .isEmpty) } } class CollectionInstancesStreamSpec extends CollectingInstancesProperties[Stream] with StreamSpec class CollectionInstancesRDDSpec extends CollectingInstancesProperties[RDD] with RDDSpec
Example 97
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
Example 98
Source File: RDDOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[kontextfrei] trait RDDOrderedFunctions extends DCollectionOrderedFunctions[RDD] { this: RDDBase => override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending) } override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending, numPartitions) } override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) { _.filterByRange(lower, upper) } override def repartitionAndSortWithinPartitions[ A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])( partitioner: Partitioner) : RDD[(A, B)] = withSite(x) { _.repartitionAndSortWithinPartitions(partitioner) } }
Example 99
Source File: RDDCollectionOpsSpec.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei import com.danielwestheide.kontextfrei.rdd.RDDOpsSupport import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll class RDDCollectionOpsSpec extends DCollectionOpsProperties[RDD] with BeforeAndAfterAll { implicit val sparkContext = new SparkContext("local[2]", "dcollection-spec") override implicit val ops: DCollectionOps[RDD] = RDDOpsSupport.rddCollectionOps override protected def afterAll(): Unit = { sparkContext.stop() } }
Example 100
Source File: TSNEHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg._ import breeze.stats._ import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.rdd.RDD object TSNEHelper { // p_ij = (p_{i|j} + p_{j|i}) / 2n def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = { p_ji.entries .flatMap(e => Seq( ((e.i.toInt, e.j.toInt), e.value), ((e.j.toInt, e.i.toInt), e.value) )) .reduceByKey(_ + _) // p + p' .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n .groupByKey() } def update(Y: DenseMatrix[Double], dY: DenseMatrix[Double], iY: DenseMatrix[Double], gains: DenseMatrix[Double], iteration: Int, param: TSNEParam): DenseMatrix[Double] = { import param._ val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum gains.foreachPair { case ((i, j), old_gain) => val new_gain = math.max(min_gain, if ((dY(i, j) > 0.0) != (iY(i, j) > 0.0)) old_gain + 0.2 else old_gain * 0.8 ) gains.update(i, j, new_gain) val new_iY = momentum * iY(i, j) - eta * new_gain * dY(i, j) iY.update(i, j, new_iY) Y.update(i, j, Y(i, j) + new_iY) // Y += iY } val t_Y: DenseVector[Double] = mean(Y(::, *)).t val y_sub = Y(*, ::) Y := y_sub - t_Y } }
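A brief, hedged sketch of calling computeP on a tiny CoordinateMatrix of conditional affinities; the entries and names below are made up for illustration.

import com.github.saurfang.spark.tsne.TSNEHelper
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.{SparkConf, SparkContext}

object TSNEHelperSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("tsne-p-sketch"))
    // Conditional affinities p_{j|i} for 3 points; computeP emits both (i, j) and (j, i)
    // for every entry, so the result is the symmetrised p_ij = (p_{i|j} + p_{j|i}) / 2n.
    val entries = sc.parallelize(Seq(
      MatrixEntry(0, 1, 0.4),
      MatrixEntry(0, 2, 0.1),
      MatrixEntry(1, 2, 0.3)
    ))
    val p_ji = new CoordinateMatrix(entries, 3, 3)
    val p = TSNEHelper.computeP(p_ji, n = 3)
    p.collect().foreach { case (i, neighbours) =>
      println(s"row $i -> ${neighbours.mkString(", ")}")
    }
    sc.stop()
  }
}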
Example 101
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD object MainRunner { def main(args: Array[String]) { // Create spark context val appName="kBC" val sparkMode="local" val conf = new SparkConf().setAppName(appName).setMaster(sparkMode); val sc = new SparkContext(conf); // Create sample graph // // Create an RDD for vertices val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges val relationships: RDD[Edge[String]] = sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"))) // Define a default user in case there are relationship with missing user val defaultUser = ("John Doe", "Missing") // Build the initial Graph val graph = Graph(users, relationships, defaultUser) val kBCGraph = KBetweenness.run(graph, 3) } }
Example 102
Source File: TiRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv._ import com.pingcap.tikv.exception.TiInternalException import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tikv.types.Converter import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.util.RangeSplitter.RegionTask import com.pingcap.tispark.{TiPartition, TiTableReference} import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer abstract class TiRDD( val dagRequest: TiDAGRequest, val physicalId: Long, val tiConf: TiConfiguration, val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends RDD[InternalRow](sparkSession.sparkContext, Nil) { private lazy val partitionPerSplit = tiConf.getPartitionPerSplit protected def checkTimezone(): Unit = { if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) { throw new TiInternalException( "timezone are different! driver: " + tiConf.getLocalTimeZone + " executor:" + Converter.getLocalTimezone + " please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions") } } override protected def getPartitions: Array[Partition] = { val keyWithRegionTasks = RangeSplitter .newSplitter(session.getRegionManager) .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType) val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]] with mutable.MultiMap[String, RegionTask] var index = 0 val result = new ListBuffer[TiPartition] for (task <- keyWithRegionTasks) { hostTasksMap.addBinding(task.getHost, task) val tasks = hostTasksMap(task.getHost) if (tasks.size >= partitionPerSplit) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 hostTasksMap.remove(task.getHost) } } // add rest for (tasks <- hostTasksMap.values) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 } result.toArray } override protected def getPreferredLocations(split: Partition): Seq[String] = split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil }
Example 103
Source File: BasicDataSourceSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import com.pingcap.tikv.exception.TiBatchWriteException import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") { private val row1 = Row(null, "Hello") private val row2 = Row(2, "TiDB") private val row3 = Row(3, "Spark") private val row4 = Row(4, null) private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')") } test("Test Select") { if (!supportBatchWrite) { cancel } testTiDBSelect(Seq(row1, row2)) } test("Test Write Append") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() testTiDBSelect(Seq(row1, row2, row3, row4)) } test("Test Write Overwrite") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) val caught = intercept[TiBatchWriteException] { df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("overwrite") .save() } assert( caught.getMessage .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append.")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 104
Source File: UpperCaseColumnNameSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class UpperCaseColumnNameSuite extends BaseDataSourceTest("test_datasource_uppser_case_column_name") { private val row1 = Row(1, 2) private val schema = StructType( List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s""" |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL, | O_CUSTKEY INTEGER NOT NULL); """.stripMargin) } test("Test insert upper case column name") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row1)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 105
Source File: MissingParameterSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") { private val row1 = Row(null, "Hello") private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) test("Missing parameter: database") { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") val caught = intercept[IllegalArgumentException] { val rows = row1 :: Nil val data: RDD[Row] = sc.makeRDD(rows) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("table", table) .mode("append") .save() } assert( caught.getMessage .equals("requirement failed: Option 'database' is required.")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 106
Source File: OnlyOnePkSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") { private val row3 = Row(3) private val row4 = Row(4) private val schema = StructType(List(StructField("i", IntegerType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int primary key)") } test("Test Write Append") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() testTiDBSelect(Seq(row3, row4)) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 107
Source File: WriteDDLConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import com.pingcap.tikv.exception.TiBatchWriteException import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteDDLConflictSuite extends ConcurrencyTest { test("write ddl conflict using TableLock") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)") } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } test("write ddl conflict using SchemaVersionCheck") { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleepBeforeQuery) jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)") } }).start() val caught = intercept[TiBatchWriteException] { val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterPrewriteSecondaryKey", sleepBeforeQuery * 2) .option("useTableLock", "false") .mode("append") .save() } assert(caught.getMessage.equals("schema has changed during prewrite!")) } }
Example 108
Source File: WriteDDLNotConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteDDLNotConflictSuite extends ConcurrencyTest { test("ddl after GetCommitTS: add column") { doTest(s"alter table $dbtable ADD Email varchar(255)") } test("ddl after GetCommitTS: delete column") { doTest(s"alter table $dbtable drop column s") } test("ddl after GetCommitTS: rename column") { doTest(s"alter table $dbtable CHANGE s s2 varchar(128)") } test("ddl after GetCommitTS: change column type") { doTest(s"alter table $dbtable CHANGE i i BIGINT") } private def doTest(ddl: String): Unit = { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleepBeforeQuery) jdbcUpdate(ddl) } }).start() val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterGetCommitTS", sleepBeforeQuery * 2) .option("useTableLock", "false") .mode("append") .save() compareSelect() } }
Example 109
Source File: WriteWriteConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteWriteConflictSuite extends ConcurrencyTest { test("write write conflict using TableLock & jdbc") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { jdbcUpdate(s"insert into $dbtable values(5, 'test')") } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } test("write write conflict using TableLock & tispark") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { val data: RDD[Row] = sc.makeRDD(List(row5)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("useTableLock", "true") .mode("append") .save() } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } }
Example 110
Source File: LockTimeoutSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.ttl import com.pingcap.tikv.TTLManager import com.pingcap.tikv.exception.GrpcException import com.pingcap.tispark.datasource.BaseDataSourceTest import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") { private val row1 = Row(1, "Hello") private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") } test("Test Lock TTL Timeout") { if (!supportTTLUpdate) { cancel } val seconds = 1000 val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds val data: RDD[Row] = sc.makeRDD(List(row1)) val df = sqlContext.createDataFrame(data, schema) new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleep1) queryTiDBViaJDBC(s"select * from $dbtable") } }).start() val grpcException = intercept[GrpcException] { df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterPrewritePrimaryKey", sleep2) .mode("append") .save() } assert(grpcException.getMessage.equals("retry is exhausted.")) assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed")) assert( grpcException.getCause.getCause.getMessage.startsWith( "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 111
Source File: EmployeeRelationship.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.graphx import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.graphx.{ Edge, Graph } object EmployeeRelationship { def main(args: Array[String]): Unit = { // vertex format: vertex_id, data val vertexArray = Array( (1L, ("John", "Software Developer")), (2L, ("Robert", "Technical Leader")), (3L, ("Charlie", "Software Architect")), (4L, ("David", "Software Developer")), (5L, ("Edward", "Software Development Manager")), (6L, ("Francesca", "Software Development Manager"))) // edge format: from_vertex_id, to_vertex_id, data val edgeArray = Array( Edge(2L, 1L, "Technical Mentor"), Edge(2L, 4L, "Technical Mentor"), Edge(3L, 2L, "Collaborator"), Edge(6L, 3L, "Team Member"), Edge(4L, 1L, "Peers"), Edge(5L, 2L, "Team Member"), Edge(5L, 3L, "Team Member"), Edge(5L, 6L, "Peers")) val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob")) val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray) val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray) val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD) // Vanilla query println(">>> Showing the names of people who are Software Developers") graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") } .collect() .foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") } // Connection analysis println(">>> People connected to Robert (Technical Leader) -> ") graph.triplets.filter(_.srcId == 2).collect() .foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) } println(">>> Robert (Technical Leader) connected to -> ") graph.triplets.filter(_.dstId == 2).collect() .foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) } println(">>> Technical Mentoring Analysis -> ") graph.triplets.filter(_.attr.equals("Technical Mentor")).collect() .foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) } } }
Example 112
Source File: PurchaseLogAnalysis.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkContext, SparkConf } import org.apache.spark.rdd.RDD object PurchaseLogAnalysis { def main(args: Array[String]): Unit = { val ctx = new SparkContext(new SparkConf().setAppName("PurchaseAnalysisJob")) val badPkts = ctx.accumulator(0, "Bad Packets") val zeroValueSales = ctx.accumulator(0, "Zero Value Sales") val missingFields = ctx.accumulator(0, "Missing Fields") val blankLines = ctx.accumulator(0, "Blank Lines") ctx.textFile("file:/media/linux-1/spark-dev/data/purchases.log", 4) .foreach { line => if (line.length() == 0) blankLines += 1 else if (line.contains("Bad data packet")) badPkts += 1 else { val fields = line.split("\t") if (fields.length != 4) missingFields += 1 else if (fields(3).toFloat == 0) zeroValueSales += 1 } } println("Purchase Log Analysis Counters:") println(s"\tBad Data Packets=${badPkts.value}") println(s"\tZero Value Sales=${zeroValueSales.value}") println(s"\tMissing Fields=${missingFields.value}") println(s"\tBlank Lines=${blankLines.value}") } }
Example 113
Source File: TestBroadcastVariables.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

// The snippet omitted the enclosing object (and its main method); a minimal wrapper is
// reconstructed here so that loadCSVFile compiles as shown.
object TestBroadcastVariables {

  def loadCSVFile(filename: String): Option[Map[String, String]] = {
    val countries = Map[String, String]()
    Try {
      val bufferedSource = Source.fromFile(filename)
      for (line <- bufferedSource.getLines) {
        val Array(country, capital) = line.split(",").map(_.trim)
        countries += country -> capital
      }
      bufferedSource.close()
      return Some(countries)
    }.toOption
  }
}
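The broadcast part of the example was not included in the snippet above. The sketch below shows, under stated assumptions, how the loaded country map could be broadcast once and read from a transformation; the object and method names here are illustrative, not the project's.

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import scala.collection.mutable.Map

object BroadcastUsageSketch {
  // Broadcast the lookup table once; each task reads it via bcLookup.value instead of
  // having the map shipped with every closure.
  def capitalsOf(sc: SparkContext, countriesCsv: String, countries: RDD[String]): RDD[String] =
    examples.TestBroadcastVariables.loadCSVFile(countriesCsv) match {
      case Some(lookup) =>
        val bcLookup: Broadcast[Map[String, String]] = sc.broadcast(lookup)
        countries.map(c => s"$c -> ${bcLookup.value.getOrElse(c, "unknown")}")
      case None => sc.emptyRDD[String]
    }
}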
Example 114
Source File: TestAccumulators.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD

// The snippet omitted the enclosing object, the first method's signature and the
// accumulator definitions; they are reconstructed minimally below (names are assumed)
// so that the shown code compiles.
object TestAccumulators {

  def usingAccumulators(sc: SparkContext, rdd: RDD[String]): Unit = {
    val totalLines = sc.accumulator(0, "Total Lines")
    val errorLines = sc.accumulator(0, "Error Lines")
    val infoLines = sc.accumulator(0, "Info Lines")
    val warnLines = sc.accumulator(0, "Warning Lines")

    rdd.foreach { line =>
      if (line.length() > 0) totalLines += 1

      if (line.startsWith("error:")) errorLines += 1
      else if (line.startsWith("info:")) infoLines += 1
      else if (line.startsWith("warn:")) warnLines += 1
    }

    println(s">>> [Using Accumulators] Total: ${totalLines.value}, Error: ${errorLines.value}, Warnings: ${warnLines.value}, Info: ${infoLines.value}")
  }

  def usingRDDTransformations(sc: SparkContext, rdd: RDD[String]): Unit = {
    val errorLines = rdd.filter(_.startsWith("error:")).count()
    val infoLines = rdd.filter(_.startsWith("info:")).count()
    val warnLines = rdd.filter(_.startsWith("warn:")).count()

    println(s">>> [Using RDD Transformations] Error: $errorLines, Warnings: $warnLines, Info: $infoLines")
  }
}
Example 115
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import scala.Iterator object TestJoins { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob")) val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache inspectRDD(x) inspectRDD(y) println(">>> joining x with y") val joinRDD = x.join(y).cache joinRDD.collect().foreach(println) inspectRDD(joinRDD) println(">>> left outer join of x with y") val leftJoin = x.leftOuterJoin(y).cache leftJoin.collect().foreach(println) inspectRDD(leftJoin) println(">>> right outer join of x with y") val rightJoin = x.rightOuterJoin(y).cache rightJoin.collect().foreach(println) inspectRDD(rightJoin) } def inspectRDD[T](rdd: RDD[T]): Unit = { println(">>> Partition length...") rdd.mapPartitions(f => Iterator(f.length), true).foreach(println) println(">>> Partition data...") rdd.foreachPartition(f => f.foreach(println)) } }
Example 116
Source File: RedisSourceRdd.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis.stream import com.redislabs.provider.redis.RedisConfig import com.redislabs.provider.redis.util.ConnectionUtils.withConnection import org.apache.spark.rdd.RDD import org.apache.spark.sql.redis.stream.RedisSourceTypes.StreamEntry import org.apache.spark.{Partition, SparkContext, TaskContext} class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig, offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true) extends RDD[StreamEntry](sc, Nil) { override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = { val partition = split.asInstanceOf[RedisSourceRddPartition] val offsetRange = partition.offsetRange val streamReader = new RedisStreamReader(redisConfig) streamReader.unreadStreamEntries(offsetRange) } override protected def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (e, i) => RedisSourceRddPartition(i, e) } .toArray } } case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange) extends Partition
Example 117
Source File: ManyValueBenchmarkSuite.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.df.benchmark import com.redislabs.provider.redis.env.RedisClusterEnv import com.redislabs.provider.redis.util.Person import org.apache.spark.rdd.RDD trait ManyValueBenchmarkSuite extends DataframeBenchmarkSuite with RedisClusterEnv { private def num = 1000000 override def suiteTags: String = s"${super.suiteTags}, Many:$num" override def rdd(): RDD[Person] = { val partitionsNum = 8 val sectionLength = num / partitionsNum spark.sparkContext .parallelize(0 until partitionsNum, partitionsNum) .mapPartitions { _ .flatMap { i => val start = i * sectionLength val end = start + sectionLength + 1 Stream.range(start, end) } .map { i => Person(s"John-$i", 30, "60 Wall Street", 150.5) } } } }
Example 118
Source File: Dijkstra.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.iiot.shortestpath

import org.apache.spark.graphx.GraphLoaderPlus
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// The snippet omitted the enclosing object and main method; a minimal wrapper is added here.
object Dijkstra {

  def main(args: Array[String]): Unit = {
    if (args.length < 2) sys.error("Usage: inputFileName sourceId [outputFileDirectory]")
    val inputFile = args(0)
    val sourceId: VertexId = args(1).toInt

    val sc = new SparkContext(new SparkConf().setAppName("Dijkstra Algorithm"))
    val graph = GraphLoaderPlus.edgeListFile(sc, inputFile)
    // `mapEdges` sometimes may be needed, such as
    // `g.mapEdges(e => (new scala.util.Random).nextInt(100))`
    val g = graph.mapVertices((id, _) =>
      if (id == sourceId) Array(0.0, id) else Array(Double.PositiveInfinity, id)
    )

    val sssp = g.pregel(Array(Double.PositiveInfinity, -1))(
      (id, dist, newDist) => {
        if (dist(0) < newDist(0)) dist else newDist
      },
      triplet => {
        if (triplet.srcAttr(0) + triplet.attr < triplet.dstAttr(0)) {
          Iterator((triplet.dstId, Array(triplet.srcAttr(0) + triplet.attr, triplet.srcId)))
        } else {
          Iterator.empty
        }
      },
      (a, b) => {
        if (a(0) < b(0)) a else b
      }
    )

    val format_sssp: RDD[String] = sssp.vertices.map(vertex =>
      "Vertex " + vertex._1 + ": distance is " + vertex._2(0) +
        ", previous node is Vertex " + vertex._2(1).toInt)
    format_sssp.collect.foreach(println(_))

    if (args.length > 2) {
      val outputFileDir = args(2)
      format_sssp.saveAsTextFile(outputFileDir)
    }
  }
}
Example 119
Source File: ReplicatedVertexView.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._

// The class header was missing from this snippet; it is restored below (following the
// GraphX sources) so that `edges`, `hasSrcId` and `hasDstId` used in updateVertices are
// defined. The view wraps the edge RDD plus two flags recording which vertex attributes
// have already been shipped to the edge partitions.
private[impl]
class ReplicatedVertexView[VD: ClassTag, ED: ClassTag](
    var edges: EdgeRDDImpl[ED, VD],
    var hasSrcId: Boolean = false,
    var hasDstId: Boolean = false) {

  def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = {
    val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId)
      .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format(
        hasSrcId, hasDstId))
      .partitionBy(edges.partitioner.get)

    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) {
      (ePartIter, shippedVertsIter) => ePartIter.map {
        case (pid, edgePartition) =>
          (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator)))
      }
    })
    new ReplicatedVertexView(newEdges, hasSrcId, hasDstId)
  }
}
Example 120
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 121
Source File: RoutingTablePartition.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.util.collection.{BitSet, PrimitiveVector}

import org.apache.spark.graphx._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage

private[graphx] object RoutingTablePartition {
  // Only the message type alias is kept here; the rest of the companion (message encoding
  // helpers and the per-edge-partition scan that builds the table) was elided in the snippet.
  type RoutingTableMessage = (VertexId, Int)
}

// The method shown in the snippet belongs to the RoutingTablePartition class, whose
// declaration was omitted; it is restored below (following the GraphX sources) so that
// `routingTable` is defined. Other members of the class are elided.
private[graphx]
class RoutingTablePartition(
    private val routingTable: Array[(Array[VertexId], BitSet, BitSet)]) extends Serializable {

  def foreachWithinEdgePartition
      (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean)
      (f: VertexId => Unit) {
    val (vidsCandidate, srcVids, dstVids) = routingTable(pid)
    val size = vidsCandidate.length
    if (includeSrc && includeDst) {
      // Avoid checks for performance
      vidsCandidate.iterator.foreach(f)
    } else if (!includeSrc && !includeDst) {
      // Do nothing
    } else {
      val relevantVids = if (includeSrc) srcVids else dstVids
      relevantVids.iterator.foreach { i => f(vidsCandidate(i)) }
    }
  }
}
Example 122
Source File: SparkBatchAdapter.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.eventuate.adapter.spark

import akka.actor.ActorSystem
import akka.serialization.SerializationExtension

import com.datastax.spark.connector._
import com.datastax.spark.connector.types._

import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.log.cassandra.CassandraEventLogSettings

import com.typesafe.config._

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// The adapter's class header was not included in the snippet. The reconstruction below
// assumes it wraps a SparkContext plus the event-log config, derives the Cassandra
// settings from that config and registers the custom converter defined further down;
// the real constructor and initialization may differ in detail.
class SparkBatchAdapter(val context: SparkContext, config: Config) {

  private val cassandraSettings = new CassandraEventLogSettings(config)

  TypeConverter.registerConverter(new DurableEventConverter(config))

  def eventBatch(logId: String, fromSequenceNr: Long = 1L): RDD[DurableEvent] = {
    context.cassandraTable(cassandraSettings.keyspace, s"${cassandraSettings.tablePrefix}_$logId")
      .select("event").where(s"sequence_nr >= $fromSequenceNr").as((event: DurableEvent) => event)
  }
}

private class DurableEventConverter(config: Config) extends TypeConverter[DurableEvent] {
  import scala.reflect.runtime.universe._

  val converter = implicitly[TypeConverter[Array[Byte]]]

  // --------------------------------------
  // FIXME: how to shutdown actor system?
  // --------------------------------------
  @transient lazy val system = ActorSystem("TypeConverter", config)
  @transient lazy val serial = SerializationExtension(system)

  def targetTypeTag = implicitly[TypeTag[DurableEvent]]
  def convertPF = {
    case obj => deserialize(converter.convert(obj))
  }

  def deserialize(bytes: Array[Byte]): DurableEvent =
    serial.deserialize(bytes, classOf[DurableEvent]).get
}
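A usage sketch under the same assumption about the adapter's constructor; the connection host, config source and log id are illustrative.

import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.adapter.spark.SparkBatchAdapter
import com.typesafe.config.ConfigFactory
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object BatchReplicationSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("eventuate-batch-sketch")
      .set("spark.cassandra.connection.host", "127.0.0.1") // illustrative
    val sc = new SparkContext(sparkConf)

    // The adapter reads a whole event log from the Cassandra backend as an RDD.
    val adapter = new SparkBatchAdapter(sc, ConfigFactory.load())
    val events: RDD[DurableEvent] = adapter.eventBatch("example-log", fromSequenceNr = 1L)
    events.take(10).foreach(println)

    sc.stop()
  }
}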
Example 123
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions): Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))
  }
}
// scalastyle:on println
Example 124
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Transform { def main(args: Array[String]) { if (args.length != 2) { throw new IllegalArgumentException("Invalid arguments") } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) run(lines, blackList) ssc.start() ssc.awaitTermination() } def run(stream: InputDStream[String], blackList: RDD[(String, String)]) { val userList = stream.map(x => (x, "action:Login")).transform(rdd => { val tmpUserList = rdd.leftOuterJoin(blackList) tmpUserList.filter(user => user._2._2.isEmpty) }) userList.print() } }
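The core of gihyo_6_3_Transform is the leftOuterJoin against a blacklist inside transform. Below is a minimal, self-contained sketch of that same join-and-filter step on plain RDDs, without the streaming layer; the object name and sample data are illustrative and not part of the book's code.

import org.apache.spark.{SparkConf, SparkContext}

object BlacklistFilterSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("BlacklistFilterSketch"))
    // users observed in the stream, keyed for the join
    val logins = sc.parallelize(Seq("user001", "user002", "user003")).map(u => (u, "action:Login"))
    // blacklisted users with the reason they were blocked
    val blackList = sc.parallelize(Seq(("user002", "rockLogin"), ("user003", "rockPayment")))
    // keep only the users that have no matching blacklist entry
    val allowed = logins.leftOuterJoin(blackList).filter { case (_, (_, blocked)) => blocked.isEmpty }
    allowed.collect().foreach(println) // (user001,(action:Login,None))
    sc.stop()
  }
}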
Example 125
Source File: gihyo_6_3_JoinSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_JoinSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines1 = mutable.Queue[RDD[String]]() val ds1 = ssc.queueStream(lines1) val lines2 = mutable.Queue[RDD[String]]() val ds2 = ssc.queueStream(lines2) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Join.run(ds1, ds2) ssc.start() lines1 += sc.makeRDD(Seq("key1", "key2", "key3")) // test data lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 126
Source File: gihyo_6_3_CountByValueAndWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper import java.nio.file.Files class gihyo_6_3_CountByValueAndWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) gihyo_6_3_countByValueAndWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 127
Source File: gihyo_6_3_MapSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_MapSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Map.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 128
Source File: gihyo_6_3_TwitterStreamSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import scala.io.Source import twitter4j.{Status, TwitterObjectFactory} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} class gihyo_6_3_TwitterStreamSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[Status]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_TwitterStream.run( sc, ds, Files.createTempDirectory("TwitterTag").toString, Files.createTempDirectory("TwitterWords").toString) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => // test data lines += sc.makeRDD(Seq( MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson())) clock.advance(1000) Thread.sleep(1000) } } } object MockTweetGenerator { // Creates a tweet status from a JSON file def createMockStatusFromJson(): Status = { val jsonFile = getClass.getResource("/streaming/test-tweet.json").getPath TwitterObjectFactory.createStatus(Source.fromFile(jsonFile).getLines().mkString) } }
Example 129
Source File: gihyo_6_3_FilterSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_FilterSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Filter.run(ds) ssc.start() lines += sc.makeRDD(Seq("lengthOver5", "les1", "les2")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 130
Source File: gihyo_6_3_FlatMapSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_FlatMapSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_flatMap.run(ds) ssc.start() // test data lines += sc.makeRDD(Seq("Apache Spark is a fast and general-purpose cluster computing system.")) clock.advance(1000) Thread.sleep(1000) } }
Example 131
Source File: gihyo_6_3_CountSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Count.run(ds, 2, 1) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 132
Source File: gihyo_6_3_UnionSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_UnionSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = (1 to 3).map(x => mutable.Queue[RDD[(String, String)]]()) val dss = lines.map(x => ssc.queueStream(x)) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Union.run(ssc, dss) ssc.start() lines.map(x => x += sc.makeRDD(Seq(("", "key1"), ("", "key2"), ("", "key3")))) //test data clock.advance(1000) Thread.sleep(1000) } }
Example 133
Source File: gihyo_6_3_ReduceByKeyAndWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeyAndWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKeyAndWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 134
Source File: gihyo_6_3_ReduceByKeySuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeySuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKey.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 135
Source File: gihyo_6_3_CountByWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountByWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) gihyo_6_3_countByWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 136
Source File: gihyo_6_3_UpdateStateByKeySuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_UpdateStateByKeySuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_updateStateByKey.run(ds) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 137
Source File: gihyo_6_3_RepartitionSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_RepartitionSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Repartition.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 138
Source File: gihyo_6_3_ReduceByKeyAndWindowEfficientSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeyAndWindowEfficientSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKeyAndWindow_efficient.run(ds, 2, 1) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 139
Source File: gihyo_6_3_KafkaStreamSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import java.nio.file.Files import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_KafkaStreamSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[(String, String)]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_KafkaStream.run(ds, Files.createTempDirectory("KafkaStreamSuite").toString, 2, 1) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq(("", "userid:userid001,action:view,pageid:value1"), ("", "userid:userid002,action:click,pageid:value2"), ("", "userid:userid003,action:view,pageid:value3"), ("", "userid:userid001,action:view,pageid:value4"))) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 140
Source File: gihyo_6_3_WiindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_WindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Window.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => { lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } } }
Example 141
Source File: gihyo_6_3_CogroupSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CogroupSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val lines2 = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val ds2 = ssc.queueStream(lines2) val clock = new StreamingContextWrapper(ssc).manualClock lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data gihyo_6_3_Cogroup.run(ds, ds2) ssc.start() clock.advance(1000) Thread.sleep(1000) } }
Example 142
Source File: gihyo_6_2_1_SampleSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_2_1_SampleSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock lines += sc.makeRDD(Seq("word1 word2", "word3 word1", "word4 word2")) // test data gihyo_6_2_1_Sample.run(ds) ssc.start() clock.advance(1000) Thread.sleep(1000) } }
Example 143
Source File: gihyo_6_3_TransformSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_TransformSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) gihyo_6_3_Transform.run(ds, blackList) ssc.start() lines += sc.makeRDD(Seq("user001", "user002", "user003")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 144
Source File: gihyo_6_3_CountByValueSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountByValueSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_countByValue.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 145
Source File: gihyo_6_3_ReduceSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Reduce.run(ds) ssc.start() lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 146
Source File: gihyo_6_3_ReduceByWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByWindow.run(ds, 2, 1) ssc.start() (1 to 2).foreach { case i => { lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data clock.advance(1000) Thread.sleep(1000) } } } }
Example 147
Source File: FileReader.scala From bdd-spark with MIT License | 5 votes |
import org.apache.spark.rdd.RDD trait FileReader { def readLinesToRdd(filename : String) : RDD[String] def readText(filename : String) : String } object FileReader { class RealFileReader extends FileReader{ override def readLinesToRdd(filename: String): RDD[String] = { Spark.spark.sparkContext.textFile(filename) } override def readText(filename: String): String = { //Whatever! "" } } def apply() : FileReader = new RealFileReader }
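Because FileReader is a trait, BDD tests can swap in a stub that never touches disk. A possible sketch, assuming the project's Spark helper object shown above is on the classpath; the stub class name and canned data are made up for illustration:

import org.apache.spark.rdd.RDD

// Returns canned lines instead of reading a real file.
class StubFileReader(lines: Seq[String]) extends FileReader {
  override def readLinesToRdd(filename: String): RDD[String] =
    Spark.spark.sparkContext.parallelize(lines)

  override def readText(filename: String): String = lines.mkString("\n")
}

// Usage: val reader: FileReader = new StubFileReader(Seq("a,b", "c,d"))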
Example 148
Source File: RecommendationModelReuse.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating import scala.Tuple2 import org.apache.spark.rdd.RDD object RecommendationModelReuse { def main(args: Array[String]): Unit = { val spark: SparkSession = SparkSession .builder() .appName("JavaLDAExample") .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/"). getOrCreate() val ratigsFile = "data/ratings.csv" val ratingDF = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile) val selectedRatingsDF = ratingDF.select(ratingDF.col("userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp")) // Randomly split ratings RDD into training data RDD (75%) and test data RDD (25%) val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L) val testData = splits(1) val testRDD = testData.rdd.map(row => { val userId = row.getString(0) val movieId = row.getString(1) val ratings = row.getString(2) Rating(userId.toInt, movieId.toInt, ratings.toDouble) }) // Load the saved model back val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/") // Making predictions. Get the top 10 movie predictions for user 458 println("Rating:(UserID, MovieID, Rating)") println("----------------------------------") val topRecsForUser = same_model.recommendProducts(458, 10) for (rating <- topRecsForUser) { println(rating.toString()) } println("----------------------------------") val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true) println("Test RMSE: = " + rmseTest) // Lower is better // Movie recommendation for a specific user. Get the top 10 movie predictions for user 458 println("Recommendations: (MovieId => Rating)") println("----------------------------------") val recommendationsUser = same_model.recommendProducts(458, 10) recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) println("----------------------------------") spark.stop() } }
Example 149
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.SQLImplicits import org.apache.spark.sql._ import org.apache.spark.sql.Dataset import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating import scala.Tuple2 import org.apache.spark.rdd.RDD object MovieRecommendation { //Compute the RMSE to evaluate the model. Less the RMSE better the model and it's prediction capability. def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = { val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }.join(data.map(x => ((x.user, x.product), x.rating))).values if (implicitPrefs) { println("(Prediction, Rating)") println(predictionsAndRatings.take(5).mkString("\n")) } math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean()) } def main(args: Array[String]): Unit = { val spark: SparkSession = SparkSession .builder() .appName("JavaLDAExample") .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/"). getOrCreate() val ratigsFile = "data/ratings.csv" val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile) val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp")) ratingsDF.show(false) val moviesFile = "data/movies.csv" val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile) val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres")) moviesDF.show(false) ratingsDF.createOrReplaceTempView("ratings") moviesDF.createOrReplaceTempView("movies") var rmseTest = computeRmse(model, testRDD, true) println("Test RMSE: = " + rmseTest) //Less is better //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668 println("Recommendations: (MovieId => Rating)") println("----------------------------------") val recommendationsUser = model.recommendProducts(668, 6) recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) println("----------------------------------") spark.stop() } }
Example 150
Source File: HbRddWriter.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.hbsupport import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.spark.rdd.RDD import top.spoofer.hbrdd.config.HbRddConfig import top.spoofer.hbrdd.unit.HbRddFormatsWriter import top.spoofer.hbrdd._ import HbRddWritPuter._ trait HbRddWriter { type TsValue[A] = (Long, A) // (ts, A) val LATEST_TIMESTAMP = Long.MaxValue final class SingleFamilyRDDWriter[A]( val rdd: RDD[(String, Map[String, A])], val put: HbRddPuter[A] ) extends HbRddWritCommon[A] with Serializable { def put2Hbase(tableName: String, family: String)(implicit config: HbRddConfig) = { val job = createJob(tableName, config.getHbaseConfig) rdd.flatMap({ case (rowId, data) => convert2Writable(rowId, Map(family -> data), put) }) .saveAsNewAPIHadoopDataset(job.getConfiguration) } }
Example 151
Source File: XmlReader.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import org.apache.spark.sql.types.StructType import com.databricks.spark.xml.util.XmlFile import com.databricks.spark.xml.util.FailFastMode @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0") def xmlFile(sqlContext: SQLContext, path: String): DataFrame = { // We need the `charset` and `rowTag` before creating the relation. val (charset, rowTag) = { val options = XmlOptions(parameters.toMap) (options.charset, options.rowTag) } val relation = XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0") def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = { val relation = XmlRelation( () => xmlRDD, None, parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } }
Example 152
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
Example 153
Source File: XmlRelation.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import java.io.IOException import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.sources.{PrunedScan, InsertableRelation, BaseRelation, TableScan} import org.apache.spark.sql.types._ import com.databricks.spark.xml.util.{InferSchema, XmlFile} import com.databricks.spark.xml.parsers.StaxXmlParser case class XmlRelation protected[spark] ( baseRDD: () => RDD[String], location: Option[String], parameters: Map[String, String], userSchema: StructType = null)(@transient val sqlContext: SQLContext) extends BaseRelation with InsertableRelation with PrunedScan { private val options = XmlOptions(parameters) override val schema: StructType = { Option(userSchema).getOrElse { InferSchema.infer( baseRDD(), options) } } override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val requiredFields = requiredColumns.map(schema(_)) val requestedSchema = StructType(requiredFields) StaxXmlParser.parse( baseRDD(), requestedSchema, options) } // The function below was borrowed from JSONRelation override def insert(data: DataFrame, overwrite: Boolean): Unit = { val filesystemPath = location match { case Some(p) => new Path(p) case None => throw new IOException(s"Cannot INSERT into table with no path defined") } val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) if (overwrite) { try { fs.delete(filesystemPath, true) } catch { case e: IOException => throw new IOException( s"Unable to clear output directory ${filesystemPath.toString} prior" + s" to INSERT OVERWRITE a XML table:\n${e.toString}") } // Write the data. We assume that schema isn't changed, and we won't update it. XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters) } else { throw new IllegalArgumentException("XML tables only support INSERT OVERWRITE for now.") } } }
Example 154
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted import org.scalactic.Equality import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.{ Dataset, SparkSession } object SparkSuite { lazy val spark: SparkSession = { val session = SparkSession.builder .master("local[*]") .appName("test") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.ui.enabled", false) .config("spark.sql.shuffle.partitions", 4) .getOrCreate() session } lazy val sc: SparkContext = spark.sparkContext lazy val jsc = new JavaSparkContext(sc) def javaSparkContext() = jsc } trait SparkSuite { implicit lazy val spark: SparkSession = SparkSuite.spark implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] { private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size) def areEqual(a: RDD[X], b: Any): Boolean = b match { case s: Seq[_] => toCounts(a.collect) == toCounts(s) case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect) } } implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] { def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b) } implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] { def areEqual(a: Dataset[X], b: Any): Boolean = b match { case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd) case x => rddEq.areEqual(a.rdd, x) } } }
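A short sketch of how the implicit Equality instances above would be used from a ScalaTest suite; the test class and data are illustrative, and the exact FunSuite flavour depends on the ScalaTest version the project targets:

import org.scalatest.FunSuite
import com.tresata.spark.sorted.SparkSuite

class RddEqualitySpec extends FunSuite with SparkSuite {
  test("an RDD compares equal to a Seq with the same multiset of elements") {
    val rdd = sc.parallelize(Seq(1, 2, 2, 3))
    // rddEq ignores ordering but respects element counts
    assert(rdd === Seq(3, 2, 1, 2))
  }
}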
Example 155
Source File: BinaryClassifierEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import org.apache.spark.rdd.RDD def evaluate(predictions: RDD[Boolean], actuals: RDD[Boolean]): BinaryClassificationMetrics = { predictions.zip(actuals).map { case (pred, actual) => val tp = if (pred && actual) 1d else 0d val fp = if (pred && !actual) 1d else 0d val tn = if (!pred && !actual) 1d else 0d val fn = if (!pred && actual) 1d else 0d BinaryClassificationMetrics(tp, fp, tn, fn) }.reduce(_ merge _) } }
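The evaluator above reduces per-record confusion-matrix counts with zip and reduce. Here is a standalone sketch of that same pattern using only Spark core; the Confusion case class is illustrative and is not KeystoneML's BinaryClassificationMetrics.

import org.apache.spark.{SparkConf, SparkContext}

object ConfusionSketch {
  case class Confusion(tp: Long, fp: Long, tn: Long, fn: Long) {
    def merge(other: Confusion): Confusion =
      Confusion(tp + other.tp, fp + other.fp, tn + other.tn, fn + other.fn)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ConfusionSketch"))
    val predictions = sc.parallelize(Seq(true, true, false, false), 2)
    val actuals     = sc.parallelize(Seq(true, false, false, true), 2)
    // zip assumes both RDDs are partitioned identically, as in the evaluator above
    val cm = predictions.zip(actuals).map { case (p, a) =>
      Confusion(if (p && a) 1 else 0, if (p && !a) 1 else 0, if (!p && !a) 1 else 0, if (!p && a) 1 else 0)
    }.reduce(_ merge _)
    println(s"precision = ${cm.tp.toDouble / (cm.tp + cm.fp)}, recall = ${cm.tp.toDouble / (cm.tp + cm.fn)}")
    sc.stop()
  }
}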
Example 156
Source File: AugmentedExamplesEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg._ import keystoneml.nodes.util.MaxClassifier import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object AggregationPolicyType extends Enumeration { type AggregationPolicyType = Value val average, borda = Value } class AugmentedExamplesEvaluator[T : ClassTag]( names: RDD[T], numClasses: Int, policy: AggregationPolicyType.Value = AggregationPolicyType.average) extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable { def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { preds.reduce(_ + _) :/ preds.size.toDouble } def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { val ranks = preds.map { vec => val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2) val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble)) rank } ranks.reduceLeft(_ + _) } def evaluate( predicted: RDD[DenseVector[Double]], actualLabels: RDD[Int]): MulticlassMetrics = { val aggFunc = policy match { case AggregationPolicyType.borda => bordaPolicy _ case _ => averagePolicy _ } // associate a name with each predicted, actual val namedPreds = names.zip(predicted.zip(actualLabels)) // group by name to get all the predicted values for a name val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) => val predActuals = iter.toArray // this is a array of tuples val predsForName = predActuals.map(_._1) assert(predActuals.map(_._2).distinct.size == 1) val actualForName: Int = predActuals.map(_._2).head (predsForName, actualForName) }.cache() // Averaging policy val finalPred = groupedPreds.map(x => (aggFunc(x._1), x._2) ) val finalPredictedLabels = MaxClassifier(finalPred.map(_._1)) val finalActualLabels = finalPred.map(_._2) val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels) groupedPreds.unpersist() ret } }
Example 157
Source File: MeanAveragePrecisionEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ private def getAP(precisions: Array[Double], recalls: Array[Double]) = { var ap = 0.0 val levels = (0 to 10).map(x => x / 10.0) levels.foreach { t => // Find where recalls are greater than t and precision values at those indices val px = recalls.toSeq.zipWithIndex.filter(x => x._1 >= t).map(x => precisions(x._2)) val p = if (px.isEmpty) { 0.0 } else { px.max } ap = ap + p / 11.0 } ap } }
Example 158
Source File: Stats.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import java.util.{Random => JRandom} import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import breeze.stats.distributions._ import keystoneml.nodes.util.TopKClassifier import org.apache.spark.rdd.RDD object Stats extends Serializable { def normalizeRows(mat: DenseMatrix[Double], alpha: Double = 1.0): DenseMatrix[Double] = { // FIXME: This currently must convert the matrices to double due to breeze implicits // TODO: Could optimize, use way fewer copies val rowMeans: DenseVector[Double] = mean(mat(*, ::)).map(x => if (x.isNaN) 0 else x) val variances: DenseVector[Double] = sum((mat(::, *) - rowMeans) :^= 2.0, Axis._1) :/= (mat.cols.toDouble - 1.0) val sds: DenseVector[Double] = sqrt(variances + alpha.toDouble).map(x => if (x.isNaN) math.sqrt(alpha) else x) val out = mat(::, *) - rowMeans out(::, *) /= sds out } }
Example 159
Source File: GatherTransformerOperator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.rdd.RDD private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator { override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { inputs.map(_.get.asInstanceOf[T]) } override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { inputs.map(_.get.asInstanceOf[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => { x.zip(y).map(z => z._1 ++ z._2) }) } }
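GatherTransformerOperator's batchTransform zips several equally-partitioned RDDs element by element into one RDD of Seqs. A minimal standalone sketch of that zip-and-concatenate step; the object name and data are illustrative:

import org.apache.spark.{SparkConf, SparkContext}

object GatherSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("GatherSketch"))
    val a = sc.parallelize(Seq(1, 2, 3), 2)
    val b = sc.parallelize(Seq(10, 20, 30), 2)
    val c = sc.parallelize(Seq(100, 200, 300), 2)
    // Wrap each element in a Seq, then zip pairwise and concatenate, mirroring batchTransform.
    val gathered = Seq(a, b, c)
      .map(_.map(x => Seq(x)))
      .reduceLeft((x, y) => x.zip(y).map { case (l, r) => l ++ r })
    gathered.collect().foreach(println) // List(1, 10, 100), List(2, 20, 200), List(3, 30, 300)
    sc.stop()
  }
}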
Example 160
Source File: PipelineDataset.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.rdd.RDD class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId) extends PipelineResult[RDD[T]]( executor, sink) object PipelineDataset { private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = { val emptyGraph = Graph(Set(), Map(), Map(), Map()) val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq()) val (graph, sinkId) = graphWithDataset.addSink(nodeId) new PipelineDataset[T](new GraphExecutor(graph), sinkId) } }
Example 161
Source File: KernelMatrix.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import scala.collection.mutable.HashMap import scala.reflect.ClassTag import breeze.linalg._ import org.apache.spark.rdd.RDD import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.{Transformer, LabelEstimator} class BlockKernelMatrix[T: ClassTag]( val kernelGen: KernelTransformer[T], val data: RDD[T], val cacheKernel: Boolean) extends KernelMatrix { val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]] val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]] def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = { if (colBlockCache.contains(colIdxs)) { colBlockCache(colIdxs) } else { val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs) if (cacheKernel) { colBlockCache += (colIdxs -> kBlock) diagBlockCache += (colIdxs -> diagBlock) } kBlock } } def unpersist(colIdxs: Seq[Int]): Unit = { if (colBlockCache.contains(colIdxs) && !cacheKernel) { colBlockCache(colIdxs).unpersist(true) } } def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = { if (!diagBlockCache.contains(idxs)) { val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs) if (cacheKernel) { colBlockCache += (idxs -> kBlock) diagBlockCache += (idxs -> diagBlock) } diagBlock } else { diagBlockCache(idxs) } } }
Example 162
Source File: LinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import edu.berkeley.cs.amplab.mlmatrix.{NormalEquations, RowPartitionedMatrix} import keystoneml.nodes.stats.{StandardScaler, StandardScalerModel} import keystoneml.nodes.util.Densify import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.{LabelEstimator, Transformer} object LinearMapEstimator extends Serializable { def apply(lambda: Option[Double] = None) = new LinearMapEstimator(lambda) def computeCost( trainingFeatures: RDD[DenseVector[Double]], trainingLabels: RDD[DenseVector[Double]], lambda: Double, x: DenseMatrix[Double], bOpt: Option[DenseVector[Double]]): Double = { val nTrain = trainingLabels.count val modelBroadcast = trainingLabels.context.broadcast(x) val bBroadcast = trainingLabels.context.broadcast(bOpt) val axb = trainingFeatures.mapPartitions(rows => { MatrixUtils.rowsToMatrixIter(rows).flatMap { rMat => val mat = rMat * modelBroadcast.value val out = bBroadcast.value.map { b => mat(*, ::) :+= b mat }.getOrElse(mat) MatrixUtils.matrixToRowArray(out).iterator } }) val cost = axb.zip(trainingLabels).map { part => val axb = part._1 val labels = part._2 val out = axb - labels math.pow(norm(out), 2) }.reduce(_ + _) if (lambda == 0) { cost/(2.0*nTrain.toDouble) } else { val wNorm = math.pow(norm(x.toDenseVector), 2) cost/(2.0*nTrain.toDouble) + lambda/2.0 * wNorm } } }
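A hedged usage sketch of LinearMapEstimator.computeCost on a toy dataset where the model fits the labels exactly, so the unregularized cost comes out as 0.0; the data and model are made up for illustration and assume KeystoneML is on the classpath.

import breeze.linalg.{DenseMatrix, DenseVector}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.nodes.learning.LinearMapEstimator

object ComputeCostSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ComputeCostSketch"))
    val features = sc.parallelize(Seq(DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)), 1)
    val labels   = sc.parallelize(Seq(DenseVector(1.0), DenseVector(0.0)), 1)
    // A 2x1 model that copies the first feature straight through to the label.
    val model = new DenseMatrix(2, 1, Array(1.0, 0.0))
    // Arguments: features, labels, lambda = 0.0, model x, no intercept (bOpt = None)
    val cost = LinearMapEstimator.computeCost(features, labels, 0.0, model, None)
    println(cost) // 0.0 for this perfectly fitted toy example
    sc.stop()
  }
}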
Example 163
Source File: LocalLeastSquaresEstimator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats._ import keystoneml.nodes.stats.StandardScalerModel import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.LabelEstimator def trainWithL2( trainingFeatures: RDD[DenseVector[Double]], trainingLabels: RDD[DenseVector[Double]], lambda: Double): LinearMapper[DenseVector[Double]] = { val A_parts = trainingFeatures.mapPartitions { x => MatrixUtils.rowsToMatrixIter(x) }.collect() val b_parts = trainingLabels.mapPartitions { x => MatrixUtils.rowsToMatrixIter(x) }.collect() val A_local = DenseMatrix.vertcat(A_parts:_*) val b_local = DenseMatrix.vertcat(b_parts:_*) val featuresMean = mean(A_local(::, *)).t val labelsMean = mean(b_local(::, *)).t val A_zm = A_local(*, ::) - featuresMean val b_zm = b_local(*, ::) - labelsMean val AAt = A_zm * A_zm.t val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm ) LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None))) } }
Example 164
Source File: LinearDiscriminantAnalysis.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats._ import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.LabelEstimator override def fit(data: RDD[DenseVector[Double]], labels: RDD[Int]): LinearMapper[DenseVector[Double]] = { val sample = labels.zip(data).collect() computeLDA(sample) } def computeLDA(dataAndLabels: Array[(Int, DenseVector[Double])]): LinearMapper[DenseVector[Double]] = { val featuresByClass = dataAndLabels.groupBy(_._1).values.map(x => MatrixUtils.rowsToMatrix(x.map(_._2))) val meanByClass = featuresByClass.map(f => mean(f(::, *))) // each mean is a row vector, not col val sW = featuresByClass.zip(meanByClass).map(f => { val featuresMinusMean = f._1(*, ::) - f._2.t // row vector, not column featuresMinusMean.t * featuresMinusMean }).reduce(_+_) val numByClass = featuresByClass.map(_.rows : Double) val features = MatrixUtils.rowsToMatrix(dataAndLabels.map(_._2)) val totalMean = mean(features(::, *)) // A row-vector, not a column-vector val sB = meanByClass.zip(numByClass).map { case (classMean, classNum) => { val m = classMean - totalMean (m.t * m) :* classNum } }.reduce(_+_) val eigen = eig((inv(sW): DenseMatrix[Double]) * sB) val eigenvectors = (0 until eigen.eigenvectors.cols).map(eigen.eigenvectors(::, _).toDenseMatrix.t) val topEigenvectors = eigenvectors.zip(eigen.eigenvalues.toArray).sortBy(x => -math.abs(x._2)).map(_._1).take(numDimensions) val W = DenseMatrix.horzcat(topEigenvectors:_*) new LinearMapper(W) } }
Example 165
Source File: LeastSquaresEstimator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import keystoneml.nodes.util.{Densify, Sparsify} import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.workflow._ import scala.reflect._ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag]( lambda: Double = 0, numMachines: Option[Int] = None, cpuWeight: Double = 3.8e-4, memWeight: Double = 2.9e-1, networkWeight: Double = 1.32) extends OptimizableLabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode with Logging { val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq( { val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20) (solver, solver) }, { val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20) (solver, TransformerLabelEstimatorChain(Sparsify(), solver)) }, { val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda) (solver, TransformerLabelEstimatorChain(Densify(), solver)) }, { val solver = new LinearMapEstimator(Some(lambda)) (solver, TransformerLabelEstimatorChain(Densify(), solver)) } ) override val default: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode = { new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20) } override def optimize( sample: RDD[T], sampleLabels: RDD[DenseVector[Double]], numPerPartition: Map[Int, Int]) : LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = { val n = numPerPartition.values.map(_.toLong).sum val d = sample.first().length val k = sampleLabels.first().length val sparsity = sample.map(x => x.activeSize.toDouble / x.length).sum() / sample.count() val realNumMachines = numMachines.getOrElse { if (sample.sparkContext.getExecutorStorageStatus.length == 1) { 1 } else { sample.sparkContext.getExecutorStorageStatus.length - 1 } } logDebug(s"Optimizable Param n is $n") logDebug(s"Optimizable Param d is $d") logDebug(s"Optimizable Param k is $k") logDebug(s"Optimizable Param sparsity is $sparsity") logDebug(s"Optimizable Param numMachines is $realNumMachines") options.minBy(_._1.cost(n, d, k, sparsity, realNumMachines, cpuWeight, memWeight, networkWeight))._2 } override val weight: Int = default.weight }
Example 166
Source File: SparseLinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.rdd.RDD import keystoneml.workflow.Transformer override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = { val modelBroadcast = in.context.broadcast(x) val bBroadcast = in.context.broadcast(bOpt) in.map(row => { val out = modelBroadcast.value.t * row bBroadcast.value.foreach { b => out :+= b } out }) } }
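The apply method above broadcasts a dense model and multiplies it against each sparse input row. A standalone sketch of that broadcast-and-multiply pattern with Breeze types; the model and data are illustrative and this is not SparseLinearMapper itself:

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector}
import org.apache.spark.{SparkConf, SparkContext}

object SparseApplySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("SparseApplySketch"))
    val model = DenseMatrix((1.0, 0.0), (0.0, 2.0)) // 2x2 weight matrix
    val data = sc.parallelize(Seq(SparseVector(2)(0 -> 3.0), SparseVector(2)(1 -> 4.0)))
    val modelBC = sc.broadcast(model)
    // Multiply the broadcast model against every sparse row, yielding dense outputs.
    val out = data.map(row => modelBC.value.t * row)
    out.collect().foreach(println) // DenseVector(3.0, 0.0), DenseVector(0.0, 8.0)
    sc.stop()
  }
}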
Example 167
Source File: ApproximatePCA.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import breeze.stats.distributions.{Gaussian, ThreadLocalRandomGenerator, RandBasis} import com.github.fommil.netlib.LAPACK._ import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils import org.apache.commons.math3.random.MersenneTwister import org.apache.spark.rdd.RDD import org.netlib.util.intW import keystoneml.pipelines.Logging import keystoneml.workflow.Estimator def approximateQ(A: DenseMatrix[Double], l: Int, q: Int, seed: Int = 0): DenseMatrix[Double] = { val d = A.cols val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed))) val omega = DenseMatrix.rand(d, l, Gaussian(0,1)(randBasis)) //cpu: d*l, mem: d*l val y0 = A*omega //cpu: n*d*l, mem: n*l var Q = QRUtils.qrQR(y0)._1 //cpu: n*l**2 for (i <- 1 to q) { val YHat = Q.t * A //cpu: l*n*d, mem: l*d val Qh = QRUtils.qrQR(YHat.t)._1 //cpu: d*l^2, mem: d*l val Yj = A * Qh //cpu: n*d*l, mem: n*l Q = QRUtils.qrQR(Yj)._1 //cpu: n*l^2, mem: n*l } Q } }
Example 168
Source File: DistributedPCA.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import com.github.fommil.netlib.LAPACK.{getInstance => lapack} import org.apache.spark.rdd.RDD import org.netlib.util.intW import keystoneml.pipelines._ import keystoneml.utils.MatrixUtils import keystoneml.workflow.{Transformer, Estimator} import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR} def fit(samples: RDD[DenseVector[Float]]): PCATransformer = { new PCATransformer(computePCA(samples, dims)) } def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = { val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part => val dblIter = part.map(x => convert(x, Double)) MatrixUtils.rowsToMatrixIter(dblIter).map(RowPartition(_)) }) val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble val meansBC = dataMat.context.broadcast(means) val zeroMeanMat = new RowPartitionedMatrix(mat.rdd.map { part => RowPartition(part.mat(*, ::) - meansBC.value) }) val rPart = new TSQR().qrR(zeroMeanMat) val svd.SVD(u, s, pcaT) = svd(rPart) val pca = convert(pcaT.t, Float) val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca) // Return a subset of the columns. matlabConventionPCA(::, 0 until dims) } override def cost( n: Long, d: Int, k: Int, sparsity: Double, numMachines: Int, cpuWeight: Double, memWeight: Double, networkWeight: Double): Double = { val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0) val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines val bytesScanned = n.toDouble * d val network = d.toDouble * d * log2NumMachines math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network } }
Example 169
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Wrappers import SparkER.DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import scala.collection.mutable.MutableList def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = { val attributes: MutableList[KeyValue] = new MutableList() for (i <- 0 to row.size - 1) { try { val value = row(i) val attributeKey = columnNames(i) if (value != null) { value match { case listOfAttributes: Iterable[Any] => listOfAttributes map { attributeValue => attributes += KeyValue(attributeKey, attributeValue.toString) } case stringAttribute: String => if (explodeInnerFields) { stringAttribute.split(innerSeparator) map { attributeValue => attributes += KeyValue(attributeKey, attributeValue) } } else { attributes += KeyValue(attributeKey, stringAttribute) } case singleAttribute => attributes += KeyValue(attributeKey, singleAttribute.toString) } } } catch { case e: Throwable => println(e) } } attributes } }
Example 170
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Wrappers import java.io.{IOException, _} import SparkER.DataStructures.Profile import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def loadSerializedObject(fileName: String): Any = { var `object`: Any = null try { val file: InputStream = new FileInputStream(fileName) val buffer: InputStream = new BufferedInputStream(file) val input: ObjectInput = new ObjectInputStream(buffer) try { `object` = input.readObject } finally { input.close } } catch { case cnfEx: ClassNotFoundException => { System.err.println(fileName) cnfEx.printStackTrace } case ioex: IOException => { System.err.println(fileName) ioex.printStackTrace } } return `object` } }
Example 171
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Utilities import SparkER.BlockBuildingMethods.TokenBlocking import org.apache.spark.rdd.RDD import SparkER.DataStructures._ import org.apache.spark.partial.PartialResult def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = { val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks => val profileID = profileWithBlocks.profileID profileWithBlocks.blocks map { BlockWithSize => (BlockWithSize.blockID, profileID) } } val blocks = blockIDProfileID.groupByKey().map { block => val blockID = block._1 val profilesID = block._2.toSet if (separatorIDs.isEmpty) { BlockDirty(blockID, Array(profilesID)) } else { BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs)) } } blocks.filter(_.getComparisonSize() > 0).map(x => x) } }
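Stripped of the project's data structures, profilesBlockToBlocks is an inversion of (block, profile) assignments via flatMap and groupByKey, keeping only blocks that allow at least one comparison. A bare-bones sketch of that step with plain tuples; the IDs are made up:

import org.apache.spark.{SparkConf, SparkContext}

object BlockInversionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("BlockInversionSketch"))
    // (blockID, profileID) pairs, as produced by the flatMap above
    val assignments = sc.parallelize(Seq((1L, 10L), (1L, 11L), (2L, 10L), (2L, 12L), (2L, 13L)))
    // group the profile IDs of each block and drop blocks with a single profile
    val blocks = assignments.groupByKey()
      .map { case (blockId, profileIds) => (blockId, profileIds.toSet) }
      .filter { case (_, profileIds) => profileIds.size > 1 }
    blocks.collect().foreach(println) // (1,Set(10, 11)), (2,Set(10, 12, 13))
    sc.stop()
  }
}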
Example 172
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.BlockRefinementMethods import SparkER.DataStructures.{BlockWithComparisonSize, ProfileBlocks} import SparkER.Utilities.BoundedPriorityQueue import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = { profilesWithBlocks map { profileWithBlocks => val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons) val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt val threshold = blocksSortedByComparisons(blocksToKeep-1).comparisons ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet) } } }
Example 173
Source File: SerializedObjectLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object SerializedObjectLoader extends WrapperTrait { def loadProfiles(filePath: String, startIDFrom: Long = 0, realFieldID: String = "", sourceId: Int = 0): RDD[Profile] = { @transient lazy val log = org.apache.log4j.LogManager.getRootLogger log.info("SPARKER - Start to loading entities") val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath) log.info("SPARKER - Loading ended") log.info("SPARKER - Start to generate profiles") val profiles: Array[Profile] = new Array(entities.size()) for (i <- 0 until entities.size()) { val profile = Profile(id = i + startIDFrom, originalID = i + "", sourceId = sourceId) val entity = entities.get(i) val it = entity.getAttributes.iterator() while (it.hasNext) { val attribute = it.next() profile.addAttribute(KeyValue(attribute.getName, attribute.getValue)) } profiles.update(i, profile) } log.info("SPARKER - Ended to loading profiles") log.info("SPARKER - Start to parallelize profiles") val sc = SparkContext.getOrCreate() sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray) } def loadGroundtruth(filePath: String): RDD[MatchingEntities] = { val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath) val matchingEntitites: Array[MatchingEntities] = new Array(groundtruth.size()) var i = 0 val it = groundtruth.iterator while (it.hasNext) { val matching = it.next() matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString)) i += 1 } val sc = SparkContext.getOrCreate() sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray) } }
Example 174
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import scala.collection.mutable.MutableList def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = { val attributes: MutableList[KeyValue] = new MutableList() for (i <- 0 to row.size - 1) { try { val value = row(i) val attributeKey = columnNames(i) if (value != null) { value match { case listOfAttributes: Iterable[Any] => listOfAttributes map { attributeValue => attributes += KeyValue(attributeKey, attributeValue.toString) } case stringAttribute: String => if (explodeInnerFields) { stringAttribute.split(innerSeparator) map { attributeValue => attributes += KeyValue(attributeKey, attributeValue) } } else { attributes += KeyValue(attributeKey, stringAttribute) } case singleAttribute => attributes += KeyValue(attributeKey, singleAttribute.toString) } } } catch { case e: Throwable => println(e) } } attributes } }
Example 175
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object SerializedProfilesLoader {

  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(fileName)
        cnfEx.printStackTrace()
      case ioex: IOException =>
        System.err.println(fileName)
        ioex.printStackTrace()
    }
    `object`
  }
}
Example 176
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {
    // Emit a (blockID, profileID) pair for every block each profile appears in
    val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks =>
      val profileID = profileWithBlocks.profileID
      profileWithBlocks.blocks map { blockWithSize =>
        (blockWithSize.blockID, profileID)
      }
    }

    // Group the profiles by block and rebuild dirty/clean blocks
    val blocks = blockIDProfileID.groupByKey().map { block =>
      val blockID = block._1
      val profilesID = block._2.toSet

      if (separatorIDs.isEmpty) {
        BlockDirty(blockID, Array(profilesID))
      } else {
        BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
      }
    }

    // Keep only blocks that generate at least one comparison
    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 177
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      // Sort each profile's blocks by their number of comparisons (ascending)
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      // Retain the r-fraction of blocks with the fewest comparisons
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
Example 178
Source File: SerializedObjectLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object SerializedObjectLoader extends WrapperTrait{ def loadProfiles(filePath : String, startIDFrom : Long = 0, realFieldID : String = "") : RDD[Profile] = { @transient lazy val log = org.apache.log4j.LogManager.getRootLogger log.info("SPARKER - Start to loading entities") val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath) log.info("SPARKER - Loading ended") log.info("SPARKER - Start to generate profiles") val profiles : Array[Profile] = new Array(entities.size()) for(i <- 0 to entities.size()-1){ val profile = Profile(id = i+startIDFrom, originalID = i+"") val entity = entities.get(i) val it = entity.getAttributes.iterator() while(it.hasNext){ val attribute = it.next() profile.addAttribute(KeyValue(attribute.getName, attribute.getValue)) } profiles.update(i, profile) } log.info("SPARKER - Ended to loading profiles") log.info("SPARKER - Start to parallelize profiles") val sc = SparkContext.getOrCreate() sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray) } def loadGroundtruth(filePath : String) : RDD[MatchingEntities] = { val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath) val matchingEntitites : Array[MatchingEntities] = new Array(groundtruth.size()) var i = 0 val it = groundtruth.iterator while(it.hasNext){ val matching = it.next() matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString)) i+=1 } val sc = SparkContext.getOrCreate() sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray) } }
Example 179
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

trait WrapperTrait {

  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 until row.size) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)
        if (value != null) {
          value match {
            case listOfAttributes: Iterable[Any] =>
              listOfAttributes map { attributeValue =>
                attributes += KeyValue(attributeKey, attributeValue.toString)
              }
            case stringAttribute: String =>
              if (explodeInnerFields) {
                stringAttribute.split(innerSeparator) map { attributeValue =>
                  attributes += KeyValue(attributeKey, attributeValue)
                }
              } else {
                attributes += KeyValue(attributeKey, stringAttribute)
              }
            case singleAttribute =>
              attributes += KeyValue(attributeKey, singleAttribute.toString)
          }
        }
      } catch {
        case e: Throwable => println(e)
      }
    }
    attributes
  }
}
Example 180
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object SerializedProfilesLoader {

  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(fileName)
        cnfEx.printStackTrace()
      case ioex: IOException =>
        System.err.println(fileName)
        ioex.printStackTrace()
    }
    `object`
  }
}
Example 181
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorID: Long = -1): RDD[BlockAbstract] = {
    // Emit a (blockID, profileID) pair for every block each profile appears in
    val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks =>
      val profileID = profileWithBlocks.profileID
      profileWithBlocks.blocks map { blockWithSize =>
        (blockWithSize.blockID, profileID)
      }
    }

    // Group the profiles by block and rebuild dirty/clean blocks
    val blocks = blockIDProfileID.groupByKey().map { block =>
      val blockID = block._1
      val profilesID = block._2.toSet

      if (separatorID < 0) {
        BlockDirty(blockID, (profilesID, Set.empty))
      } else {
        BlockClean(blockID, profilesID.partition(_ <= separatorID))
      }
    }

    // Keep only blocks that generate at least one comparison
    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 182
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      // Sort each profile's blocks by their number of comparisons (ascending)
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      // Retain the r-fraction of blocks with the fewest comparisons
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
Example 183
Source File: CNNModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package CNN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

class CNNModel extends Serializable { // constructor parameters are collapsed in this excerpt

  def Loss(predict: RDD[PredictCNNLabel]): Double = {
    // error and loss: accumulate the squared error and the sample count in a single pass
    val predict1 = predict.map(f => f.error)
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (error sum, count), v: error matrix of one sample
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (error sum, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = (loss2 / counte.toDouble) * 0.5
    Loss
  }
}
Example 184
Source File: NeuralNetModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package NN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

class NeuralNetModel extends Serializable { // constructor parameters are collapsed in this excerpt

  def Loss(predict: RDD[PredictNNLabel]): Double = {
    // error and loss: accumulate the squared error and the sample count in a single pass
    val predict1 = predict.map(f => f.error)
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (error sum, count), v: error matrix of one sample
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (error sum, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = loss2 / counte.toDouble
    Loss * 0.5
  }
}
Example 185
Source File: DBNModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package DBN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

class DBNModel(
    val config: DBNConfig,
    val dbn_W: Array[BDM[Double]],
    val dbn_b: Array[BDM[Double]],
    val dbn_c: Array[BDM[Double]]) extends Serializable {

  def dbnunfoldtonn(outputsize: Int): (Array[Int], Int, Array[BDM[Double]]) = {
    // 1. extend the layer sizes with the requested output layer
    val size = if (outputsize > 0) {
      val size1 = config.size
      val size2 = ArrayBuffer[Int]()
      size2 ++= size1
      size2 += outputsize
      size2.toArray
    } else config.size
    val layer = if (outputsize > 0) config.layer + 1 else config.layer

    // 2. merge the bias terms dbn_c with the weights dbn_W to form the initial NN weights
    var initW = ArrayBuffer[BDM[Double]]()
    for (i <- 0 to dbn_W.length - 1) {
      initW += BDM.horzcat(dbn_c(i), dbn_W(i))
    }
    (size, layer, initW.toArray)
  }
}
Example 186
Source File: StringKeyRDD.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import java.nio.charset.StandardCharsets.UTF_8 import com.kakao.mango.concurrent._ import com.kakao.mango.couchbase.Couchbase import com.kakao.mango.hbase.HBase import com.kakao.mango.json._ import com.kakao.mango.util.Retry import org.apache.spark.rdd.RDD import scala.concurrent.duration._ class StringKeyRDD[T](rdd: RDD[(String, T)]) extends SaveToES(rdd) { def saveToCouchbase(nodes: Seq[String], bucket: String, expiry: Int = 0, maxRate: Double = 1e7, password: String = null): Unit = { // rate per executor val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size rdd.foreachPartition { partition => // BackPressureException may happen, so retry 10 times // if that fails, Spark task scheduler may retry again. val cluster = Couchbase(nodes: _*) val client = cluster.bucket(bucket, password) val converted = partition.map { case (key, value: Array[Byte]) => (key, new String(value, UTF_8)) case (key, value: String) => (key, value) case (key, value) => (key, toJson(value)) } for (group <- converted.grouped(1000)) { Retry(10, 100.millis) { client.putAll(group, rate, expiry).sync() } } cluster.disconnect() } } def saveToHBase(quorum: String, table: String, family: String, qualifier: String, maxRate: Double = 1e7): Unit = { // rate per executor val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size rdd.foreachPartition { partition => val hbase = HBase(quorum) val column = hbase.column(table, family, qualifier) val converted = partition.map { case (key, value: Array[Byte]) => (key.getBytes(UTF_8), value) case (key, value: String) => (key.getBytes(UTF_8), value.getBytes(UTF_8)) case (key, value) => (key.getBytes(UTF_8), serialize(value)) } for (group <- converted.grouped(1000)) { Retry(10, 100.millis) { column.putAllBytes(group, rate).sync() } } } } }
Example 187
Source File: HBaseReaders.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import com.kakao.mango.util.Conversions._ import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import scala.collection.JavaConversions._ trait HBaseReaders { val sc: SparkContext def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = { hbaseTableBinary(quorum, table).map { case (rowkey, ((family, qualifier), (timestamp, value))) => (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string))) } } def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = { hbaseTableBinary(quorum, table).collect { case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell) } } def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = { hbaseTable(quorum, table).collect { case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell) } } }
Example 188
Source File: JoinableRDD.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions) def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = { val partitioner = new HashPartitioner(numPartitions) val grouped = rdd cogroup other val left = grouped.flatMap{ case (k, (vs, ws)) => vs.zipWithIndex.map { case (v, idx) => ((k, idx), v) } }.partitionBy(partitioner) val right = grouped.flatMap { case (k, (vs, ws)) => ws.map { w => ((k, w.hashCode()), (w, vs.size)) } }.partitionBy(partitioner).flatMap { case ((k, r), (w, size)) => (0 until size).map(i => ((k, w), i)) }.map { case ((k, w), idx) => ((k, idx), w) } (left join right).map { case ((k, idx), (v, w)) => (k, (v, w)) } } }
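A minimal usage sketch for the fastJoin helper above; the object name JoinableRDDExample and the sample pair RDDs are illustrative, and in the real project an implicit conversion may wrap the pair RDD instead of the explicit constructor call shown here.
import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.JoinableRDD

object JoinableRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("fastJoin-sketch"))

    val clicks    = sc.parallelize(Seq("u1" -> "home", "u1" -> "search", "u2" -> "home"))
    val purchases = sc.parallelize(Seq("u1" -> 42.0, "u2" -> 7.5))

    // Joins by key while spreading each key's values across partitions to reduce skew
    val joined = new JoinableRDD(clicks).fastJoin(purchases, numPartitions = 4)
    joined.collect().foreach(println) // e.g. (u1,(home,42.0)), (u1,(search,42.0)), (u2,(home,7.5))

    sc.stop()
  }
}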
Example 189
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}

import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

// The enclosing abstract class is collapsed in this excerpt; its signature below is
// inferred from the concrete subclasses that follow.
abstract class SavingStream[T](@transient stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends Serializable {
  import SavingStream._

  // Each subclass defines how an RDD of its element type becomes a DataFrame
  def toDF(rdd: RDD[T]): DataFrame

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }
}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
}
Example 190
Source File: MemsqlRDD.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, PreparedStatement, ResultSet} import com.memsql.spark.SQLGen.VariableList import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} case class MemsqlRDD(query: String, variables: VariableList, options: MemsqlOptions, schema: StructType, expectedOutput: Seq[Attribute], @transient val sc: SparkContext) extends RDD[Row](sc, Nil) { override protected def getPartitions: Array[Partition] = MemsqlQueryHelpers.GetPartitions(options, query, variables) override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = { var closed = false var rs: ResultSet = null var stmt: PreparedStatement = null var conn: Connection = null var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition] def tryClose(name: String, what: AutoCloseable): Unit = { try { if (what != null) { what.close() } } catch { case e: Exception => logWarning(s"Exception closing $name", e) } } def close(): Unit = { if (closed) { return } tryClose("resultset", rs) tryClose("statement", stmt) tryClose("connection", conn) closed = true } context.addTaskCompletionListener { context => close() } conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)() stmt = conn.prepareStatement(partition.query) JdbcHelpers.fillStatement(stmt, partition.variables) rs = stmt.executeQuery() var rowsIter = JdbcUtils.resultSetToRows(rs, schema) if (expectedOutput.nonEmpty) { val schemaDatatypes = schema.map(_.dataType) val expectedDatatypes = expectedOutput.map(_.dataType) if (schemaDatatypes != expectedDatatypes) { val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map { case ((_: StringType, _: NullType), _) => ((_: Row) => null) case ((_: ShortType, _: BooleanType), i) => ((r: Row) => r.getShort(i) != 0) case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0) case ((_: LongType, _: BooleanType), i) => ((r: Row) => r.getLong(i) != 0) case ((l, r), i) => { options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}") ((r: Row) => r.get(i)) } } rowsIter = rowsIter .map(row => Row.fromSeq(columnEncoders.map(_(row)))) } } CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close) } }
Example 191
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 192
Source File: LRUtils.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr.Utils import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object LRUtils { def bytes2Int (buffer: Array[Byte], pos: Int): (Int, Int) = { var result: Int = 0 var position: Int = pos var byte = buffer(pos) var shiftNum = 0 while ((byte & 0x80) != 0) { result = result | ((byte & 0x7F)<<shiftNum) position += 1 byte = buffer(position) shiftNum += 7 } result = result | ((byte & 0x7F)<<shiftNum) (result, position) } //featureId cached in X is localId def loadFileAsMatrix( sc: SparkContext, path: String, minPartitions: Int): RDD[(Array[Double], Matrix)] = { val lines = sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#"))) val data = lines.mapPartitions { samples => val labels = new PrimitiveVector[Double]() val builder = new MatrixBuilder() samples.foreach { line => val items = line.split(' ') labels += items.head.toDouble val featureIdAndValues = items.tail.filter(_.nonEmpty) val indices = new PrimitiveVector[Int]() val values = new PrimitiveVector[Float]() featureIdAndValues.foreach { item => val featureAndValue = item.split(":") indices += featureAndValue(0).toInt val value = featureAndValue(1).toFloat values += value } builder.add(new SparseVector(indices.trim.array, values.trim.array)) } Iterator((labels.trim.array, builder.toMatrix)) } data } }
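A small sketch of the variable-length integer decoding done by bytes2Int above; the object name VarIntDecodeExample and the hand-built buffer are illustrative.
import org.apache.spark.mllib.sparselr.Utils.LRUtils

object VarIntDecodeExample {
  def main(args: Array[String]): Unit = {
    // 300 is stored as two 7-bit groups, least-significant first, with the high bit marking "more bytes":
    // 300 = 0b10_0101100  ->  0xAC (0b1010_1100), then 0x02
    val buffer = Array[Byte](0xAC.toByte, 0x02)
    val (value, lastPos) = LRUtils.bytes2Int(buffer, 0)
    println(s"decoded value = $value, last byte index = $lastPos") // decoded value = 300, last byte index = 1
  }
}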
Example 193
Source File: LogisticRegression.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap import org.apache.spark.mllib.sparselr.Utils._ import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object LogisticRegression { def train(input: RDD[(Array[Double], Matrix)], optimizer: Optimizer ): (Array[Int], Array[Double]) = { val hdfsIndex2global = new Int2IntOpenHashMap() var index = 0 input.map { point => point._2 match { case x: CompressedSparseMatrix => println("x.length" + x.mappings.length) case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.count val global2hdfsIndex = input.map { point => point._2 match { case x: CompressedSparseMatrix => x.mappings case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.collect().flatMap(t => t).distinct global2hdfsIndex.foreach{value => hdfsIndex2global.put(value, index) index += 1 } val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global) val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache() val numTraining = examples.count() println(s"Training: $numTraining.") SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true) val examplesTest = examples.mapPartitions(_.flatMap { case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)}) val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size)) val newWeights = optimizer.optimize(examplesTest, weights) ((global2hdfsIndex, newWeights.toArray)) } //globalId to localId for mappings in Matrix def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap]) (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = { val hdfsIndex2global = bchdfsIndex2global.value partition._2 match { case x: CompressedSparseMatrix => val local2hdfsIndex = x.mappings for (i <- 0 until local2hdfsIndex.length) { local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i)) } case _ => throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.") } partition } }
Example 194
Source File: OneWayANOVA.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package main.ANOVA

import org.apache.commons.math3.distribution.FDistribution
import org.apache.spark.rdd.RDD

object OneWayANOVA {

  def anovaPValue(categoryData: Iterable[RDD[Double]]): Double = {
    val anovaStats = getAnovaStats(categoryData)
    // pass a null rng to avoid unneeded overhead, as we never sample from this distribution
    val fdist: FDistribution = new FDistribution(null, anovaStats.dfbg, anovaStats.dfwg)
    1.0 - fdist.cumulativeProbability(anovaStats.F)
  }

  private case class ANOVAStats(dfbg: Double, dfwg: Double, F: Double)

  private def getAnovaStats(categoryData: Iterable[RDD[Double]]): ANOVAStats = {
    var dfwg: Long = 0
    var sswg: Double = 0
    var totsum: Double = 0
    var totsumsq: Double = 0
    var totnum: Long = 0
    for (data <- categoryData) {
      val sum: Double = data.sum()
      val sumsq: Double = data.map(i => i * i).sum()
      val num = data.count()
      totnum += num
      totsum += sum
      totsumsq += sumsq
      dfwg += num - 1
      val ss: Double = sumsq - ((sum * sum) / num)
      sswg += ss
    }
    val sst: Double = totsumsq - ((totsum * totsum) / totnum)
    val ssbg: Double = sst - sswg
    val dfbg: Int = categoryData.size - 1
    val msbg: Double = ssbg / dfbg
    val mswg: Double = sswg / dfwg
    val F: Double = msbg / mswg
    ANOVAStats(dfbg, dfwg, F)
  }
}
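A minimal usage sketch for the anovaPValue helper above; the object name AnovaExample, the local[*] master, and the sample values are illustrative and not part of the original project.
import org.apache.spark.{SparkConf, SparkContext}
import main.ANOVA.OneWayANOVA

object AnovaExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("anova-sketch"))

    // Three groups with clearly different means should give a small p-value.
    val groupA = sc.parallelize(Seq(1.0, 1.2, 0.9, 1.1))
    val groupB = sc.parallelize(Seq(2.0, 2.1, 1.9, 2.2))
    val groupC = sc.parallelize(Seq(3.1, 2.9, 3.0, 3.2))

    val p = OneWayANOVA.anovaPValue(Seq(groupA, groupB, groupC))
    println(s"one-way ANOVA p-value: $p")

    sc.stop()
  }
}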
Example 195
Source File: TwoSampleIndependentTTest.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.rdd.RDD

class TwoSampleIndependentTTest {

  def tTest(sample1: RDD[Double], sample2: RDD[Double]): Double = {
    val n1 = sample1.count()
    val n2 = sample2.count()
    val m1 = sample1.sum() / n1
    val m2 = sample2.sum() / n2
    val v1 = sample1.map(d => (d - m1) * (d - m1)).sum() / (n1 - 1)
    val v2 = sample2.map(d => (d - m2) * (d - m2)).sum() / (n2 - 1)
    val t: Double = math.abs((m1 - m2) / FastMath.sqrt((v1 / n1) + (v2 / n2)))
    // Welch-Satterthwaite approximation of the degrees of freedom
    val degreesOfFreedom: Double = (((v1 / n1) + (v2 / n2)) * ((v1 / n1) + (v2 / n2))) /
      ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / (n2 * n2 * (n2 - 1d)))
    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val distribution: TDistribution = new TDistribution(null, degreesOfFreedom)
    2.0 * distribution.cumulativeProbability(-t)
  }
}
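A minimal usage sketch for the tTest helper above, assuming it is on the classpath as shown; the object name TTestExample and the sample values are illustrative.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.TwoSampleIndependentTTest

object TTestExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ttest-sketch"))

    val control   = sc.parallelize(Seq(4.9, 5.1, 5.0, 4.8, 5.2))
    val treatment = sc.parallelize(Seq(5.4, 5.6, 5.3, 5.5, 5.7))

    // Two-sided p-value; a small value suggests the group means differ
    val p = new TwoSampleIndependentTTest().tTest(control, treatment)
    println(s"two-sided p-value: $p")

    sc.stop()
  }
}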
Example 196
Source File: EtlProcessor.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.processors import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata import kafka.serializer.DefaultDecoder import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.kafka._ import org.slf4j.LoggerFactory import yamrcraft.etlite.Settings import yamrcraft.etlite.state.{KafkaOffsetsState, KafkaStateManager} import yamrcraft.etlite.transformers.InboundMessage object EtlProcessor { val logger = LoggerFactory.getLogger(this.getClass) def run(settings: Settings) = { val context = createContext(settings) val stateManager = new KafkaStateManager(settings.etl.state) val lastState = stateManager.readState logger.info(s"last persisted state: $lastState") val currState = stateManager.fetchNextState(lastState, settings) logger.info(s"batch working state: $currState") val rdd = createRDD(context, currState, settings) processRDD(rdd, currState.jobId, settings) logger.info("committing state") stateManager.commitState(currState) } private def createContext(settings: Settings) = { val sparkConf = new SparkConf() .setAppName(settings.spark.appName) .setAll(settings.spark.conf) new SparkContext(sparkConf) } private def createRDD(context: SparkContext, state: KafkaOffsetsState, settings: Settings): RDD[InboundMessage] = { KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, InboundMessage]( context, settings.kafka.properties, state.ranges.toArray, Map[TopicAndPartition, Broker](), (msgAndMeta: MessageAndMetadata[Array[Byte], Array[Byte]]) => { InboundMessage(msgAndMeta.topic, msgAndMeta.key(), msgAndMeta.message()) } ) } private def processRDD(kafkaRDD: RDD[InboundMessage], jobId: Long, settings: Settings) = { // passed to remote workers val etlSettings = settings.etl logger.info(s"RDD processing started [rdd=${kafkaRDD.id}, jobId=$jobId]") val rdd = settings.etl.maxNumOfOutputFiles.map(kafkaRDD.coalesce(_)).getOrElse(kafkaRDD) rdd.foreachPartition { partition => // executed at the worker new PartitionProcessor(jobId, TaskContext.get.partitionId(), etlSettings) .processPartition(partition) } logger.info(s"RDD processing ended [rdd=${kafkaRDD.id}, jobId=$jobId]") } }
Example 197
Source File: YahooParser.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.parsers import com.cloudera.sparkts.TimeSeries import com.cloudera.sparkts.TimeSeries._ import java.time._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object YahooParser { def yahooStringToTimeSeries( text: String, keyPrefix: String = "", zone: ZoneId = ZoneId.systemDefault()) : TimeSeries[String] = { val lines = text.split('\n') val labels = lines(0).split(',').tail.map(keyPrefix + _) val samples = lines.tail.map { line => val tokens = line.split(',') val dt = LocalDate.parse(tokens.head).atStartOfDay(zone) (dt, tokens.tail.map(_.toDouble)) }.reverse timeSeriesFromIrregularSamples(samples, labels, zone) } def yahooFiles( dir: String, sc: SparkContext, zone: ZoneId = ZoneId.systemDefault()) : RDD[TimeSeries[String]] = { sc.wholeTextFiles(dir).map { case (path, text) => YahooParser.yahooStringToTimeSeries(text, path.split('/').last, zone) } } }
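A minimal usage sketch for yahooStringToTimeSeries above, assuming the spark-timeseries library is on the classpath; the object name YahooParserExample and the tiny CSV string are illustrative.
import java.time.ZoneId
import com.cloudera.sparkts.parsers.YahooParser

object YahooParserExample {
  def main(args: Array[String]): Unit = {
    // A tiny Yahoo-style CSV: header row, then newest-first daily rows.
    val csv =
      """Date,Close
        |2015-01-02,105.0
        |2015-01-01,100.0""".stripMargin

    val ts = YahooParser.yahooStringToTimeSeries(csv, keyPrefix = "AAPL_", zone = ZoneId.of("UTC"))
    println(ts) // a TimeSeries[String] keyed by "AAPL_Close", with samples ordered oldest first
  }
}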
Example 198
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.streaming.datasource.config.ParametersUtils import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator} import org.apache.spark.{Logging, Partition, TaskContext} private[datasource] class DatasourceRDD( @transient sqlContext: SQLContext, inputSentences: InputSentences, datasourceParams: Map[String, String] ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils { private var totalCalculated: Option[Long] = None private val InitTableName = "initTable" private val LimitedTableName = "limitedTable" private val TempInitQuery = s"select * from $InitTableName" val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset => val parsedQuery = parseInitialQuery val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery) val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty) val limitSentence = inputSentences.extractLimitSentence sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence) } private def parseInitialQuery: String = { if (inputSentences.query.toUpperCase.contains("WHERE") || inputSentences.query.toUpperCase.contains("ORDER") || inputSentences.query.toUpperCase.contains("LIMIT") ) { sqlContext.sql(inputSentences.query).registerTempTable(InitTableName) TempInitQuery } else inputSentences.query } def progressInputSentences: InputSentences = { if (!dataFrame.rdd.isEmpty()) { inputSentences.offsetConditions.fold(inputSentences) { case offset => val offsetValue = if (offset.limitRecords.isEmpty) dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) else { dataFrame.registerTempTable(LimitedTableName) val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " + s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1" sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) } inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy( value = Option(offsetValue), operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator))))) } } else inputSentences } override def isEmpty(): Boolean = { totalCalculated.fold { withScope { partitions.length == 0 || take(1).length == 0 } } { total => total == 0L } } override def getPartitions: Array[Partition] = dataFrame.rdd.partitions override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context) override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart) }
Example 199
Source File: JsonInputStreamQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import scala.collection.mutable.SynchronizedQueue import scala.io.Source import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} object JsonInputStreamQuery { def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val streamSqlContext = new StreamSQLContext(ssc, new SQLContext(sc)) import streamSqlContext._ // Here we read data line by line from a given file and then put it into a queue DStream. // You can replace any kind of String type DStream here including kafka DStream. val queue = new SynchronizedQueue[RDD[String]]() Source.fromFile("src/main/resources/student.json").getLines().foreach(msg => queue.enqueue(sc.parallelize(List(msg)))) val queueDStream = ssc.queueStream[String](queue) // We can infer the schema of json automatically by using inferJsonSchema val schema = streamSqlContext.inferJsonSchema("src/main/resources/student.json") streamSqlContext.registerDStreamAsTable( streamSqlContext.jsonDStream(queueDStream, schema), "jsonTable") sql("SELECT * FROM jsonTable").print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 200
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }