org.apache.spark.rdd.RDD Scala Examples
The following examples show how to use org.apache.spark.rdd.RDD.
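Before the project-specific examples, here is a minimal, self-contained sketch of basic RDD usage. The local master, app name and sample data are illustrative assumptions, not taken from any example below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object RddBasics {
  def main(args: Array[String]): Unit = {
    // local[*] and the app name are placeholders for a quick local run
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-basics"))

    val numbers: RDD[Int] = sc.parallelize(1 to 10)

    // transformations (map) are lazy; the action reduce() triggers execution
    val sumOfSquares = numbers.map(n => n * n).reduce(_ + _)
    println(s"sum of squares: $sumOfSquares") // prints 385

    sc.stop()
  }
}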
Example 1
Source File: DeltaQA.scala From spark-tools with Apache License 2.0 | 12 votes |
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff = left - right
    val error = diff * diff
    DeltaCommon(
      count = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff = diff,
      error = error,
      left = left,
      right = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {

    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id.copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id.copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] = sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))
  }
}
Example 2
Source File: Test1.scala From BigData-News with Apache License 2.0 | 12 votes |
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)

    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)

    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)

    rdd1.join(rdd2)
      .foreach(x => println("学号: " + x._1 + " 名字: " + x._2._1 + " 分数: " + x._2._2))
  }
}
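For the sample data above, the pair-RDD join keeps only keys present in both RDDs, so the example prints three lines, one per student: (1, (东方不败, 99)), (2, (令狐冲, 98)) and (3, (林平之, 97)). The printed labels 学号, 名字 and 分数 mean student ID, name and score.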
Example 3
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
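The listing above shows only the SparkSessionSingleton helper; in the full SqlNetworkWordCount example it is used inside foreachRDD to turn each micro-batch RDD into a DataFrame and query it with SQL. A hedged sketch of that usage pattern follows, reusing the imports from the listing above; the host, port and batch interval are placeholders.

// Sketch only: reconstructs the typical usage of the singleton shown above.
case class Record(word: String)

val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(2))

val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))

words.foreachRDD { (rdd: RDD[String], time: Time) =>
  // reuse one SparkSession instead of creating a new one per micro-batch
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  import spark.implicits._

  rdd.map(w => Record(w)).toDF().createOrReplaceTempView("words")
  val wordCounts = spark.sql("select word, count(*) as total from words group by word")
  println(s"========= $time =========")
  wordCounts.show()
}

ssc.start()
ssc.awaitTermination()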
Example 4
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
}
Example 5
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.lib.TriangleCount import org.apache.spark.graphx.util.GraphGenerators import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphGeneration extends App { val conf = new SparkConf() .setAppName("Graph generation") .setMaster("local[4]") val sc = new SparkContext(conf) val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt") val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line => val field = line.split(" ") (field(0).toLong, field(1).toLong) } val edgeTupleGraph = Graph.fromEdgeTuples( rawEdges=rawEdges, defaultValue="") val gridGraph = GraphGenerators.gridGraph(sc, 5, 5) val starGraph = GraphGenerators.starGraph(sc, 11) val logNormalGraph = GraphGenerators.logNormalGraph( sc, numVertices = 20, mu=1, sigma = 3 ) logNormalGraph.outDegrees.map(_._2).collect().sorted val actorGraph = GraphLoader.edgeListFile( sc, "./ca-hollywood-2009.txt", true ).partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.edges.count() val actorComponents = actorGraph.connectedComponents().cache actorComponents.vertices.map(_._2).distinct().count val clusterSizes =actorComponents.vertices.map( v => (v._2, 1)).reduceByKey(_ + _) clusterSizes.map(_._2).max clusterSizes.map(_._2).min val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt") val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5) strongComponents.vertices.map(_._2).distinct().count val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges() val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.triangleCount() val triangles = TriangleCount.runPreCanonicalized(partitionedGraph) actorGraph.staticPageRank(10) val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001) actorPrGraph.vertices.reduce((v1, v2) => { if (v1._2 > v2._2) v1 else v2 }) actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println) actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10) actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count }
Example 6
Source File: PipePrintSampleCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipePrintSampleCorpus(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]]
  with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext => {
        val sample: Array[Tuple] = pc.corpus.takeSample(false, count)
        val table: Seq[Seq[String]] = createTupleTable(sample)
        log.info("Corpus sample of " + sample.size + " tuples: ")
        Table.printTable(table)
      }
    }
  }
}

object PipePrintSampleCorpus {
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintSampleCorpus(count)
  }
}
Example 7
Source File: PipeContextReadCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.PipeElement

import scala.reflect.ClassTag

class PipeContextReadCorpus[A: ClassTag] extends PipeElement[RDD[A], RDD[Tuple]] {

  def step(input: RDD[A])(implicit pipeContext: AbstractPipeContext): RDD[Tuple] = {
    pipeContext match {
      case pc: CorpusContext => pc.corpus
    }
  }
}

object PipeContextReadCorpus {
  def apply[A]() = new PipeContextReadCorpus()
}
Example 8
Source File: PipeAnalyseCorpus.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.corpus import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.visualisation.model.ReadingModel import de.unihamburg.vsis.sddf.pipe.context.ResultContext class PipeAnalyseCorpus extends PipeElementPassthrough[RDD[Tuple]] with Serializable { override val _analysable = new ReadingModel def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.tuples_=(input) pipeContext match { case pc: ResultContext => { pc.readingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseCorpus { def apply() = { new PipeAnalyseCorpus() } }
Example 9
Source File: PipeStoreInContextGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeStoreInContextGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => pc.goldstandard = input
    }
  }
}

object PipeStoreInContextGoldstandard {
  def apply() = new PipeStoreInContextGoldstandard()
}
Example 10
Source File: PipeReaderGoldstandardIdsPairs.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeReaderGoldstandardIdsPairs( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic) extends PipeElement[RDD[String], RDD[SymPair[Long]]] { override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Long]] = { inputRdd.map(line => { val parts = line.split(separator) val tupleId1 = idConverter.convert(parts(idIndex1).replaceAll("[^0-9]","")) val tupleId2 = idConverter.convert(parts(idIndex2).replaceAll("[^0-9]","")) new SymPair(tupleId1, tupleId2) }) } } object PipeReaderGoldstandardIdsPairs { def apply( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic) = { new PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter) } }
Example 11
Source File: PipeReaderGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.Pipeline import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple object PipeReaderGoldstandardPairs { def apply( separator: Char = ',', idIndex1: Int = 0, idIndex2: Int = 1, idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = { PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter) .append(PipeReaderGoldstandardIdToTuple()) } } object PipeReaderGoldstandardCluster { def apply( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = { PipeReaderGoldstandardIdsCluster(separator, clusterIdIndex, tupleIdIndex, idConverter) .append(PipeReaderGoldstandardIdToTuple()) } }
Example 12
Source File: PipeAnalyseGoldstandardCluster.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardClusterModel class PipeAnalyseGoldstandardCluster extends PipeElementPassthrough[RDD[Seq[Long]]] { override val _analysable = new GoldstandardClusterModel def substep(input: RDD[Seq[Long]])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.goldstandard = input pipeContext match { case pc: ResultContext => { pc.goldstandardModelCluster = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseGoldstandardCluster { def apply() = new PipeAnalyseGoldstandardCluster() }
Example 13
Source File: PipePrintSampleGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipePrintSampleGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler { def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val sample: Array[SymPair[Tuple]] = pc.goldstandard.takeSample(false, count) val table: Seq[Seq[String]] = createSymPairTable(sample) log.info("Goldstandard sample of " + sample.size + " tuples: ") Table.printTable(table) } } } } object PipePrintSampleGoldstandard { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintSampleGoldstandard(count) } }
Example 14
Source File: PipeReaderGoldstandardClusterOutput.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import java.util.regex.PatternSyntaxException import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import de.unihamburg.vsis.sddf.SddfContext.rddToRdd import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.IdConverter import de.unihamburg.vsis.sddf.reading.IdConverterBasic import de.unihamburg.vsis.sddf.reading.SymPair class PipeReaderGoldstandardClusterOutput( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic) extends PipeElement[RDD[String], RDD[Seq[Long]]] { override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = { // parse tuple ids val clusterIdTupleIdRdd = inputRdd.map(line => { val parts = line.split(separator) val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]","")) val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]","")) (clusterId, tupleId) }) clusterIdTupleIdRdd.groupByKey().map(_._2.toSeq) } } object PipeReaderGoldstandardClusterOutput { def apply( separator: Char = ',', clusterIdIndex: Int = 0, tupleIdIndex: Int = 1, idConverter: IdConverter = IdConverterBasic) = { new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter) } }
Example 15
Source File: PipeAnalyseGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardModel class PipeAnalyseGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable = new GoldstandardModel def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { _analysable.goldstandard = input pipeContext match { case pc: ResultContext => { pc.goldstandardModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseGoldstandard { def apply() = new PipeAnalyseGoldstandard() }
Example 16
Source File: PipePrintHeadGoldstandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.reading.goldstandard import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipePrintHeadGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[SymPair[Tuple]]] with PipeSampler { def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val sample: Array[SymPair[Tuple]] = pc.goldstandard.take(count) val table: Seq[Seq[String]] = createSymPairTable(sample) log.info("Goldstandard sample of " + sample.size + " tuples: ") Table.printTable(table) } } } } object PipePrintHeadGoldstandard { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintHeadGoldstandard(count) } }
Example 17
Source File: PipePrintHeadTuple.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.print import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table class PipePrintHeadTuple(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler { def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = { val sample: Array[Tuple] = input.take(count) val table: Seq[Seq[String]] = createTupleTable(sample) log.info("Sample of " + sample.size + " tuples: ") Table.printTable(table) } } object PipePrintHeadTuple { def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = { new PipePrintHeadTuple(count) } }
Example 18
Source File: PipeWordcount.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeWordcount() extends PipeElement[RDD[String], RDD[(String, Int)]] {

  def step(input: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[(String, Int)] = {
    // flatten the collection of word arrays
    val words = input.flatMap(line => line.split(" "))
    // initialize the counter of each word with one
    val wordsWithCounter = words.map(word => (word, 1))
    // add up all counters of the same word
    wordsWithCounter.reduceByKey(_ + _)
  }
}

// companion object for a better usability
object PipeWordcount {
  def apply() = new PipeWordcount()
}
Example 19
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.aggregator.Mean import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipeClusteringGraph extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]] with Serializable { def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = { val duplicatePairsWithSimilarity = input.map( pair => (pair._1, Mean.agrSimilarity(pair._2)) ) val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map( pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) } ) // TODO optimize: it would be nice to build the graph only by using edge triplets // but as far as I know that's not possible val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap( tuplePair => Seq(tuplePair._1, tuplePair._2) ).map(tuple => (tuple.id, tuple)) // delete all duplicate vertices val vertices = verticesNotUnique.distinct() // The edge type Boolean is just a workaround because no edge types are needed val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null) cluster(graph) } }
Example 20
Source File: PipeAnalyseClustering.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext class PipeAnalyseClustering extends PipeElementPassthrough[RDD[Set[Tuple]]] { override val _analysable = new ClusterModel def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with ResultContext => { _analysable.clusters = input _analysable.goldstandard = pc.goldstandard pc.clusterModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseClustering { def apply() = { new PipeAnalyseClustering() } }
Example 21
Source File: PipeWriterTupleCluster.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeWriterTupleCluster(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[Set[Tuple]]] { def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { val writer = new TupleWriterFile(file, separator) // TODO write tuples to hdfs in parallel and merge them afterwards val collected = input.collect() collected.foreach(set => { set.foreach(tuple => { writer.writeTuple(tuple) }) writer.blankLine() }) writer.close() } } object PipeWriterTupleCluster { def apply(file: File, separator: Char = ',') = { new PipeWriterTupleCluster(file, separator) } }
Example 22
Source File: PipeWriterTuplePairs.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeWriterTuplePairs(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { val writer = new TupleWriterFile(file, separator) val collected = input.collect() collected.foreach(pair => { writer.writeTuple(pair._1) writer.writeTuple(pair._2) writer.blankLine() }) writer.close() } } object PipeWriterTuplePairs { def apply(file: File, separator: Char = ',') = { new PipeWriterTuplePairs(file, separator) } }
Example 23
Source File: ClusterWriterCsvFile.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import java.io.FileWriter import org.apache.spark.rdd.RDD import com.opencsv.CSVWriter import de.unihamburg.vsis.sddf.reading.Tuple class ClusterWriterCsvFile(file: File, separator: Char = ',') { // create folders file.getParentFile().mkdirs() def this(path: String) = { this(new File(path)) } def this(folder: String, file: String) = { this(new File(folder, file)) } def write(clusterRdd: RDD[Set[Tuple]]): Unit = { val collectedClusters = clusterRdd.collect() val writer = new CSVWriter(new FileWriter(file), separator); // feed in your array (or convert your data to an array) collectedClusters.foreach(set => { val tupleIdSet: Set[String] = set.map(tuple => tuple.id.toString()) val tupleIdArray: Array[String] = tupleIdSet.toArray writer.writeNext(tupleIdArray) }) writer.close() } }
Example 24
Source File: TupleWriterFile.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.writing import java.io.File import java.io.FileWriter import org.apache.spark.rdd.RDD import com.opencsv.CSVWriter import de.unihamburg.vsis.sddf.reading.Tuple class TupleWriterFile(file: File, separator: Char = ',') { val writer = new CSVWriter(new FileWriter(file), separator); def writeTuple[A <: Tuple](tuple: A): Unit = { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) } def close() = { writer.close() } def blankLine() = { writer.writeNext(Array()) } def writeTuple[A <: Tuple](tuples: Traversable[A]): Unit = { tuples.foreach(tuple => { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) }) } def writeTuple[A <: Tuple](tuples: RDD[A]): Unit = { val collectedTuples = tuples.collect() collectedTuples.foreach(tuple => { writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray) }) } }
Example 25
Source File: DummyIndexer.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

class PipeIndexerDummy extends IndexingPipe {

  override val name = "DummyIndexer"

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Tuple]] = {
    val cartesian = input.cartesian(input).map(new SymPair(_))
    // filter identities like (a,a) and symmetric duplicates like (a,b) && (b,a)
    cartesian.filter(pair => pair._1 != pair._2).distinct()
  }
}

object PipeIndexerDummy {
  def apply() = {
    new PipeIndexerDummy()
  }
}
Example 26
Source File: PipeAnalyseIndexer.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel class PipeAnalyseIndexer extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable: IndexingModel = new IndexingModel def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: CorpusContext with ResultContext => { _analysable.pairs = input _analysable.corpus = pc.corpus pc.indexingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseIndexer { def apply() = new PipeAnalyseIndexer }
Example 27
Source File: PipeIndexerSortedNeighborhood.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSortedNeighborhood
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

object PipeIndexerSortedNeighborhood {

  def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = {
    PipeBlockerSortedNeighborhood(windowSize)
      .append(SortedNeighborhoodIndexer())
  }

  // Note: in the original source this helper belongs to a declaration that was cut from
  // the listing; it is placed in the companion object here so the snippet stays well-formed.
  def calcPairCount(elementCount: Int, windowSize: Int): Int = {
    val windowCount = elementCount - windowSize + 1
    val firstWindowPairs = (windowSize * (windowSize - 1)) / 2
    val lastWindowPairs = (windowCount - 1) * (windowSize - 1)
    firstWindowPairs + lastWindowPairs
  }
}
Example 28
Source File: PipeAnalyseIndexerExtended.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.IndexingModelExtended class PipeAnalyseIndexerExtended extends PipeElementPassthrough[RDD[SymPair[Tuple]]] { override val _analysable: IndexingModelExtended = new IndexingModelExtended def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with CorpusContext with ResultContext => { _analysable.pairs = input _analysable.goldstandard = pc.goldstandard _analysable.corpus = pc.corpus pc.indexingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseIndexerExtended { def apply() = new PipeAnalyseIndexerExtended }
Example 29
Source File: PipeAnalyseBlocker.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.visualisation.model.BlockingModel class PipeAnalyseBlocker extends PipeElementPassthrough[RDD[Seq[Tuple]]] { override val _analysable: BlockingModel = new BlockingModel def substep(input: RDD[Seq[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext with CorpusContext with ResultContext => { _analysable.blocks = input pc.blockingModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseBlocker { def apply() = new PipeAnalyseBlocker }
Example 30
Source File: PipeBlockerStandard.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// The class header was cut from this listing; the declaration below is reconstructed from the
// companion object and the imports (the BlockingPipe, Parameterized and Logging mixins are assumptions).
class PipeBlockerStandard(implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = input.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey
    keyBlocks.map(_._2.toSeq).filter(_.size > 1)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name

  override val name = "StandardBlocker"
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)
}

object PipeBlockerStandard {
  def apply(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerStandard()
  }
}
Example 31
Source File: PipeBlockerSortedNeighborhood.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking import org.apache.spark.mllib.rdd.RDDFunctions.fromRDD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable class PipeBlockerSortedNeighborhood(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) extends BlockingPipe with Parameterized { def step(tuples: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = { val bkvTuplePairs: RDD[(String, Tuple)] = tuples.map(t => (bkvBuilder.buildBlockingKey(t), t)) val sortedPairs = bkvTuplePairs.sortByKey().map(_._2) sortedPairs.sliding(windowSize).map(_.toSeq) } @transient override val _analysable = new AlgoAnalysable _analysable.algo = this _analysable.name = this.name override val name = "SortedNeighborhoodBlocker" override val paramMap = Map("windowSize" -> windowSize, "BlockingKeyBuilder" -> bkvBuilder) } object PipeBlockerSortedNeighborhood { def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = { new PipeBlockerSortedNeighborhood(windowSize) } }
Example 32
Source File: PipeBlockerSuffixArray.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// The class header and the suffix-array blocking step itself were cut from this listing; the
// declaration below is reconstructed from the companion object (the mixins are assumptions).
class PipeBlockerSuffixArray(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
  implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  // ... the blocking step method is not shown in this listing ...

  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
      false
    } else if (tupleCount < 2) {
      false
    } else {
      true
    }
  }
}

object PipeBlockerSuffixArray {
  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
  }
}
Example 33
Source File: SddfPipeContext.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.context import org.apache.spark.rdd.RDD import org.joda.time.Period import de.unihamburg.vsis.sddf.visualisation.ModelRouter import de.unihamburg.vsis.sddf.visualisation.logger.ModelRouterLogging class SddfPipeContext( val name: String = "Unnamed Pipeline", modelRouter: ModelRouter = ModelRouterLogging) extends AbstractPipeContext(modelRouter) with CorpusContext with GoldstandardContext with ResultContext { var runtime: Option[Period] = None var filepath: Option[String] = None val persistedRDDs = new scala.collection.mutable.HashMap[String, RDD[_]]() }
Example 34
Source File: PipeOptimizeUnpersist.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.optimize import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext class PipeOptimizeUnpersist[A](rddname: String) extends PipeElementPassthrough[RDD[A]] { def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: SddfPipeContext => { val rddOption = pc.persistedRDDs.get(rddname) if (rddOption.isDefined) { rddOption.get.unpersist() analysable.values += ("RDD unpersisted" -> rddname) } else { log.warn("Can't unpersist RDD with the name " + rddname) } } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeOptimizeUnpersist { def apply[A](rddname: String) = { new PipeOptimizeUnpersist[A](rddname) } }
Example 35
Source File: PipeOptimizePersistAndName.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.pipe.optimize import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext class PipeOptimizePersistAndName[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PipeElementPassthrough[RDD[A]] { def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: SddfPipeContext => { input.persist(newLevel) if(rddname != null){ input.name = rddname pc.persistedRDDs += (rddname -> input) analysable.values += ("name" -> rddname) } } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeOptimizePersistAndName { def apply[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) = { new PipeOptimizePersistAndName[A](rddname, newLevel) } }
Example 36
Source File: RddUtils.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.sparkextensions

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object RddUtils {

  def securlyZipRdds[A, B: ClassTag](rdd1: RDD[A], rdd2: RDD[B]): RDD[(A, B)] = {
    val rdd1Repartitioned = rdd1.repartition(1)
    val rdd2Repartitioned = rdd2.repartition(1)
    val (rdd1Balanced, rdd2Balanced) = balanceRddSizes(rdd1Repartitioned, rdd2Repartitioned)
    rdd1Balanced.zip(rdd2Balanced)
  }

  def balanceRddSizes[A, B](rdd1: RDD[A], rdd2: RDD[B]): (RDD[A], RDD[B]) = {
    val rdd1count = rdd1.count()
    val rdd2count = rdd2.count()
    val difference = math.abs(rdd1count - rdd2count).toInt
    if (rdd1count > rdd2count) {
      (removeRandomElements(rdd1, difference), rdd2)
    } else if (rdd2count > rdd1count) {
      (rdd1, removeRandomElements(rdd2, difference))
    } else {
      (rdd1, rdd2)
    }
  }

  def removeRandomElements[A](rdd: RDD[A], numberOfElements: Int): RDD[A] = {
    val sample: Array[A] = rdd.takeSample(false, numberOfElements)
    val set: Set[A] = Set(sample: _*)
    rdd.filter(x => if (set.contains(x)) false else true)
  }
}
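A short usage sketch for the helper above; the sample data and local setup are assumptions. RDD.zip normally requires both RDDs to contain the same number of elements per partition, so securlyZipRdds first repartitions both sides to a single partition and randomly trims the larger RDD before zipping.

import org.apache.spark.{SparkConf, SparkContext}

import de.unihamburg.vsis.sddf.sparkextensions.RddUtils

object RddUtilsUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("zip-demo"))

    val left = sc.parallelize(Seq("a", "b", "c", "d"))
    val right = sc.parallelize(Seq(1, 2, 3))

    // left.zip(right) would fail because the RDDs differ in length;
    // securlyZipRdds drops one random surplus element from the larger RDD first
    val zipped = RddUtils.securlyZipRdds(left, right)
    zipped.collect().foreach(println) // three (String, Int) pairs

    sc.stop()
  }
}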
Example 37
Source File: PipePrintHeadFalsePositives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintHeadFalsePositives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { input.subtract(goldstandard) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.take(count) } def logMessage(count: Int): String = { "Printing " + count + " first false positives. (duplicate pairs which were not found)" } } object PipePrintHeadFalsePositives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintHeadFalsePositives(count) } }
Example 38
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.NaiveBayesModel class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("lambda", lambda)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = NaiveBayes.train(trainingData, lambda) log.debug("Classification Model:" + model) log.debug("Classification Model labels :" + model.labels.mkString(" ")) log.debug("Classification Model pi: " + model.pi.mkString(" ")) log.debug("Classification Model theta: " + model.theta.foreach(_.mkString(" "))) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationNaiveBayes { def apply(lambda: Double = 1.0) = { new PipeClassificationNaiveBayes(lambda) } }
Example 39
Source File: PipeClassificationTrainingDataGenerator.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.compat.Platform

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator
import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeClassificationTrainingDataGenerator(
  truePositiveCount: Int = 500,
  trueNegativeCount: Int = 500)(
  implicit featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])]
  with Logging {

  override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext => {
        var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble
        var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble
        log.debug("True positive pair fraction taken from the gold standard for training purposes: "
          + truePositiveFraction)
        log.debug("True negative pair fraction taken from the corpus for training purposes: "
          + trueNegativeFraction)
        if (truePositiveFraction > 1.0) {
          truePositiveFraction = 1.0
          log.debug("True positive pair fraction limited to 1.0")
        }
        if (trueNegativeFraction > 1.0) {
          trueNegativeFraction = 1.0
          log.debug("True negative pair fraction limited to 1.0")
        }
        val result = generateTrainingData(pc.corpus, pc.goldstandard,
          truePositiveFraction, trueNegativeFraction)
        (input, result)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

  // generateTrainingData(corpus, goldstandard, truePositiveFraction, trueNegativeFraction)
  // is defined in the original class but was cut from this listing.
}

object PipeClassificationTrainingDataGenerator {

  val All = -1

  def apply(
    truePositiveCount: Int = 500,
    trueNegativeCount: Int = 500)(
    implicit featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount)
  }
}
Example 40
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable import de.unihamburg.vsis.sddf.Parameterized import org.apache.spark.mllib.classification.ClassificationModel class PipeClassificationDecisionTree( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = DecisionTree.trainClassifier(trainingData, numClasses = 2, categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins) log.debug("Decision Tree Model:" + model) log.debug("Decision Tree:" + model.toDebugString) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationDecisionTree { def apply( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) = { new PipeClassificationDecisionTree(impurity, maxDepth, maxBins) } }
Example 41
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.SVMWithSGD class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("numIterations", numIterations)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = SVMWithSGD.train(trainingData, numIterations) log.debug("Classification Model:" + model) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationSvm { def apply(numIterations: Int = 100) = { new PipeClassificationSvm(numIterations) } }
Example 42
Source File: PipePrintHeadFalseNegatives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintHeadFalseNegatives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { goldstandard.subtract(input) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.take(count) } def logMessage(count: Int): String = { "Printing " + count + " first false negatives. (duplicate pairs which are no duplicates)" } } object PipePrintHeadFalseNegatives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintHeadFalseNegatives(count) } }
Example 43
Source File: PipePrintSampleFalseNegatives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintSampleFalseNegatives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { goldstandard.subtract(input) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.takeSample(false, count) } def logMessage(count: Int): String = { "Sampling " + count + " false negatives. (duplicate pairs which are no duplicates)" } } object PipePrintSampleFalseNegatives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintSampleFalseNegatives(count) } }
Example 44
Source File: PipeAnalyseClassificationTraining.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel class PipeAnalyseClassificationTraining extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] { override val _analysable: TrainingSetModel = new TrainingSetModel def substep( input: (SymPairSim, RDD[LabeledPoint]))( implicit pipeContext: AbstractPipeContext): Unit = { _analysable.trainingsSetLabeled = input._2 pipeContext match { case pc: ResultContext => { pc.trainingSetModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseClassificationTraining { def apply() = new PipeAnalyseClassificationTraining }
Example 45
Source File: PipePrintSampleFalsePositives.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple class PipePrintSampleFalsePositives( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends AbstractPipePrintFalseTuples(count) { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = { input.subtract(goldstandard) } def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = { falseTuplesWithSimilarity.takeSample(false, count) } def logMessage(count: Int): String = { "Sampling " + count + " false positives. (duplicate pairs which were not found)" } } object PipePrintSampleFalsePositives { def apply( count: Int = 10)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipePrintSampleFalsePositives(count) } }
Example 46
Source File: AbstractPipeClassification.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable abstract class AbstractPipeClassification() extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim] with Parameterized { override val _analysable = new AlgoAnalysable _analysable.algo = this def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = { pipeContext match { case pc: CorpusContext with GoldstandardContext => { val symPairSim = input._1 val trainingsSet = input._2 val prediction = trainModelAndClassify(trainingsSet, symPairSim) val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2)) duplicatePairs } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } }
Example 47
Source File: AbstractPipePrintFalseTuples.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.SparkContext.rddToPairRDDFunctions import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.PipeSampler import de.unihamburg.vsis.sddf.visualisation.Table import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipePrintFalseTuples( count: Int)( implicit featureIdNameMapping: FeatureIdNameMapping, featureMeasures: Array[(Int, StringMetric[Double])]) extends PipeElementPassthrough[RDD[(SymPair[Tuple], Array[Double])]] with PipeSampler { def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]): RDD[SymPair[Tuple]] def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]): Array[(SymPair[Tuple], Array[Double])] def logMessage(count: Int): String def substep(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): Unit = { pipeContext match { case pc: GoldstandardContext => { val falseTuples = selectFalseTuples(pc.goldstandard, input.map(_._1)) if (falseTuples.count > 0) { val dummyValue: RDD[(SymPair[Tuple], Int)] = falseTuples.map((_, 1)) val join: RDD[(SymPair[Tuple], (Int, Option[Array[Double]]))] = dummyValue.leftOuterJoin(input) val falsePositivesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])] = join.map(pair => { (pair._1, pair._2._2.getOrElse(Array())) }) val falseTuplesSample = filterFalseTuplesForOutput(falsePositivesWithSimilarity) val table = createSymPairSimVectorTable(falseTuplesSample) log.info(logMessage(count)) Table.printTable(table) } else { log.info(logMessage(0)) } } } } }
Example 48
Source File: ExactDuplicateFilter.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.tools import java.io.File import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.logging.Logging import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.corpus.PipeStoreInContextCorpus import de.unihamburg.vsis.sddf.reading.corpus.PipePrintSampleCorpus import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv import de.unihamburg.vsis.sddf.writing.TupleWriterFile object ExactDuplicateFilter extends App with Logging { if (args.size == 1 && (new File(args(0))).exists()) { val conf = new SparkConf().setAppName("ExactDuplicateFilter") conf.setMaster("local") val sc = new SparkContext(conf) implicit val pipeContext = new SddfPipeContext val Content: (Int, String) = (0, "content") val featureMapping: Map[Int, String] = Map(Content) implicit val featureIdNameMapper = new FeatureIdNameMapping(featureMapping) val inputFileKey = "musicbrainz" // Parse Tuples val allFields: Seq[Int] = Seq(Content._1) val allFieldsWithId: Seq[Int] = Id +: allFields val parserPipe = new PipeTupleParserCsvIdContent(allFieldsWithId) val pipe = parserPipe.append(PipeStoreInContextCorpus()).append(PipePrintSampleCorpus()) pipe.start(sc.textFile(args(0))) val result: RDD[Tuple] = parserPipe.output.get val resultCount = result.count log.info("Lines parsed: " + resultCount) val distinct = result.distinct() val distinctCount = distinct.count log.info("Distinct Lines Count: " + distinctCount) log.info("Lines removed: " + (resultCount - distinctCount)) val tupleWriter = new TupleWriterFile(new File(args(0) + ".distinct")) tupleWriter.writeTuple(distinct) } else { println("Please provide a valid file path.") } } class PipeTupleParserCsvIdContent(featureIds: Seq[Int]) extends PipeReaderTupleCsv(featureIds) { override def extractValues(line: String): Seq[String] = { val splitted = parser.parseLine(line) Seq(splitted.head, splitted.tail.mkString(",")) } }
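The heart of the example above is a plain distinct-and-count pass over an RDD. A minimal self-contained sketch of that pattern, with the input and output paths as placeholder values:

import org.apache.spark.{SparkConf, SparkContext}

object DistinctLineFilterSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DistinctLineFilterSketch").setMaster("local[*]"))
    val lines = sc.textFile("/tmp/records.csv") // placeholder input path
    val total = lines.count()
    val distinctLines = lines.distinct()
    val kept = distinctLines.count()
    println(s"Lines parsed: $total, distinct lines: $kept, removed: ${total - kept}")
    distinctLines.saveAsTextFile("/tmp/records.csv.distinct") // placeholder output path
    sc.stop()
  }
}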
Example 49
Source File: PipeGoldstandardReaderClusterTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.reading.goldstandard import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdToTuple import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdsCluster import de.unihamburg.vsis.sddf.test.util.FixtureHelper import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class PipeReaderGoldstandardClusterTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with FixtureHelper { test("test goldstandard tuple reading in cluster format") { // format clusterId, tupleId val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3")) val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val gsIds = gsReaderPipe.output.get assert(gsIds.count() === 1) val tuples: Seq[Tuple] = initializeTuples(1, 3) pc.corpus = sc.parallelize(tuples) val gsconverterPipe = new PipeReaderGoldstandardIdToTuple gsconverterPipe.start(gsIds) val gsTuple = gsconverterPipe.output.get assert(gsTuple.count() === 1) } test("test goldstandard id reading in cluster format") { // format clusterId, tupleId val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3")) val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val result = gsReaderPipe.output.get assert(result.count() === 1) } test("test goldstandard cluster reader from file") { val input = sc.textFile("src/test/resources/musicbrainz-1000.csv.dup") val gsReaderPipe = PipeReaderGoldstandardIdsCluster() gsReaderPipe.start(input) val result = gsReaderPipe.output.get assert(result.collect().size === 13) } }
Example 50
Source File: StrongestPathClusteringTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.clustering import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.clustering.PipeClusteringStrongestPath import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.FixtureHelper import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class StrongestPathClusteringTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with FixtureHelper { test("simple cluster test") { val pair1 = (createTuplePair(1, 2), Array(0.4, 0.6)) val pair2 = (createTuplePair(2, 4), Array(0.1, 0.2)) val pair3 = (createTuplePair(4, 3), Array(0.6, 0.8)) val pair4 = (createTuplePair(3, 1), Array(0.0, 0.2)) val pairs: RDD[(SymPair[Tuple], Array[Double])] = sc.parallelize(Seq(pair1, pair2, pair3, pair4)) val clusterer = new PipeClusteringStrongestPath clusterer.start(pairs) val clusterResult: Array[Set[Tuple]] = clusterer.output.get.collect() val expectedResult = Array(Set(pair1._1._1, pair1._1._2), Set(pair3._1._1, pair3._1._2)) assert(clusterResult === expectedResult) } }
Example 51
Source File: ClusterAnalyserTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.evaluation

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel

class ClusterAnalyserTest extends FunSuite with LocalSparkContext with FixtureHelper {

  test("Precision and recall test") {
    val analyser = new ClusterModel
    analyser.clusters = buildClusters()
    analyser.goldstandard = buildGoldstandard()
    assert(analyser.precision === 0.2857142857142857) // should be 2/7
    assert(analyser.recall === 0.6666666666666666) // should be 2/3
  }

  def buildClusters(): RDD[Set[Tuple]] = {
    val cluster1 = initializeTuples(0, 2).toSet
    val cluster2 = initializeTuples(3, 4).toSet
    val cluster3 = initializeTuples(5, 7).toSet
    sc.parallelize(Seq(cluster1, cluster2, cluster3))
  }

  def buildGoldstandard(): RDD[SymPair[Tuple]] = {
    val pair1 = createTuplePair(0, 1)
    val pair2 = createTuplePair(4, 7)
    val pair3 = createTuplePair(6, 7)
    sc.parallelize(Seq(pair1, pair2, pair3))
  }
}
Example 52
Source File: SparkApiTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test import org.apache.spark.rdd.RDD import org.scalatest.Finders import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.pairToInt import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.MusicbrainzSchema class SparkApiTest extends FunSuite with LocalSparkContext with MusicbrainzSchema { test("test rdd substraction") { val file1 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup") val file2 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup") val data1 = parseTuples(file1) assert(data1.count() === 10) val data2 = parseTuples(file2) assert(data2.count() === 10) val substraction = data1.subtract(data2) assert(substraction.count() === 0) } }
Example 53
Source File: PipeDecisionTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.LocalSparkContext class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{ var input: (SymPairSim, RDD[LabeledPoint]) = _ override def beforeAll() { super.beforeAll() val tuple1 = Tuple("test1","test1","test1") tuple1.id = 1 val tuple2 = Tuple("test2","test2","test2") tuple2.id = 2 val tuple3 = Tuple("hans","franz","wurst") tuple3.id = 3 val symPairSim: SymPairSim = sc.parallelize(Seq( (new SymPair(tuple1, tuple2), Array(1D,1D,0D)) ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D)) )) val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq( LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91))) )) input = (symPairSim, trainingData) } override def afterAll() { super.afterAll() } test("naive bayes classification test") { val classificationPipe = new PipeClassificationNaiveBayes() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("svm classification test") { val classificationPipe = new PipeClassificationSvm() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("decision tree classification test") { val classificationPipe = new PipeClassificationDecisionTree() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } }
Example 54
Source File: MusicbrainzSchema.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.util import org.apache.spark.rdd.RDD import org.scalatest.Suite import de.unihamburg.vsis.sddf.SddfContext.pairToInt import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv trait MusicbrainzSchema extends TestSddfPipeContext { self: Suite => val Number = (0, "number") val Title = (1, "title") val Length = (2, "length") val Artist = (3, "artist") val Album = (4, "album") val Year = (5, "year") val Language = (6, "language") val featureIdNameMapping = Map(Number, Title, Length, Artist, Album, Year, Language) implicit val featureIdNameMapper = new FeatureIdNameMapping(featureIdNameMapping) def parseTuples(input: RDD[String]) = { // Parse Tuples val allFields: Seq[Int] = Seq(Number, Title, Length, Artist, Album, Year, Language) val allFieldsWithId: Seq[Int] = Ignore +: Id +: Ignore +: allFields val pipe = PipeReaderTupleCsv(allFieldsWithId) .append(PipePreprocessorTrim(allFields: _*)) .append(PipePreprocessorRemoveRegex("[^0-9]", Number, Year, Length)) pipe.run(input) } }
Example 55
Source File: SortedNeighbourhoodBlockerTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.blocking import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import org.scalatest.Matchers import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood class SortedNeighborhoodIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext with Matchers { test("testing whole Sorted Neighborhood Indexer") { val featureId = 1 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blubluba") tuple1.id = 1 val tuple2: Tuple = new TupleArray(1) tuple2.addFeature(0, "blubluba") tuple2.id = 2 val tuple3: Tuple = new TupleArray(1) tuple3.addFeature(0, "blubluba") tuple3.id = 3 val tuple4: Tuple = new TupleArray(1) tuple4.addFeature(0, "blubluba") tuple4.id = 4 val tuple5: Tuple = new TupleArray(1) tuple5.addFeature(0, "blubluba") tuple5.id = 5 val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3, tuple4, tuple5)) val indexer = PipeIndexerSortedNeighborhood(windowSize = 3) val blockingResult: RDD[SymPair[Tuple]] = indexer.run(tuples) assert(blockingResult.count === 7) val resultArray = blockingResult.collect() resultArray.foreach(println(_)) val expectedResult = Seq( new SymPair(tuple1, tuple2), new SymPair(tuple1, tuple3), new SymPair(tuple2, tuple3), new SymPair(tuple2, tuple4), new SymPair(tuple3, tuple4), new SymPair(tuple3, tuple5), new SymPair(tuple4, tuple5) ) resultArray should contain theSameElementsAs expectedResult } }
Example 56
Source File: SuffixArrayBlockingTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.blocking import org.apache.spark.rdd.RDD import org.scalatest.Finders import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.indexing.PipeIndexerSuffixArray import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSuffixArray import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.reading.TupleArray import de.unihamburg.vsis.sddf.test.util.LocalSparkContext import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext class SuffixArrayIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext { test("testing suffix calculation") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blockingkeyvalue") tuple1.id = 1 val tuples: RDD[Tuple] = sc.parallelize(Seq(tuple1)) val sab = PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12) val suffixTuplePairs: Seq[(String, Tuple)] = sab.calcSuffixes(("blockingkeyvalue", tuple1)) // println(suffixTuplePairs.map(_._1).mkString("\n")) assert(suffixTuplePairs.length === 13) } test("testing filter blocks") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blockingkeyvalue") tuple1.id = 1 val tuples = sc.parallelize(Seq(tuple1)) val sab = new PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 4) val suffixTuplePair = ("bla", Seq(tuple1, tuple1, tuple1, tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair) === false) val suffixTuplePair2 = ("bla", Seq(tuple1, tuple1, tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair2) === true) val suffixTuplePair3 = ("bla", Seq(tuple1)) assert(sab.filterBlocks(suffixTuplePair3) === false) val suffixTuplePair4 = ("bla", Seq(tuple1, tuple1)) assert(sab.filterBlocks(suffixTuplePair4) === true) } test("testing whole SAB") { val featureId = 0 implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6)) val tuple1: Tuple = new TupleArray(1) tuple1.addFeature(0, "blubluba") tuple1.id = 1 val tuple2: Tuple = new TupleArray(1) tuple2.addFeature(0, "blubluba") tuple2.id = 2 val tuple3: Tuple = new TupleArray(1) tuple3.addFeature(0, "blubluba") tuple3.id = 3 val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3)) val sab = PipeIndexerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12) val blockingResult: RDD[SymPair[Tuple]] = sab.run(tuples) // print(blockingResult.collect().map(symPair => (symPair._1.id, symPair._2.id)).mkString("\n")) assert(blockingResult.count === 3) } }
Example 57
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 58
Source File: TestFFM.scala From spark-ffm with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification._ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.DenseVector import org.apache.spark.rdd.RDD object TestFFM extends App { override def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TESTFFM").setMaster("local[4]")) if (args.length != 8) { println("testFFM <train_file> <k> <n_iters> <eta> <lambda> " + "<normal> <random>") } val data= sc.textFile(args(0)).map(_.split("\\s")).map(x => { val y = if(x(0).toInt > 0 ) 1.0 else -1.0 val nodeArray: Array[(Int, Int, Double)] = x.drop(1).map(_.split(":")).map(x => { (x(0).toInt, x(1).toInt, x(2).toDouble) }) (y, nodeArray) }).repartition(4) val splits = data.randomSplit(Array(0.7, 0.3)) val (training: RDD[(Double, Array[(Int, Int, Double)])], testing) = (splits(0), splits(1)) //sometimes the max feature/field number would be different in training/testing dataset, // so use the whole dataset to get the max feature/field number val m = data.flatMap(x=>x._2).map(_._1).collect.reduceLeft(_ max _) //+ 1 val n = data.flatMap(x=>x._2).map(_._2).collect.reduceLeft(_ max _) //+ 1 val ffm: FFMModel = FFMWithAdag.train(training, m, n, dim = (args(6).toBoolean, args(7).toBoolean, args(1).toInt), n_iters = args(2).toInt, eta = args(3).toDouble, regParam = (args(4).toDouble, args(5).toDouble), normalization = false, false, "adagrad") val scores: RDD[(Double, Double)] = testing.map(x => { val p = ffm.predict(x._2) val ret = if (p >= 0.5) 1.0 else -1.0 (ret, x._1) }) val metrics = new BinaryClassificationMetrics(scores) val auROC = metrics.areaUnderROC val auPRC = metrics.areaUnderPR val accuracy = scores.filter(x => x._1 == x._2).count().toDouble / scores.count() println(s"accuracy = $accuracy, Area under ROC = $auROC, Area under precision-recall curve = $auPRC") } }
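The evaluation step above only needs an RDD of (score, label) pairs. A standalone sketch of that part with made-up predictions, using 0.0/1.0 labels:

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.{SparkConf, SparkContext}

object MetricsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MetricsSketch").setMaster("local[2]"))
    // (score, label) pairs; the values here are synthetic
    val scoreAndLabels = sc.parallelize(Seq(
      (0.9, 1.0), (0.8, 1.0), (0.6, 0.0), (0.4, 1.0), (0.2, 0.0), (0.1, 0.0)
    ))
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    println(s"Area under ROC = ${metrics.areaUnderROC()}")
    println(s"Area under precision-recall curve = ${metrics.areaUnderPR()}")
    sc.stop()
  }
}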
Example 59
Source File: InferSchema.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.spark.excel import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ private[excel] object InferSchema { type CellType = Int private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = { // Defining a function to return the StringType constant is necessary in order to work around // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions; // see issue #128 for more details. def stringType(): DataType = { StringType } if (field == NullType) { typeSoFar } else { (typeSoFar, field) match { case (NullType, ct) => ct case (DoubleType, DoubleType) => DoubleType case (BooleanType, BooleanType) => BooleanType case (TimestampType, TimestampType) => TimestampType case (StringType, _) => stringType() case (_, _) => stringType() } } } private val numericPrecedence: IndexedSeq[DataType] = IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType) val findTightestCommonType: (DataType, DataType) => Option[DataType] = { case (t1, t2) if t1 == t2 => Some(t1) case (NullType, t1) => Some(t1) case (t1, NullType) => Some(t1) case (StringType, t2) => Some(StringType) case (t1, StringType) => Some(StringType) // Promote numeric types to the highest of the two and all numeric types to unlimited decimal case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) => val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2) Some(numericPrecedence(index)) case _ => None } }
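A small sketch of how the type-merging logic above behaves. Since InferSchema is declared private[excel], the sketch assumes it lives in the same package; the expected results follow directly from the case analysis shown above.

package com.webank.wedatasphere.spark.excel

import org.apache.spark.sql.types._

object InferSchemaSketch extends App {
  // Numeric types promote to the highest member of numericPrecedence
  println(InferSchema.findTightestCommonType(IntegerType, LongType))     // Some(LongType)
  println(InferSchema.findTightestCommonType(DoubleType, TimestampType)) // Some(TimestampType)
  // Anything merged with StringType falls back to StringType
  println(InferSchema.findTightestCommonType(StringType, DoubleType))    // Some(StringType)
  // NullType is absorbed by the other side
  println(InferSchema.findTightestCommonType(NullType, BooleanType))     // Some(BooleanType)
}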
Example 60
Source File: DatabaseInteraction.scala From reactive-machine-learning-systems with MIT License | 5 votes |
package com.reactivemachinelearning import com.couchbase.client.java.document.JsonDocument import com.couchbase.client.java.view.ViewQuery import com.couchbase.spark._ import com.reactivemachinelearning.FeatureGeneration.{IntFeature, BooleanFeature, Feature} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object DatabaseInteraction extends App { // Configure Spark val conf = new SparkConf() .setAppName("couchbaseQuickstart") .setMaster("local[*]") .set("com.couchbase.bucket.default", "") // Generate The Context val sc = new SparkContext(conf) val rawSquawks: RDD[JsonDocument] = sc.couchbaseView( ViewQuery.from("squawks", "by_squawk_id")) .map(_.id) .couchbaseGet[JsonDocument]() rawSquawks.foreach(println) def extract(rawSquawks: RDD[JsonDocument]): RDD[IntFeature] = { ??? } def transform(inputFeatures: RDD[IntFeature]): RDD[BooleanFeature] = { ??? } val trainableFeatures = transform(extract(rawSquawks)) }
Example 61
Source File: TestableQueueInputDStream.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.dstream.InputDStream import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag class TestableQueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
Example 62
Source File: StreamingUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.streaming import org.apache.spark.rdd.RDD import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.mutable.Queue class StreamingUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ @transient var sc: SparkContext = null @transient var ssc: StreamingContext = null override def beforeAll(): Unit = { val envMap = Map[String,String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) ssc = new StreamingContext(sc, Milliseconds(200)) } override def afterAll(): Unit = { sc.stop() } test("Streaming word count") { val firstBatchRDD = sc.parallelize(Seq("a", "b", "c")) val secondBatchRDD = sc.parallelize(Seq("a", "e")) val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f")) val forthBatchRDD = sc.parallelize(Seq("a", "e")) val queue = new Queue[RDD[String]] queue.+=(firstBatchRDD) queue.+=(secondBatchRDD) queue.+=(thirdBatchRDD) queue.+=(forthBatchRDD) println(queue) val startTime = System.currentTimeMillis() val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1)) //ssc.queueStream(queue) dstream.checkpoint(Seconds(100)) val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _) val streamTotals = batchTotals.updateStateByKey( (seq:Seq[Int], opt:Option[Int]) => { if (!seq.isEmpty) { val totalCountForNew = seq.reduce(_ + _) if (opt.isEmpty) { Option(totalCountForNew) } else { Option(opt.get + totalCountForNew) } } else { opt } }) streamTotals.foreachRDD(rdd => { }) ssc.checkpoint("./tmp") ssc.start() ssc.awaitTerminationOrTimeout(2000) val endTime = System.currentTimeMillis() val rddList = streamTotals.slice(new Time(startTime), new Time(endTime)) rddList(0).collect().foreach(println) assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1) rddList(1).collect().foreach(println) assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(2).collect().foreach(println) assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(3).collect().foreach(println) assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3) } }
Example 63
Source File: SparkCassRDDFunctions.scala From Spark2Cassandra with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.cassandra.rdd import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.mapper.ColumnMapper import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory } import com.datastax.spark.connector.{ AllColumns, ColumnSelector } import com.github.jparkie.spark.cassandra.SparkCassBulkWriter import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf } import org.apache.spark.rdd.RDD import scala.reflect.runtime.universe._ def bulkLoadToCass( keyspaceName: String, tableName: String, columns: ColumnSelector = AllColumns, sparkCassWriteConf: SparkCassWriteConf = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf), sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf) )(implicit connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf), rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = { val sparkCassBulkWriter = SparkCassBulkWriter( connector, keyspaceName, tableName, columns, sparkCassWriteConf, sparkCassServerConf ) internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _) } }
Example 64
Source File: PointCloudRelation.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.datasource import geotrellis.pointcloud.spark.store.hadoop._ import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions} import geotrellis.pointcloud.util.Filesystem import geotrellis.proj4.CRS import geotrellis.store.hadoop.util.HdfsUtils import geotrellis.vector.Extent import cats.implicits._ import io.pdal._ import io.circe.syntax._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} import java.io.File import scala.collection.JavaConverters._ // This class has to be serializable since it is shipped over the network. class PointCloudRelation( val sqlContext: SQLContext, path: String, options: HadoopOptions ) extends BaseRelation with TableScan with Serializable { @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext // TODO: switch between HadoopPointCloudRDD and S3PointcCloudRDD lazy val isS3: Boolean = path.startsWith("s3") override def schema: StructType = { lazy val (local, fixedPath) = if(path.startsWith("s3") || path.startsWith("hdfs")) { val tmpDir = Filesystem.createDirectory() val remotePath = new Path(path) // copy remote file into local tmp dir val localPath = new File(tmpDir, remotePath.getName) HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration) (true, localPath.toString) } else (false, path) val localPipeline = options.pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => fixedPath.asJson) .top.fold(options.pipeline)(identity) val pl = Pipeline(localPipeline.noSpaces) if (pl.validate()) pl.execute() val pointCloud = try { pl.getPointViews().next().getPointCloud(0) } finally { pl.close() if(local) println(new File(fixedPath).delete) } val rdd = HadoopPointCloudRDD(new Path(path), options) val md: (Option[Extent], Option[CRS]) = rdd .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) } .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) } val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build pointCloud.deriveSchema(metadata) } override def buildScan(): RDD[Row] = { val rdd = HadoopPointCloudRDD(new Path(path), options) rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } } } }
Example 65
Source File: PointCloudToDem.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.dem import io.pdal._ import geotrellis.layer._ import geotrellis.raster._ import geotrellis.spark._ import geotrellis.util._ import geotrellis.vector._ import org.apache.spark.rdd.RDD object PointCloudToDem { def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], tileDimensions: (Int, Int), options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = apply[M](rdd, options) { e => RasterExtent(e, tileDimensions._1, tileDimensions._2) } def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], cellSize: CellSize, options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = apply[M](rdd, options) { e => RasterExtent(e, cellSize) } def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], options: PointToGrid.Options)(createRE: Extent => RasterExtent): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = { val layoutDefinition = rdd.metadata.getComponent[LayoutDefinition] val mapTransform = layoutDefinition.mapTransform val result = rdd .collectNeighbors .mapPartitions({ partition => partition.map { case (key, neighbors) => val extent = mapTransform(key) val raster = PointToGrid.createRaster(neighbors.map(_._2._2), createRE(extent), options) (key, raster.tile) } }, preservesPartitioning = true) ContextRDD(result, layoutDefinition) } }
Example 66
Source File: BufferUnionable.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.buffer import geotrellis.layer._ import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object BufferUnionable { def apply[ K: SpatialComponent, X <: { def union(other: Any): V }, V: (? => X) : ClassTag ](rdd: RDD[(K, V)]): RDD[(K, V)] = { rdd .flatMap({ case (key, data) => val SpatialKey(col, row) = key for (deltaX <- -1 to +1; deltaY <- -1 to +1) yield { if (deltaX == 0 && deltaY == 0) (SpatialKey(col + deltaX, row + deltaY), (key, data, true)) else (SpatialKey(col + deltaX, row + deltaY), (key, data, false)) } }) .groupByKey .filter({ case (_, seq) => seq.exists { case (_, _, center) => center } }) .map({ case (sortKey, seq) => val resultKey = seq.filter({ case (_, _, center) => center }).head._1 val resultValue = seq.map({ case (_, data, _) => data }).reduce(_ union _) (resultKey, resultValue) }) } }
Example 67
Source File: HadoopPointCloudRDD.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.hadoop import geotrellis.pointcloud.spark.store.hadoop.formats._ import geotrellis.store.hadoop._ import geotrellis.vector.Extent import io.circe.Json import io.pdal._ import io.pdal.pipeline._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def apply(path: Path, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(HadoopPointCloudHeader, List[PointCloud])] = { val conf = sc.hadoopConfiguration.withInputDirectory(path, options.filesExtensions) options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _)) options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _)) PointCloudInputFormat.setPipeline(conf, options.pipeline) options.filterExtent match { case Some(filterExtent) => PointCloudInputFormat.setFilterExtent(conf, filterExtent) sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ).filter { case (header, _) => header.extent3D.map(_.toExtent.intersects(filterExtent)).getOrElse(false) } case None => sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ) } } }
Example 68
Source File: S3PointCloudRDD.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.s3 import geotrellis.pointcloud.spark.store.hadoop.formats.PointCloudInputFormat import geotrellis.spark.store.s3._ import geotrellis.store.s3.S3ClientProducer import geotrellis.vector.Extent import io.circe._ import io.pdal._ import io.pdal.pipeline._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import software.amazon.awssdk.services.s3.S3Client def apply(bucket: String, prefix: String, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(S3PointCloudHeader, List[PointCloud])] = { val conf = sc.hadoopConfiguration S3InputFormat.setBucket(conf, bucket) S3InputFormat.setPrefix(conf, prefix) S3InputFormat.setExtensions(conf, options.filesExtensions) S3InputFormat.setCreateS3Client(conf, options.getClient) options.numPartitions.foreach(S3InputFormat.setPartitionCount(conf, _)) options.partitionBytes.foreach(S3InputFormat.setPartitionBytes(conf, _)) options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _)) options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _)) PointCloudInputFormat.setPipeline(conf, options.pipeline) options.filterExtent match { case Some(filterExtent) => PointCloudInputFormat.setFilterExtent(conf, filterExtent) sc.newAPIHadoopRDD( conf, classOf[S3PointCloudInputFormat], classOf[S3PointCloudHeader], classOf[List[PointCloud]] ).filter { case (header, _) => header.extent3D.exists(_.toExtent.intersects(filterExtent)) } case None => sc.newAPIHadoopRDD( conf, classOf[S3PointCloudInputFormat], classOf[S3PointCloudHeader], classOf[List[PointCloud]] ) } } }
Example 69
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
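The MLlib calls above do not depend on Kudu; they only need an RDD[Vector]. A cut-down sketch with a few synthetic vectors standing in for the taxi-trip features:

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object KMeansLocalSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansLocalSketch").setMaster("local[*]"))
    val vectors = sc.parallelize(Seq(
      Vectors.dense(1.0, 1.1), Vectors.dense(0.9, 1.0),
      Vectors.dense(8.0, 8.2), Vectors.dense(7.9, 8.1)
    )).cache()
    // Two centers, ten iterations
    val model = KMeans.train(vectors, 2, 10)
    model.clusterCenters.foreach(c => println(" >> " + c))
    val colStats = Statistics.colStats(vectors)
    println(" > mean: " + colStats.mean)
    println(" > variance: " + colStats.variance)
    sc.stop()
  }
}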
Example 70
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr import java.net.{ConnectException, SocketException} import java.util import org.apache.solr.client.solrj.impl.CloudSolrServer import org.apache.solr.client.solrj.request.UpdateRequest import org.apache.solr.common.{SolrException, SolrInputDocument} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream object SolRSupport { def indexDStreamOfDocs(zkHost:String, collection:String, batchSize:Int, docDStream:DStream[SolrInputDocument]): Unit ={ docDStream.foreachRDD(docRdd => { indexDoc(zkHost, collection, batchSize, docRdd) }) } def indexDoc(zkHost:String, collection:String, batchSize:Int, docRdd:RDD[SolrInputDocument]): Unit = { docRdd.foreachPartition(it => { val solrServer = CloudSolRServerBuilder.build(zkHost) val batch = new util.ArrayList[SolrInputDocument]() while (it.hasNext) { val inputDoc = it.next() batch.add(inputDoc) if (batch.size() >= batchSize) sendBatchToSolr(solrServer, collection, batch) } if (!batch.isEmpty()) sendBatchToSolr(solrServer, collection, batch) }) } def sendBatchToSolr( solrServer: CloudSolrServer, collection:String, batch:util.Collection[SolrInputDocument]) { val req = new UpdateRequest() req.setParam("collection", collection) req.add(batch) try { solrServer.request(req) } catch { case e:Exception => { if (shouldRetry(e)) { try { Thread.sleep(2000) } catch { case e1: InterruptedException => { Thread.interrupted() } } try { solrServer.request(req) } catch { case e1: Exception => { if (e1.isInstanceOf[RuntimeException]) { throw e1.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e1) } } } } else { if (e.isInstanceOf[RuntimeException]) { throw e.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e) } } } } finally { batch.clear() } } def shouldRetry( exc:Exception): Boolean = { val rootCause = SolrException.getRootCause(exc) rootCause.isInstanceOf[ConnectException] || rootCause.isInstanceOf[SocketException] } }
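A hedged usage sketch for indexDoc above. It assumes a reachable SolrCloud ensemble and an existing collection; the ZooKeeper address and collection name below are placeholders.

import com.hadooparchitecturebook.taxi360.streaming.ingestion.solr.SolRSupport
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.{SparkConf, SparkContext}

object SolrIndexSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SolrIndexSketch").setMaster("local[*]"))
    val docs = sc.parallelize(1 to 100).map { i =>
      val doc = new SolrInputDocument()
      doc.addField("id", i.toString)
      doc.addField("text_t", s"document $i")
      doc
    }
    // Placeholder ZooKeeper address and collection name
    SolRSupport.indexDoc("zk1:2181/solr", "example_collection", 50, docs)
    sc.stop()
  }
}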
Example 71
Source File: HBaseSQLTableScan.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._

@DeveloperApi
case class HBaseSQLTableScan(
    relation: HBaseRelation,
    output: Seq[Attribute],
    result: RDD[Row]) extends LeafNode {

  override def outputPartitioning = {
    var ordering = List[SortOrder]()
    for (key <- relation.partitionKeys) {
      ordering = ordering :+ SortOrder(key, Ascending)
    }
    RangePartitioning(ordering.toSeq, relation.partitions.size)
  }

  override protected def doExecute(): RDD[Row] = result
}
Example 72
Source File: HBaseShuffledRDD.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.spark._ import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition} class HBaseShuffledRDD ( prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])], part: Partitioner, @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){ override def getPartitions: Array[Partition] = { if (hbPartitions==null || hbPartitions.isEmpty) { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } else { // only to be invoked by clients hbPartitions.toArray } } override def getPreferredLocations(split: Partition): Seq[String] = { if (hbPartitions==null || hbPartitions.isEmpty) { Seq.empty } else { split.asInstanceOf[HBasePartition].server.map { identity[String] }.toSeq } } }
Example 73
Source File: RDFS11.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object RDFS11 { def transitive(rdd:RDD[(String, String)]) = { var rddTuple = rdd val reverseTuple = rddTuple.map(x => (x._2, x._1)) var cur = 0L var pre = rddTuple.count var flag = true while (flag) { val joined = reverseTuple.join(rddTuple) val res = joined.map(x => x._2) rddTuple = rddTuple.union(res).distinct cur = rddTuple.count if(pre == cur) flag = false pre = cur } rddTuple } def main(args: Array[String]): Unit = { if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS11").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) var subClass = triples.filter(x => x._2.equals("rdfs:subClassOf")).map(x => (x._1, x._3)) subClass = transitive(subClass) subClass.foreach(x => println(x)) subClass.saveAsTextFile(outputPath) } }
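The same fixed-point loop, shown as a self-contained sketch on an in-memory edge list instead of a file, so the join/union/distinct iteration is easier to follow:

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object TransitiveClosureSketch {
  // Same structure as transitive() above: derive (a, c) from (a, b) and (b, c) until no new pairs appear
  def transitive(edges: RDD[(String, String)]): RDD[(String, String)] = {
    var closure = edges
    var previousCount = closure.count()
    var converged = false
    while (!converged) {
      // join (b, a) with (b, c) to derive (a, c)
      val derived = closure.map(_.swap).join(closure).map(_._2)
      closure = closure.union(derived).distinct()
      val currentCount = closure.count()
      converged = currentCount == previousCount
      previousCount = currentCount
    }
    closure
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TransitiveClosureSketch").setMaster("local[2]"))
    val subClassOf = sc.parallelize(Seq(("Car", "Vehicle"), ("Vehicle", "Thing"), ("Bike", "Vehicle")))
    // Adds ("Car", "Thing") and ("Bike", "Thing") to the original edges
    transitive(subClassOf).collect().foreach(println)
    sc.stop()
  }
}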
Example 74
Source File: RDFS5.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object RDFS5 { def transitive(rdd:RDD[(String, String)]) = { var rddTuple = rdd val reverseTuple = rddTuple.map(x => (x._2, x._1)) var cur = 0L var pre = rddTuple.count var flag = true while (flag) { val joined = reverseTuple.join(rddTuple) val res = joined.map(x => x._2) rddTuple = rddTuple.union(res).distinct cur = rddTuple.count if(pre == cur) flag = false pre = cur } rddTuple } def main(args: Array[String]): Unit = { if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS5").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) var subProp = triples.filter(x => x._2.equals("rdfs:subPropertyOf")).map(x => (x._1, x._3)) subProp = transitive(subProp) subProp.foreach(x => println(x)) subProp.saveAsTextFile(outputPath) } }
Example 75
Source File: DFConverter.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import com.twosigma.flint.rdd.OrderedRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType

object DFConverter {
  def newDataFrame(df: DataFrame): DataFrame = {
    new DataFrame(df.sparkSession, df.logicalPlan, RowEncoder(df.schema))
  }

  def toDataFrame(rdd: OrderedRDD[Long, InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    val internalRows = rdd.values
    spark.internalCreateDataFrame(internalRows, schema)
  }

  def toDataFrame(rdd: RDD[InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    spark.internalCreateDataFrame(rdd, schema)
  }
}
Example 76
Source File: WeightedLabeledPoint.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double]) {

  def generateSampleData(sc: SparkContext,
    weights: DenseVector[Double],
    intercept: Double,
    numRows: Long = 100L,
    numPartitions: Int = 4,
    errorScalar: Double = 1.0,
    seed: Long = 1L): RDD[WeightedLabeledPoint] = {
    val len = weights.length + 2
    // The last entry serves as the weight of the point and the second-to-last entry
    // serves as the noise added to the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed)
    data.map { d =>
      val fw = d.toArray
      val x = new DenseVector(fw.dropRight(2))
      WeightedLabeledPoint(
        weights.dot(x) + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5,
        x
      )
    }
  }
}
Example 77
Source File: OLSMultipleLinearRegression.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression import org.apache.spark.rdd.RDD import breeze.linalg.{ DenseMatrix, DenseVector } object OLSMultipleLinearRegression { def regression(input: RDD[WeightedLabeledPoint], intercept: Boolean = true): LinearRegressionModel = { // Try to get the number of columns val nCols = if (intercept) { input.first.features.length + 1 } else { input.first.features.length } val (xx, xy, swx, srwsl, ssrw, wsl, sw, n, lw) = input.treeAggregate(( new DenseMatrix[Double](nCols, nCols), // 1. Calculate a k-by-k matrix X^TX. new DenseVector[Double](nCols), // 2. Calculate a k-dimension vector X^Ty. new DenseVector[Double](nCols), // 3. Calculate a k-dimension vector of weighted sum of X. 0.0, // 4. Calculate the square root weighted sum of labels. 0.0, // 5. Calculate the sum of square root of weights. 0.0, // 6. Calculate the weighted sum of labels. 0.0, // 7. Calculate the sum of weights. 0: Long, // 8. Calculate the length of input. 0.0 // 9. Calculate sum of log weights ))( // U is a pair of matrix and vector and v is a WeightedLabeledPoint. seqOp = (U, v) => { // Append 1.0 at the head for calculating intercept. val x = if (intercept) { DenseVector.vertcat(DenseVector(1.0), v.features) } else { v.features } val wx = x * v.weight val sqrtW = Math sqrt v.weight // Unfortunately, breeze.linalg.DenseVector does not support tensor product. (U._1 += wx.asDenseMatrix.t * x.asDenseMatrix, U._2 += wx * v.label, U._3 += wx, U._4 + v.label * sqrtW, U._5 + sqrtW, U._6 + v.label * v.weight, U._7 + v.weight, U._8 + 1, U._9 + math.log(v.weight)) }, combOp = (U1, U2) => ( U1._1 += U2._1, U1._2 += U2._2, U1._3 += U2._3, U1._4 + U2._4, U1._5 + U2._5, U1._6 + U2._6, U1._7 + U2._7, U1._8 + U2._8, U1._9 + U2._9 ) ) LinearRegressionModel(input, intercept, n, (xx + xx.t) :/ 2.0, xy, swx, srwsl, ssrw, wsl, sw, lw) } }
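A minimal sketch of calling the regression above on a handful of hand-built WeightedLabeledPoint rows. The data is synthetic, generated from y = 2*x1 - 3*x2 + 1 with unit weights; inspecting the fit goes through LinearRegressionModel, which is defined elsewhere in the project.

import breeze.linalg.DenseVector
import com.twosigma.flint.math.stats.regression.{OLSMultipleLinearRegression, WeightedLabeledPoint}
import org.apache.spark.{SparkConf, SparkContext}

object OlsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("OlsSketch").setMaster("local[2]"))
    // Exact observations of y = 2*x1 - 3*x2 + 1, all with weight 1.0
    val points = sc.parallelize(Seq(
      WeightedLabeledPoint(label = 3.0,  weight = 1.0, features = DenseVector(1.0, 0.0)),
      WeightedLabeledPoint(label = -2.0, weight = 1.0, features = DenseVector(0.0, 1.0)),
      WeightedLabeledPoint(label = 0.0,  weight = 1.0, features = DenseVector(1.0, 1.0)),
      WeightedLabeledPoint(label = 1.0,  weight = 1.0, features = DenseVector(0.0, 0.0))
    ))
    // Fits an intercept by default (intercept = true)
    val model = OLSMultipleLinearRegression.regression(points)
    println(model)
    sc.stop()
  }
}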
Example 78
Source File: PartitionsIterator.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import grizzled.slf4j.Logger import org.apache.spark.rdd.RDD import org.apache.spark.{ Partition, TaskContext } protected[flint] object PartitionsIterator { val logger = Logger(PartitionsIterator.getClass) def apply[T]( rdd: RDD[T], partitions: Seq[Partition], context: TaskContext, preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed. ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering) } def headPartitionIndex: Int = curPart.index }
Example 79
Source File: TreeReduce.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd.function.summarize import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object TreeReduce { def apply[T: ClassTag]( rdd: RDD[T] )( f: (T, T) => T, depth: Int = 2 ): T = { require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") val reducePartition: Iterator[T] => Option[T] = iter => { if (iter.hasNext) { Some(iter.reduceLeft(f)) } else { None } } val partiallyReduced = rdd.mapPartitions(it => Iterator(reducePartition(it))) val op: (Option[T], Option[T]) => Option[T] = (c, x) => { if (c.isDefined && x.isDefined) { Some(f(c.get, x.get)) } else if (c.isDefined) { c } else if (x.isDefined) { x } else { None } } TreeAggregate(partiallyReduced)(Option.empty[T], op, op, depth).getOrElse( sys.error("Empty collection.") ) } }
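A usage sketch, assuming the caller can see this package: the result matches a plain reduce, only the combining is arranged as a tree of the given depth.

import com.twosigma.flint.rdd.function.summarize.TreeReduce
import org.apache.spark.{SparkConf, SparkContext}

object TreeReduceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TreeReduceSketch").setMaster("local[4]"))
    val numbers = sc.parallelize(1 to 1000, numSlices = 8)
    // Same result as numbers.reduce(_ + _), but partial results are combined tree-wise
    val sum = TreeReduce(numbers)(_ + _, depth = 3)
    println(sum) // 500500
    sc.stop()
  }
}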
Example 80
Source File: PythonUtils.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import com.twosigma.flint.timeseries.{ TimeSeriesRDD, TimeSeriesRDDImpl } import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ import org.apache.spark.sql.{ CatalystTypeConvertersWrapper, Row } private[rdd] case class SchemaColumnInfo(idx: Int, clazz: Class[_ <: Ordered[_]], dataType: DataType) case class TimeSeriesRDDWithSchema(rdd: TimeSeriesRDDImpl, schema: StructType) object PythonUtils { def fromUnsortedRDD( sc: SparkContext, rdd: RDD[Row], schema: StructType, keyColumn: String ): TimeSeriesRDDImpl = { val orderedRdd = OrderedRDD.fromRDD(formatRDD[Long](rdd, schema, keyColumn), KeyPartitioningType.UnSorted) TimeSeriesRDD.fromOrderedRDD(orderedRdd, schema).asInstanceOf[TimeSeriesRDDImpl] } def toOrderedRDD( rdd: RDD[Row], schema: StructType, keyColumn: String, ranges: Seq[CloseOpen[Long]] ): OrderedRDD[Long, InternalRow] = { val keyIdx = schema.fieldIndex(keyColumn) val converter = CatalystTypeConvertersWrapper.toCatalystRowConverter(schema) OrderedRDD.fromRDD(rdd.map(row => (row.getAs[Long](keyIdx), converter(row))), ranges) } }
Example 81
Source File: TimeSeriesRDDConversionSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import java.util.concurrent.TimeUnit import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ SQLContext, DataFrame, Row } import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => ExternalRow } import org.scalatest.tagobjects.Slow class TimeSeriesRDDConversionSpec extends TimeSeriesSuite { // The largest prime < 100 override val defaultPartitionParallelism = 97 // The 10000-th prime. private val defaultNumRows = 104729 private def createDataFrame(isSorted: Boolean = true)(implicit sqlContext: SQLContext): DataFrame = { val n = defaultNumRows val schema = Schema("value" -> DoubleType) val rdd: RDD[Row] = sqlContext.sparkContext.parallelize(1 to n, defaultPartitionParallelism).map { i => val data: Array[Any] = if (isSorted) { Array((i / 100).toLong, i.toDouble) } else { Array(((i + 1 - n) / 100).toLong, i.toDouble) } new ExternalRow(data, schema) } sqlContext.createDataFrame(rdd, schema) } "TimeSeriesRDD" should "convert from a sorted DataFrame correctly" taggedAs (Slow) in { implicit val _sqlContext = sqlContext (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = true, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = false))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF( createDataFrame(isSorted = false).sort("time") )( isSorted = true, TimeUnit.NANOSECONDS ) assert(tsRdd.count() == defaultNumRows) } } }
Example 82
Source File: ParallelCollectionRDD.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag

case class ParallelCollectionRDDPartition[T: ClassTag](
  override val index: Int,
  values: Seq[T]
) extends Partition

class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[ParallelCollectionRDDPartition[T]].values.iterator

  override protected def getPartitions: Array[Partition] =
    data.zipWithIndex.map {
      case (d, index) => ParallelCollectionRDDPartition(index, d)
    }.toArray
}
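A usage sketch: each inner Seq becomes exactly one partition, which is the point of this RDD compared to sc.parallelize.

import com.twosigma.flint.rdd.ParallelCollectionRDD
import org.apache.spark.{SparkConf, SparkContext}

object ParallelCollectionRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ParallelCollectionRDDSketch").setMaster("local[2]"))
    // Three inner sequences, so exactly three partitions (one of them empty)
    val rdd = new ParallelCollectionRDD(sc, Seq(Seq(1, 2, 3), Seq(4, 5), Seq.empty[Int]))
    println(rdd.getNumPartitions)     // 3
    println(rdd.collect().toList)     // List(1, 2, 3, 4, 5)
    sc.stop()
  }
}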
Example 83
Source File: OverlappedOrderedRDDSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import com.twosigma.flint.SharedSparkContext import org.apache.spark.rdd.RDD import org.scalatest.FlatSpec class OverlappedOrderedRDDSpec extends FlatSpec with SharedSparkContext { val numSlices: Int = 3 val sliceLength: Int = 4 var rdd: RDD[(Int, Int)] = _ var orderedRdd: OrderedRDD[Int, Int] = _ var overlappedOrderedRdd: OverlappedOrderedRDD[Int, Int] = _ private def window(t: Int): (Int, Int) = (t - 2, t) override def beforeAll() { super.beforeAll() val s = sliceLength rdd = sc.parallelize(0 until numSlices, numSlices).flatMap { i => (1 to s).map { j => i * s + j } }.map { x => (x, x) } orderedRdd = OrderedRDD.fromRDD(rdd, KeyPartitioningType.Sorted) overlappedOrderedRdd = OverlappedOrderedRDD(orderedRdd, window) } "The OverlappedOrderedRDD" should "be constructed from `OrderedRDD` correctly" in { assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep) val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, x) } assert(overlappedOrderedRdd.collect().deep == benchmark.deep) } it should "be able to remove overlapped rows to get an `OrderedRDD` correctly" in { assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep) assert(overlappedOrderedRdd.nonOverlapped().collect().deep == orderedRdd.collect().deep) } it should "`mapPartitionsWithIndexOverlapped` correctly" in { val mapped = overlappedOrderedRdd.mapPartitionsWithIndexOverlapped( (index, iterator) => iterator.map { case (k, v) => (k, v * 2) } ) val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, 2 * x) } assert(mapped.collect().deep == benchmark.deep) } }
Example 84
Source File: RDDKafkaWriter.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.{Callback, ProducerRecord}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// The enclosing class declaration was missing from this snippet; the header below is a
// minimal reconstruction (a thin wrapper around the RDD that gains writeToKafka) and may
// differ in detail from the project source.
class RDDKafkaWriter[T: ClassTag](@transient private val rdd: RDD[T])
  extends KafkaWriter[T] with Serializable {

  override def writeToKafka[K, V](
      producerConfig: Map[String, Object],
      transformFunc: T => ProducerRecord[K, V],
      callback: Option[Callback] = None
  ): Unit =
    rdd.foreachPartition { partition =>
      val producer = KafkaProducerCache.getProducer[K, V](producerConfig)
      partition
        .map(transformFunc)
        .foreach(record => producer.send(record, callback.orNull))
    }
}
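A hedged usage sketch, assuming the library's usual entry point (importing KafkaWriter._ exposes writeToKafka on plain RDDs, mirroring the DStream usage in the next example). The broker address, serializers and topic name are illustrative.

import com.github.benfradet.spark.kafka.writer.KafkaWriter._
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.{SparkConf, SparkContext}

object RddWriteSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-kafka-sketch"))
    // Producer settings are illustrative; point them at a real broker before running.
    val producerConfig = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.serializer" -> classOf[StringSerializer].getName,
      "value.serializer" -> classOf[StringSerializer].getName
    )
    val rdd = sc.parallelize(Seq("a", "b", "c"))
    // Each element becomes one Kafka record on the (hypothetical) "events" topic.
    rdd.writeToKafka(producerConfig, s => new ProducerRecord[String, String]("events", s))
    sc.stop()
  }
}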
Example 85
Source File: DStreamKafkaWriterSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import scala.collection.mutable import scala.concurrent.duration._ class DStreamKafkaWriterSpec extends SKRSpec { "a DStreamKafkaWriter" when { "given a dstream" should { "write its content to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s) ) val results = collect(ssc, localTopic) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { results shouldBe msgs } } "trigger a given callback for every write to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s), Some(new Callback with Serializable { override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = { SKRSpec.callbackTriggerCount.incrementAndGet() } }) ) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { SKRSpec.callbackTriggerCount.get() shouldBe msgs.size } } } } private def createDStream(seq: Seq[String]): DStream[String] = { val q = mutable.Queue.empty[RDD[String]] q.enqueue(ssc.sparkContext.makeRDD(seq)) ssc.queueStream(q) } }
Example 86
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.joda.time.DateTime import scala.concurrent.duration._ object StreamingExample extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val ssc = new StreamingContext(conf, Seconds(1)) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create DStream of Influx points val queue = new scala.collection.mutable.Queue[RDD[Point]] val queueStream: DStream[Point] = ssc.queueStream(queue) // Add single RDD with a single Influx point to the DStream queue.enqueue(ssc.sparkContext.parallelize(Seq(point1))) // Save DStream to Influx queueStream.saveToInflux() // Start Spark streaming ssc.start() ssc.awaitTermination() }
Example 87
Source File: Example.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.joda.time.DateTime import scala.concurrent.duration._ object Example extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val sc = new SparkContext(conf) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create RDD with Influx point val rdd: RDD[Point] = sc.parallelize(Seq(point1)) // Save RDD to Influx rdd.saveToInflux() // Stop Spark context sc.stop() }
Example 88
Source File: PointRDDExtensions.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.extensions import com.pygmalios.reactiveinflux.spark.config.ReactiveInfluxSparkConfig import com.pygmalios.reactiveinflux.spark.{RDDExtensions, _} import com.pygmalios.reactiveinflux.{PointNoTime, ReactiveInfluxDbName} import org.apache.spark.rdd.RDD import org.slf4j.LoggerFactory import scala.concurrent.duration.Duration private[spark] class PointRDDExtensions[+T <: PointNoTime](rdd: RDD[T]) extends RDDExtensions[T] { import PointRDDExtensions._ override def saveToInflux()(implicit reactiveInfluxDbName: ReactiveInfluxDbName, awaitAtMost: Duration): Unit = { // Process each partition separately totalBatchCount = 0 totalPointCount = 0 rdd.foreachPartition { partition => withInflux { db => val batchSize = ReactiveInfluxSparkConfig(db.config).sparkBatchSize // Write points in batches var batchCount = 0 var pointCount = 0 partition.sliding(batchSize, batchSize).foreach { batch => // Write single batch db.write(batch) // Statistics for logging batchCount += 1 pointCount += batch.size } totalBatchCount += batchCount totalPointCount += pointCount log.debug(s"Partition with $pointCount points written to Influx in $batchCount batches.") } } log.info(s"RDD with ${rdd.partitions.size} partitions and $totalPointCount points written to Influx in $totalBatchCount batches.") } } object PointRDDExtensions { private val log = LoggerFactory.getLogger(classOf[PointRDDExtensions[_]]) // This makes sense for testing purposes only private[reactiveinflux] var totalBatchCount = 0 private[reactiveinflux] var totalPointCount = 0 }
Example 89
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingExample extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999) val sequences: DStream[Array[Array[Int]]] = rawSequences .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 90
Source File: MSNBCPatternMining.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object MSNBCPatternMining extends App { val conf = new SparkConf() .setAppName("MSNBC.com data pattern mining") .setMaster("local[4]") val sc = new SparkContext(conf) val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A"))) val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5) fp.run(transactionTest) val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } // NOTE: Caching data is recommended val uniqueTransactions: RDD[Array[Int]] = transactions.map(_.distinct).cache() val fpGrowth = new FPGrowth().setMinSupport(0.01) val model = fpGrowth.run(uniqueTransactions) val count = uniqueTransactions.count() model.freqItemsets.collect().foreach { itemset => if (itemset.items.length >= 3) println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble ) } val rules = model.generateAssociationRules(confidence = 0.4) rules.collect().foreach { rule => println("[" + rule.antecedent.mkString(",") + "=>" + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0) } val frontPageConseqRules = rules.filter(_.consequent.head == 1) frontPageConseqRules.count frontPageConseqRules.filter(_.antecedent.contains(2)).count rules.filter(_.antecedent.contains(7)).count val sequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(sequences) psModel.freqSequences.map(fs => (fs.sequence.length, 1)) .reduceByKey(_ + _) .sortByKey() .collect() .foreach(fs => println(s"${fs._1}: ${fs._2}")) psModel.freqSequences .map(fs => (fs.sequence.length, fs)) .groupByKey() .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2)) .map(_.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) .collect.foreach(println) psModel.freqSequences .map(fs => (fs.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"), 1)) .reduceByKey(_ + _) .reduce( (f1, f2) => if (f1._2 > f2._2) f1 else f2 ) psModel.freqSequences.reduce( (f1, f2) => if (f1.freq > f2.freq) f1 else f2 ) psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println) psModel.freqSequences.collect().foreach { freqSequence => println( freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq ) } }
Example 91
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingAdvanced extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999) val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": ")) .map(kv => (kv(0).toInt, kv(1))) val countIds = events.map(e => (e._1, 1)) val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _) def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { Some(runningCount.getOrElse(0) + newValues.sum) } val runningCounts = countIds.updateStateByKey[Int](updateFunction _) val duration = Seconds(20) val slide = Seconds(10) val rawSequences: DStream[(Int, String)] = events .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide) val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2) .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 92
Source File: GraphFromRdd.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphFromRdd extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) friendGraph.vertices.collect.foreach(println) friendGraph.edges.map( e => e.srcId > e.dstId ).count() val mappedEdgeGraph: Graph[String, Boolean] = friendGraph.mapEdges( e => e.srcId > e.dstId ) val inDegVertexRdd: VertexRDD[Int] = friendGraph.aggregateMessages[Int]( sendMsg = ec => ec.sendToDst(1), mergeMsg = (msg1, msg2) => msg1+msg2 ) assert(inDegVertexRdd.collect.deep == friendGraph.inDegrees.collect.deep) friendGraph.staticPageRank(numIter = 10).vertices.collect.foreach(println) friendGraph.pageRank(tol = 0.0001, resetProb = 0.15) }
Example 93
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} //import org.graphframes._ object GraphFramesExample extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) // val friendGraphFrame = GraphFrame.fromGraphX(friendGraph) // // friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter( // "e1.attr = 'trusts' OR v3.attr = 'Chris'" // ).collect.foreach(println) }
Example 94
Source File: GephiApp.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import java.io.PrintWriter import com.github.maxpumperla.ml_spark.utils.Gephi.toGexf import org.apache.spark._ import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD object GephiApp extends App { val conf = new SparkConf() .setAppName("Gephi Test Writer") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val graph: Graph[String, String] = Graph(vertices, edges) val pw = new PrintWriter("./graph.gexf") pw.write(toGexf(graph)) pw.close() }
Example 95
Source File: DCollectionGenProperties.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.scalatest import org.apache.spark.rdd.RDD import org.scalatest.PropSpecLike import org.scalatest.prop.GeneratorDrivenPropertyChecks trait DCollectionGenProperties[DColl[_]] extends PropSpecLike with GeneratorDrivenPropertyChecks with DCollectionGen with KontextfreiSpec[DColl] { property("Can get arbitrary DCollections") { forAll { xs: DColl[String] => ops.count(xs) === ops.collectAsArray(xs).length } } } class DCollectionGenStreamSpec extends DCollectionGenProperties[Stream] with StreamSpec class DCollectionGenRDDSpec extends DCollectionGenProperties[RDD] with RDDSpec
Example 96
Source File: CollectingInstancesProperties.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.scalatest import org.apache.spark.rdd.RDD import org.scalatest.enablers.Collecting import org.scalatest.{Inspectors, PropSpec, PropSpecLike} import org.scalatest.prop.GeneratorDrivenPropertyChecks trait CollectingInstancesProperties[DColl[_]] extends PropSpecLike with GeneratorDrivenPropertyChecks with KontextfreiSpec[DColl] with CollectingInstances { property("There is a Collecting instance for DCollection") { forAll { (xs: List[String]) => val dcoll = ops.unit(xs) Inspectors.forAll(dcoll) { x => assert(xs.contains(x)) } } } property( "Collecting nature of DCollection returns the original size of the input sequence") { forAll { (xs: List[String]) => val dcoll = ops.unit(xs) assert( implicitly[Collecting[String, DColl[String]]] .sizeOf(dcoll) === xs.size) } } property( "Collecting nature of DCollection returns the Some loneElement if input sequence has exactly one element") { forAll { (x: String) => val dcoll = ops.unit(List(x)) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) === Some(x)) } } property( "Collecting nature of DCollection returns the None as loneElement if input sequence as more than one element") { forAll { (xs: List[String]) => whenever(xs.size > 1) { val dcoll = ops.unit(xs) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) .isEmpty) } } } property( "Collecting nature of DCollection returns the None as loneElement if input sequence is empty") { val dcoll = ops.unit(List.empty[String]) assert( implicitly[Collecting[String, DColl[String]]] .loneElementOf(dcoll) .isEmpty) } } class CollectionInstancesStreamSpec extends CollectingInstancesProperties[Stream] with StreamSpec class CollectionInstancesRDDSpec extends CollectingInstancesProperties[RDD] with RDDSpec
Example 97
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
Example 98
Source File: RDDOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[kontextfrei] trait RDDOrderedFunctions extends DCollectionOrderedFunctions[RDD] { this: RDDBase => override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending) } override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending, numPartitions) } override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) { _.filterByRange(lower, upper) } override def repartitionAndSortWithinPartitions[ A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])( partitioner: Partitioner) : RDD[(A, B)] = withSite(x) { _.repartitionAndSortWithinPartitions(partitioner) } }
Example 99
Source File: RDDCollectionOpsSpec.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei import com.danielwestheide.kontextfrei.rdd.RDDOpsSupport import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll class RDDCollectionOpsSpec extends DCollectionOpsProperties[RDD] with BeforeAndAfterAll { implicit val sparkContext = new SparkContext("local[2]", "dcollection-spec") override implicit val ops: DCollectionOps[RDD] = RDDOpsSupport.rddCollectionOps override protected def afterAll(): Unit = { sparkContext.stop() } }
Example 100
Source File: TSNEHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg._ import breeze.stats._ import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.rdd.RDD object TSNEHelper { // p_ij = (p_{i|j} + p_{j|i}) / 2n def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = { p_ji.entries .flatMap(e => Seq( ((e.i.toInt, e.j.toInt), e.value), ((e.j.toInt, e.i.toInt), e.value) )) .reduceByKey(_ + _) // p + p' .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n .groupByKey() } def update(Y: DenseMatrix[Double], dY: DenseMatrix[Double], iY: DenseMatrix[Double], gains: DenseMatrix[Double], iteration: Int, param: TSNEParam): DenseMatrix[Double] = { import param._ val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum gains.foreachPair { case ((i, j), old_gain) => val new_gain = math.max(min_gain, if ((dY(i, j) > 0.0) != (iY(i, j) > 0.0)) old_gain + 0.2 else old_gain * 0.8 ) gains.update(i, j, new_gain) val new_iY = momentum * iY(i, j) - eta * new_gain * dY(i, j) iY.update(i, j, new_iY) Y.update(i, j, Y(i, j) + new_iY) // Y += iY } val t_Y: DenseVector[Double] = mean(Y(::, *)).t val y_sub = Y(*, ::) Y := y_sub - t_Y } }
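A brief, hedged sketch of calling computeP on a tiny CoordinateMatrix of conditional affinities; the entries and names below are made up for illustration.

import com.github.saurfang.spark.tsne.TSNEHelper
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.{SparkConf, SparkContext}

object TSNEHelperSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("tsne-p-sketch"))
    // Conditional affinities p_{j|i} for 3 points; computeP emits both (i, j) and (j, i)
    // for every entry, so the result is the symmetrised p_ij = (p_{i|j} + p_{j|i}) / 2n.
    val entries = sc.parallelize(Seq(
      MatrixEntry(0, 1, 0.4),
      MatrixEntry(0, 2, 0.1),
      MatrixEntry(1, 2, 0.3)
    ))
    val p_ji = new CoordinateMatrix(entries, 3, 3)
    val p = TSNEHelper.computeP(p_ji, n = 3)
    p.collect().foreach { case (i, neighbours) =>
      println(s"row $i -> ${neighbours.mkString(", ")}")
    }
    sc.stop()
  }
}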
Example 101
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD object MainRunner { def main(args: Array[String]) { // Create spark context val appName="kBC" val sparkMode="local" val conf = new SparkConf().setAppName(appName).setMaster(sparkMode); val sc = new SparkContext(conf); // Create sample graph // // Create an RDD for vertices val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges val relationships: RDD[Edge[String]] = sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"))) // Define a default user in case there are relationship with missing user val defaultUser = ("John Doe", "Missing") // Build the initial Graph val graph = Graph(users, relationships, defaultUser) val kBCGraph = KBetweenness.run(graph, 3) } }
Example 102
Source File: TiRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv._ import com.pingcap.tikv.exception.TiInternalException import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tikv.types.Converter import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.util.RangeSplitter.RegionTask import com.pingcap.tispark.{TiPartition, TiTableReference} import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer abstract class TiRDD( val dagRequest: TiDAGRequest, val physicalId: Long, val tiConf: TiConfiguration, val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends RDD[InternalRow](sparkSession.sparkContext, Nil) { private lazy val partitionPerSplit = tiConf.getPartitionPerSplit protected def checkTimezone(): Unit = { if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) { throw new TiInternalException( "timezone are different! driver: " + tiConf.getLocalTimeZone + " executor:" + Converter.getLocalTimezone + " please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions") } } override protected def getPartitions: Array[Partition] = { val keyWithRegionTasks = RangeSplitter .newSplitter(session.getRegionManager) .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType) val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]] with mutable.MultiMap[String, RegionTask] var index = 0 val result = new ListBuffer[TiPartition] for (task <- keyWithRegionTasks) { hostTasksMap.addBinding(task.getHost, task) val tasks = hostTasksMap(task.getHost) if (tasks.size >= partitionPerSplit) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 hostTasksMap.remove(task.getHost) } } // add rest for (tasks <- hostTasksMap.values) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 } result.toArray } override protected def getPreferredLocations(split: Partition): Seq[String] = split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil }
Example 103
Source File: BasicDataSourceSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import com.pingcap.tikv.exception.TiBatchWriteException import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") { private val row1 = Row(null, "Hello") private val row2 = Row(2, "TiDB") private val row3 = Row(3, "Spark") private val row4 = Row(4, null) private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')") } test("Test Select") { if (!supportBatchWrite) { cancel } testTiDBSelect(Seq(row1, row2)) } test("Test Write Append") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() testTiDBSelect(Seq(row1, row2, row3, row4)) } test("Test Write Overwrite") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) val caught = intercept[TiBatchWriteException] { df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("overwrite") .save() } assert( caught.getMessage .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append.")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 104
Source File: UpperCaseColumnNameSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class UpperCaseColumnNameSuite extends BaseDataSourceTest("test_datasource_uppser_case_column_name") { private val row1 = Row(1, 2) private val schema = StructType( List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s""" |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL, | O_CUSTKEY INTEGER NOT NULL); """.stripMargin) } test("Test insert upper case column name") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row1)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 105
Source File: MissingParameterSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") { private val row1 = Row(null, "Hello") private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) test("Missing parameter: database") { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") val caught = intercept[IllegalArgumentException] { val rows = row1 :: Nil val data: RDD[Row] = sc.makeRDD(rows) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("table", table) .mode("append") .save() } assert( caught.getMessage .equals("requirement failed: Option 'database' is required.")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 106
Source File: OnlyOnePkSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") { private val row3 = Row(3) private val row4 = Row(4) private val schema = StructType(List(StructField("i", IntegerType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int primary key)") } test("Test Write Append") { if (!supportBatchWrite) { cancel } val data: RDD[Row] = sc.makeRDD(List(row3, row4)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .mode("append") .save() testTiDBSelect(Seq(row3, row4)) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 107
Source File: WriteDDLConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import com.pingcap.tikv.exception.TiBatchWriteException import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteDDLConflictSuite extends ConcurrencyTest { test("write ddl conflict using TableLock") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)") } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } test("write ddl conflict using SchemaVersionCheck") { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleepBeforeQuery) jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)") } }).start() val caught = intercept[TiBatchWriteException] { val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterPrewriteSecondaryKey", sleepBeforeQuery * 2) .option("useTableLock", "false") .mode("append") .save() } assert(caught.getMessage.equals("schema has changed during prewrite!")) } }
Example 108
Source File: WriteDDLNotConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteDDLNotConflictSuite extends ConcurrencyTest { test("ddl after GetCommitTS: add column") { doTest(s"alter table $dbtable ADD Email varchar(255)") } test("ddl after GetCommitTS: delete column") { doTest(s"alter table $dbtable drop column s") } test("ddl after GetCommitTS: rename column") { doTest(s"alter table $dbtable CHANGE s s2 varchar(128)") } test("ddl after GetCommitTS: change column type") { doTest(s"alter table $dbtable CHANGE i i BIGINT") } private def doTest(ddl: String): Unit = { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleepBeforeQuery) jdbcUpdate(ddl) } }).start() val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterGetCommitTS", sleepBeforeQuery * 2) .option("useTableLock", "false") .mode("append") .save() compareSelect() } }
Example 109
Source File: WriteWriteConflictSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.concurrency import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row class WriteWriteConflictSuite extends ConcurrencyTest { test("write write conflict using TableLock & jdbc") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { jdbcUpdate(s"insert into $dbtable values(5, 'test')") } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } test("write write conflict using TableLock & tispark") { if (!supportBatchWrite) { cancel } if (!isEnableTableLock) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(4, 'null')") doBatchWriteInBackground(Map("useTableLock" -> "true")) Thread.sleep(sleepBeforeQuery) val caught = intercept[java.sql.SQLException] { val data: RDD[Row] = sc.makeRDD(List(row5)) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("useTableLock", "true") .mode("append") .save() } assert( caught.getMessage .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server")) } }
Example 110
Source File: LockTimeoutSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.ttl import com.pingcap.tikv.TTLManager import com.pingcap.tikv.exception.GrpcException import com.pingcap.tispark.datasource.BaseDataSourceTest import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") { private val row1 = Row(1, "Hello") private val schema = StructType( List(StructField("i", IntegerType), StructField("s", StringType))) override def beforeAll(): Unit = { super.beforeAll() dropTable() jdbcUpdate(s"create table $dbtable(i int, s varchar(128))") } test("Test Lock TTL Timeout") { if (!supportTTLUpdate) { cancel } val seconds = 1000 val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds val data: RDD[Row] = sc.makeRDD(List(row1)) val df = sqlContext.createDataFrame(data, schema) new Thread(new Runnable { override def run(): Unit = { Thread.sleep(sleep1) queryTiDBViaJDBC(s"select * from $dbtable") } }).start() val grpcException = intercept[GrpcException] { df.write .format("tidb") .options(tidbOptions) .option("database", database) .option("table", table) .option("sleepAfterPrewritePrimaryKey", sleep2) .mode("append") .save() } assert(grpcException.getMessage.equals("retry is exhausted.")) assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed")) assert( grpcException.getCause.getCause.getMessage.startsWith( "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound")) } override def afterAll(): Unit = try { dropTable() } finally { super.afterAll() } }
Example 111
Source File: EmployeeRelationship.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.graphx import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.graphx.{ Edge, Graph } object EmployeeRelationship { def main(args: Array[String]): Unit = { // vertex format: vertex_id, data val vertexArray = Array( (1L, ("John", "Software Developer")), (2L, ("Robert", "Technical Leader")), (3L, ("Charlie", "Software Architect")), (4L, ("David", "Software Developer")), (5L, ("Edward", "Software Development Manager")), (6L, ("Francesca", "Software Development Manager"))) // edge format: from_vertex_id, to_vertex_id, data val edgeArray = Array( Edge(2L, 1L, "Technical Mentor"), Edge(2L, 4L, "Technical Mentor"), Edge(3L, 2L, "Collaborator"), Edge(6L, 3L, "Team Member"), Edge(4L, 1L, "Peers"), Edge(5L, 2L, "Team Member"), Edge(5L, 3L, "Team Member"), Edge(5L, 6L, "Peers")) val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob")) val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray) val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray) val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD) // Vanilla query println(">>> Showing the names of people who are Software Developers") graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") } .collect() .foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") } // Connection analysis println(">>> People connected to Robert (Technical Leader) -> ") graph.triplets.filter(_.srcId == 2).collect() .foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) } println(">>> Robert (Technical Leader) connected to -> ") graph.triplets.filter(_.dstId == 2).collect() .foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) } println(">>> Technical Mentoring Analysis -> ") graph.triplets.filter(_.attr.equals("Technical Mentor")).collect() .foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) } } }
Example 112
Source File: PurchaseLogAnalysis.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkContext, SparkConf } import org.apache.spark.rdd.RDD object PurchaseLogAnalysis { def main(args: Array[String]): Unit = { val ctx = new SparkContext(new SparkConf().setAppName("PurchaseAnalysisJob")) val badPkts = ctx.accumulator(0, "Bad Packets") val zeroValueSales = ctx.accumulator(0, "Zero Value Sales") val missingFields = ctx.accumulator(0, "Missing Fields") val blankLines = ctx.accumulator(0, "Blank Lines") ctx.textFile("file:/media/linux-1/spark-dev/data/purchases.log", 4) .foreach { line => if (line.length() == 0) blankLines += 1 else if (line.contains("Bad data packet")) badPkts += 1 else { val fields = line.split("\t") if (fields.length != 4) missingFields += 1 else if (fields(3).toFloat == 0) zeroValueSales += 1 } } println("Purchase Log Analysis Counters:") println(s"\tBad Data Packets=${badPkts.value}") println(s"\tZero Value Sales=${zeroValueSales.value}") println(s"\tMissing Fields=${missingFields.value}") println(s"\tBlank Lines=${blankLines.value}") } }
Example 113
Source File: TestBroadcastVariables.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

// The snippet omitted the enclosing object (and its main method); a minimal wrapper is
// reconstructed here so that loadCSVFile compiles as shown.
object TestBroadcastVariables {

  def loadCSVFile(filename: String): Option[Map[String, String]] = {
    val countries = Map[String, String]()
    Try {
      val bufferedSource = Source.fromFile(filename)
      for (line <- bufferedSource.getLines) {
        val Array(country, capital) = line.split(",").map(_.trim)
        countries += country -> capital
      }
      bufferedSource.close()
      return Some(countries)
    }.toOption
  }
}
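The broadcast part of the example was not included in the snippet above. The sketch below shows, under stated assumptions, how the loaded country map could be broadcast once and read from a transformation; the object and method names here are illustrative, not the project's.

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import scala.collection.mutable.Map

object BroadcastUsageSketch {
  // Broadcast the lookup table once; each task reads it via bcLookup.value instead of
  // having the map shipped with every closure.
  def capitalsOf(sc: SparkContext, countriesCsv: String, countries: RDD[String]): RDD[String] =
    examples.TestBroadcastVariables.loadCSVFile(countriesCsv) match {
      case Some(lookup) =>
        val bcLookup: Broadcast[Map[String, String]] = sc.broadcast(lookup)
        countries.map(c => s"$c -> ${bcLookup.value.getOrElse(c, "unknown")}")
      case None => sc.emptyRDD[String]
    }
}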
Example 114
Source File: TestAccumulators.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD

// The snippet omitted the enclosing object, the first method's signature and the
// accumulator definitions; they are reconstructed minimally below (names are assumed)
// so that the shown code compiles.
object TestAccumulators {

  def usingAccumulators(sc: SparkContext, rdd: RDD[String]): Unit = {
    val totalLines = sc.accumulator(0, "Total Lines")
    val errorLines = sc.accumulator(0, "Error Lines")
    val infoLines = sc.accumulator(0, "Info Lines")
    val warnLines = sc.accumulator(0, "Warning Lines")

    rdd.foreach { line =>
      if (line.length() > 0) totalLines += 1

      if (line.startsWith("error:")) errorLines += 1
      else if (line.startsWith("info:")) infoLines += 1
      else if (line.startsWith("warn:")) warnLines += 1
    }

    println(s">>> [Using Accumulators] Total: ${totalLines.value}, Error: ${errorLines.value}, Warnings: ${warnLines.value}, Info: ${infoLines.value}")
  }

  def usingRDDTransformations(sc: SparkContext, rdd: RDD[String]): Unit = {
    val errorLines = rdd.filter(_.startsWith("error:")).count()
    val infoLines = rdd.filter(_.startsWith("info:")).count()
    val warnLines = rdd.filter(_.startsWith("warn:")).count()

    println(s">>> [Using RDD Transformations] Error: $errorLines, Warnings: $warnLines, Info: $infoLines")
  }
}
Example 115
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import scala.Iterator object TestJoins { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob")) val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache inspectRDD(x) inspectRDD(y) println(">>> joining x with y") val joinRDD = x.join(y).cache joinRDD.collect().foreach(println) inspectRDD(joinRDD) println(">>> left outer join of x with y") val leftJoin = x.leftOuterJoin(y).cache leftJoin.collect().foreach(println) inspectRDD(leftJoin) println(">>> right outer join of x with y") val rightJoin = x.rightOuterJoin(y).cache rightJoin.collect().foreach(println) inspectRDD(rightJoin) } def inspectRDD[T](rdd: RDD[T]): Unit = { println(">>> Partition length...") rdd.mapPartitions(f => Iterator(f.length), true).foreach(println) println(">>> Partition data...") rdd.foreachPartition(f => f.foreach(println)) } }
Example 116
Source File: RedisSourceRdd.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis.stream import com.redislabs.provider.redis.RedisConfig import com.redislabs.provider.redis.util.ConnectionUtils.withConnection import org.apache.spark.rdd.RDD import org.apache.spark.sql.redis.stream.RedisSourceTypes.StreamEntry import org.apache.spark.{Partition, SparkContext, TaskContext} class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig, offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true) extends RDD[StreamEntry](sc, Nil) { override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = { val partition = split.asInstanceOf[RedisSourceRddPartition] val offsetRange = partition.offsetRange val streamReader = new RedisStreamReader(redisConfig) streamReader.unreadStreamEntries(offsetRange) } override protected def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (e, i) => RedisSourceRddPartition(i, e) } .toArray } } case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange) extends Partition
Example 117
Source File: ManyValueBenchmarkSuite.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.df.benchmark import com.redislabs.provider.redis.env.RedisClusterEnv import com.redislabs.provider.redis.util.Person import org.apache.spark.rdd.RDD trait ManyValueBenchmarkSuite extends DataframeBenchmarkSuite with RedisClusterEnv { private def num = 1000000 override def suiteTags: String = s"${super.suiteTags}, Many:$num" override def rdd(): RDD[Person] = { val partitionsNum = 8 val sectionLength = num / partitionsNum spark.sparkContext .parallelize(0 until partitionsNum, partitionsNum) .mapPartitions { _ .flatMap { i => val start = i * sectionLength val end = start + sectionLength + 1 Stream.range(start, end) } .map { i => Person(s"John-$i", 30, "60 Wall Street", 150.5) } } } }
Example 118
Source File: Dijkstra.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.iiot.shortestpath

import org.apache.spark.graphx.GraphLoaderPlus
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// The snippet omitted the enclosing object and main method; a minimal wrapper is added here.
object Dijkstra {

  def main(args: Array[String]): Unit = {
    if (args.length < 2) sys.error("Usage: inputFileName sourceId [outputFileDirectory]")
    val inputFile = args(0)
    val sourceId: VertexId = args(1).toInt

    val sc = new SparkContext(new SparkConf().setAppName("Dijkstra Algorithm"))
    val graph = GraphLoaderPlus.edgeListFile(sc, inputFile)
    // `mapEdges` sometimes may be needed, such as
    // `g.mapEdges(e => (new scala.util.Random).nextInt(100))`
    val g = graph.mapVertices((id, _) =>
      if (id == sourceId) Array(0.0, id) else Array(Double.PositiveInfinity, id)
    )

    val sssp = g.pregel(Array(Double.PositiveInfinity, -1))(
      (id, dist, newDist) => {
        if (dist(0) < newDist(0)) dist else newDist
      },
      triplet => {
        if (triplet.srcAttr(0) + triplet.attr < triplet.dstAttr(0)) {
          Iterator((triplet.dstId, Array(triplet.srcAttr(0) + triplet.attr, triplet.srcId)))
        } else {
          Iterator.empty
        }
      },
      (a, b) => {
        if (a(0) < b(0)) a else b
      }
    )

    val format_sssp: RDD[String] = sssp.vertices.map(vertex =>
      "Vertex " + vertex._1 + ": distance is " + vertex._2(0) +
        ", previous node is Vertex " + vertex._2(1).toInt)
    format_sssp.collect.foreach(println(_))

    if (args.length > 2) {
      val outputFileDir = args(2)
      format_sssp.saveAsTextFile(outputFileDir)
    }
  }
}
Example 119
Source File: ReplicatedVertexView.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._

// The class header was missing from this snippet; it is restored below (following the
// GraphX sources) so that `edges`, `hasSrcId` and `hasDstId` used in updateVertices are
// defined. The view wraps the edge RDD plus two flags recording which vertex attributes
// have already been shipped to the edge partitions.
private[impl]
class ReplicatedVertexView[VD: ClassTag, ED: ClassTag](
    var edges: EdgeRDDImpl[ED, VD],
    var hasSrcId: Boolean = false,
    var hasDstId: Boolean = false) {

  def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = {
    val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId)
      .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format(
        hasSrcId, hasDstId))
      .partitionBy(edges.partitioner.get)

    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) {
      (ePartIter, shippedVertsIter) => ePartIter.map {
        case (pid, edgePartition) =>
          (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator)))
      }
    })
    new ReplicatedVertexView(newEdges, hasSrcId, hasDstId)
  }
}
Example 120
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 121
Source File: RoutingTablePartition.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.util.collection.{BitSet, PrimitiveVector}

import org.apache.spark.graphx._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage

private[graphx] object RoutingTablePartition {
  // Only the message type alias is kept here; the rest of the companion (message encoding
  // helpers and the per-edge-partition scan that builds the table) was elided in the snippet.
  type RoutingTableMessage = (VertexId, Int)
}

// The method shown in the snippet belongs to the RoutingTablePartition class, whose
// declaration was omitted; it is restored below (following the GraphX sources) so that
// `routingTable` is defined. Other members of the class are elided.
private[graphx]
class RoutingTablePartition(
    private val routingTable: Array[(Array[VertexId], BitSet, BitSet)]) extends Serializable {

  def foreachWithinEdgePartition
      (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean)
      (f: VertexId => Unit) {
    val (vidsCandidate, srcVids, dstVids) = routingTable(pid)
    val size = vidsCandidate.length
    if (includeSrc && includeDst) {
      // Avoid checks for performance
      vidsCandidate.iterator.foreach(f)
    } else if (!includeSrc && !includeDst) {
      // Do nothing
    } else {
      val relevantVids = if (includeSrc) srcVids else dstVids
      relevantVids.iterator.foreach { i => f(vidsCandidate(i)) }
    }
  }
}
Example 122
Source File: SparkBatchAdapter.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.eventuate.adapter.spark

import akka.actor.ActorSystem
import akka.serialization.SerializationExtension

import com.datastax.spark.connector._
import com.datastax.spark.connector.types._

import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.log.cassandra.CassandraEventLogSettings

import com.typesafe.config._

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// The adapter's class header was not included in the snippet. The reconstruction below
// assumes it wraps a SparkContext plus the event-log config, derives the Cassandra
// settings from that config and registers the custom converter defined further down;
// the real constructor and initialization may differ in detail.
class SparkBatchAdapter(val context: SparkContext, config: Config) {

  private val cassandraSettings = new CassandraEventLogSettings(config)

  TypeConverter.registerConverter(new DurableEventConverter(config))

  def eventBatch(logId: String, fromSequenceNr: Long = 1L): RDD[DurableEvent] = {
    context.cassandraTable(cassandraSettings.keyspace, s"${cassandraSettings.tablePrefix}_$logId")
      .select("event").where(s"sequence_nr >= $fromSequenceNr").as((event: DurableEvent) => event)
  }
}

private class DurableEventConverter(config: Config) extends TypeConverter[DurableEvent] {
  import scala.reflect.runtime.universe._

  val converter = implicitly[TypeConverter[Array[Byte]]]

  // --------------------------------------
  // FIXME: how to shutdown actor system?
  // --------------------------------------
  @transient lazy val system = ActorSystem("TypeConverter", config)
  @transient lazy val serial = SerializationExtension(system)

  def targetTypeTag = implicitly[TypeTag[DurableEvent]]
  def convertPF = {
    case obj => deserialize(converter.convert(obj))
  }

  def deserialize(bytes: Array[Byte]): DurableEvent =
    serial.deserialize(bytes, classOf[DurableEvent]).get
}
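A usage sketch under the same assumption about the adapter's constructor; the connection host, config source and log id are illustrative.

import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.adapter.spark.SparkBatchAdapter
import com.typesafe.config.ConfigFactory
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object BatchReplicationSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("eventuate-batch-sketch")
      .set("spark.cassandra.connection.host", "127.0.0.1") // illustrative
    val sc = new SparkContext(sparkConf)

    // The adapter reads a whole event log from the Cassandra backend as an RDD.
    val adapter = new SparkBatchAdapter(sc, ConfigFactory.load())
    val events: RDD[DurableEvent] = adapter.eventBatch("example-log", fromSequenceNr = 1L)
    events.take(10).foreach(println)

    sc.stop()
  }
}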
Example 123
Source File: GenerateVerticesExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }

    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions): Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))
  }
}
// scalastyle:on println
Example 124
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Transform { def main(args: Array[String]) { if (args.length != 2) { throw new IllegalArgumentException("Invalid arguments") } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) run(lines, blackList) ssc.start() ssc.awaitTermination() } def run(stream: InputDStream[String], blackList: RDD[(String, String)]) { val userList = stream.map(x => (x, "action:Login")).transform(rdd => { val tmpUserList = rdd.leftOuterJoin(blackList) tmpUserList.filter(user => user._2._2.isEmpty) }) userList.print() } }
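The core of gihyo_6_3_Transform is the leftOuterJoin against a blacklist inside transform. Below is a minimal, self-contained sketch of that same join-and-filter step on plain RDDs, without the streaming layer; the object name and sample data are illustrative and not part of the book's code.

import org.apache.spark.{SparkConf, SparkContext}

object BlacklistFilterSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("BlacklistFilterSketch"))
    // users observed in the stream, keyed for the join
    val logins = sc.parallelize(Seq("user001", "user002", "user003")).map(u => (u, "action:Login"))
    // blacklisted users with the reason they were blocked
    val blackList = sc.parallelize(Seq(("user002", "rockLogin"), ("user003", "rockPayment")))
    // keep only the users that have no matching blacklist entry
    val allowed = logins.leftOuterJoin(blackList).filter { case (_, (_, blocked)) => blocked.isEmpty }
    allowed.collect().foreach(println) // (user001,(action:Login,None))
    sc.stop()
  }
}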
Example 125
Source File: gihyo_6_3_JoinSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_JoinSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines1 = mutable.Queue[RDD[String]]() val ds1 = ssc.queueStream(lines1) val lines2 = mutable.Queue[RDD[String]]() val ds2 = ssc.queueStream(lines2) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Join.run(ds1, ds2) ssc.start() lines1 += sc.makeRDD(Seq("key1", "key2", "key3")) // test data lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 126
Source File: gihyo_6_3_CountByValueAndWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper import java.nio.file.Files class gihyo_6_3_CountByValueAndWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) gihyo_6_3_countByValueAndWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 127
Source File: gihyo_6_3_MapSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_MapSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Map.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 128
Source File: gihyo_6_3_TwitterStreamSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import scala.io.Source import twitter4j.{Status, TwitterObjectFactory} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} class gihyo_6_3_TwitterStreamSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[Status]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_TwitterStream.run( sc, ds, Files.createTempDirectory("TwitterTag").toString, Files.createTempDirectory("TwitterWords").toString) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => // test data lines += sc.makeRDD(Seq( MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson(), MockTweetGenerator.createMockStatusFromJson())) clock.advance(1000) Thread.sleep(1000) } } } object MockTweetGenerator { // Creates a tweet status from a JSON file def createMockStatusFromJson(): Status = { val jsonFile = getClass.getResource("/streaming/test-tweet.json").getPath TwitterObjectFactory.createStatus(Source.fromFile(jsonFile).getLines().mkString) } }
Example 129
Source File: gihyo_6_3_FilterSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_FilterSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Filter.run(ds) ssc.start() lines += sc.makeRDD(Seq("lengthOver5", "les1", "les2")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 130
Source File: gihyo_6_3_FlatMapSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_FlatMapSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_flatMap.run(ds) ssc.start() // test data lines += sc.makeRDD(Seq("Apache Spark is a fast and general-purpose cluster computing system.")) clock.advance(1000) Thread.sleep(1000) } }
Example 131
Source File: gihyo_6_3_CountSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Count.run(ds, 2, 1) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 132
Source File: gihyo_6_3_UnionSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_UnionSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = (1 to 3).map(x => mutable.Queue[RDD[(String, String)]]()) val dss = lines.map(x => ssc.queueStream(x)) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Union.run(ssc, dss) ssc.start() lines.map(x => x += sc.makeRDD(Seq(("", "key1"), ("", "key2"), ("", "key3")))) //test data clock.advance(1000) Thread.sleep(1000) } }
Example 133
Source File: gihyo_6_3_ReduceByKeyAndWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeyAndWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKeyAndWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 134
Source File: gihyo_6_3_ReduceByKeySuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeySuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKey.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 135
Source File: gihyo_6_3_CountByWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountByWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) gihyo_6_3_countByWindow.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 136
Source File: gihyo_6_3_UpdateStateByKeySuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_UpdateStateByKeySuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_updateStateByKey.run(ds) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 137
Source File: gihyo_6_3_RepartitionSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_RepartitionSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Repartition.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 138
Source File: gihyo_6_3_ReduceByKeyAndWindowEfficientSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import java.nio.file.Files import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByKeyAndWindowEfficientSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByKeyAndWindow_efficient.run(ds, 2, 1) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 139
Source File: gihyo_6_3_KafkaStreamSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import java.nio.file.Files import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_KafkaStreamSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[(String, String)]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_KafkaStream.run(ds, Files.createTempDirectory("KafkaStreamSuite").toString, 2, 1) val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString ssc.checkpoint(checkpointDir) ssc.start() (1 to 2).foreach { case i => lines += sc.makeRDD(Seq(("", "userid:userid001,action:view,pageid:value1"), ("", "userid:userid002,action:click,pageid:value2"), ("", "userid:userid003,action:view,pageid:value3"), ("", "userid:userid001,action:view,pageid:value4"))) // test data clock.advance(1000) Thread.sleep(1000) } } }
Example 140
Source File: gihyo_6_3_WiindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import scala.collection.mutable import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_WindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Window.run(ds, 2, 1) ssc.start() (1 to 3).foreach { case i => { lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data clock.advance(1000) Thread.sleep(1000) } } } }
Example 141
Source File: gihyo_6_3_CogroupSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CogroupSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val lines2 = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val ds2 = ssc.queueStream(lines2) val clock = new StreamingContextWrapper(ssc).manualClock lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data gihyo_6_3_Cogroup.run(ds, ds2) ssc.start() clock.advance(1000) Thread.sleep(1000) } }
Example 142
Source File: gihyo_6_2_1_SampleSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_2_1_SampleSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock lines += sc.makeRDD(Seq("word1 word2", "word3 word1", "word4 word2")) // test data gihyo_6_2_1_Sample.run(ds) ssc.start() clock.advance(1000) Thread.sleep(1000) } }
Example 143
Source File: gihyo_6_3_TransformSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_TransformSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) gihyo_6_3_Transform.run(ds, blackList) ssc.start() lines += sc.makeRDD(Seq("user001", "user002", "user003")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 144
Source File: gihyo_6_3_CountByValueSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_CountByValueSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_countByValue.run(ds) ssc.start() lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 145
Source File: gihyo_6_3_ReduceSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_Reduce.run(ds) ssc.start() lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data clock.advance(1000) Thread.sleep(1000) } }
Example 146
Source File: gihyo_6_3_ReduceByWindowSuite.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import scala.collection.mutable import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContextWrapper class gihyo_6_3_ReduceByWindowSuite extends SparkFunSuite with TestStreamingContext { test("run") { val lines = mutable.Queue[RDD[String]]() val ds = ssc.queueStream(lines) val clock = new StreamingContextWrapper(ssc).manualClock gihyo_6_3_reduceByWindow.run(ds, 2, 1) ssc.start() (1 to 2).foreach { case i => { lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data clock.advance(1000) Thread.sleep(1000) } } } }
Example 147
Source File: FileReader.scala From bdd-spark with MIT License | 5 votes |
import org.apache.spark.rdd.RDD trait FileReader { def readLinesToRdd(filename : String) : RDD[String] def readText(filename : String) : String } object FileReader { class RealFileReader extends FileReader{ override def readLinesToRdd(filename: String): RDD[String] = { Spark.spark.sparkContext.textFile(filename) } override def readText(filename: String): String = { //Whatever! "" } } def apply() : FileReader = new RealFileReader }
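Because FileReader is a trait, BDD tests can swap in a stub that never touches disk. A possible sketch, assuming the project's Spark helper object shown above is on the classpath; the stub class name and canned data are made up for illustration:

import org.apache.spark.rdd.RDD

// Returns canned lines instead of reading a real file.
class StubFileReader(lines: Seq[String]) extends FileReader {
  override def readLinesToRdd(filename: String): RDD[String] =
    Spark.spark.sparkContext.parallelize(lines)

  override def readText(filename: String): String = lines.mkString("\n")
}

// Usage: val reader: FileReader = new StubFileReader(Seq("a,b", "c,d"))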
Example 148
Source File: RecommendationModelReuse.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating import scala.Tuple2 import org.apache.spark.rdd.RDD object RecommendationModelReuse { def main(args: Array[String]): Unit = { val spark: SparkSession = SparkSession .builder() .appName("JavaLDAExample") .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/"). getOrCreate() val ratigsFile = "data/ratings.csv" val ratingDF = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile) val selectedRatingsDF = ratingDF.select(ratingDF.col("userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp")) // Randomly split ratings RDD into training data RDD (75%) and test data RDD (25%) val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L) val testData = splits(1) val testRDD = testData.rdd.map(row => { val userId = row.getString(0) val movieId = row.getString(1) val ratings = row.getString(2) Rating(userId.toInt, movieId.toInt, ratings.toDouble) }) // Load the saved model back val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/") // Making predictions. Get the top 10 movie predictions for user 458 println("Rating:(UserID, MovieID, Rating)") println("----------------------------------") val topRecsForUser = same_model.recommendProducts(458, 10) for (rating <- topRecsForUser) { println(rating.toString()) } println("----------------------------------") val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true) println("Test RMSE: = " + rmseTest) // Lower is better // Movie recommendation for a specific user. Get the top 10 movie predictions for user 458 println("Recommendations: (MovieId => Rating)") println("----------------------------------") val recommendationsUser = same_model.recommendProducts(458, 10) recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) println("----------------------------------") spark.stop() } }
Example 149
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.SQLImplicits import org.apache.spark.sql._ import org.apache.spark.sql.Dataset import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating import scala.Tuple2 import org.apache.spark.rdd.RDD object MovieRecommendation { //Compute the RMSE to evaluate the model. Less the RMSE better the model and it's prediction capability. def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = { val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }.join(data.map(x => ((x.user, x.product), x.rating))).values if (implicitPrefs) { println("(Prediction, Rating)") println(predictionsAndRatings.take(5).mkString("\n")) } math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean()) } def main(args: Array[String]): Unit = { val spark: SparkSession = SparkSession .builder() .appName("JavaLDAExample") .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/"). getOrCreate() val ratigsFile = "data/ratings.csv" val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile) val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp")) ratingsDF.show(false) val moviesFile = "data/movies.csv" val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile) val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres")) moviesDF.show(false) ratingsDF.createOrReplaceTempView("ratings") moviesDF.createOrReplaceTempView("movies") var rmseTest = computeRmse(model, testRDD, true) println("Test RMSE: = " + rmseTest) //Less is better //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668 println("Recommendations: (MovieId => Rating)") println("----------------------------------") val recommendationsUser = model.recommendProducts(668, 6) recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) println("----------------------------------") spark.stop() } }
Example 150
Source File: HbRddWriter.scala From hbrdd with Apache License 2.0 | 5 votes |
package top.spoofer.hbrdd.hbsupport import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.spark.rdd.RDD import top.spoofer.hbrdd.config.HbRddConfig import top.spoofer.hbrdd.unit.HbRddFormatsWriter import top.spoofer.hbrdd._ import HbRddWritPuter._ trait HbRddWriter { type TsValue[A] = (Long, A) // (ts, A) val LATEST_TIMESTAMP = Long.MaxValue final class SingleFamilyRDDWriter[A]( val rdd: RDD[(String, Map[String, A])], val put: HbRddPuter[A] ) extends HbRddWritCommon[A] with Serializable { def put2Hbase(tableName: String, family: String)(implicit config: HbRddConfig) = { val job = createJob(tableName, config.getHbaseConfig) rdd.flatMap({ case (rowId, data) => convert2Writable(rowId, Map(family -> data), put) }) .saveAsNewAPIHadoopDataset(job.getConfiguration) } }
Example 151
Source File: XmlReader.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import org.apache.spark.sql.types.StructType import com.databricks.spark.xml.util.XmlFile import com.databricks.spark.xml.util.FailFastMode @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0") def xmlFile(sqlContext: SQLContext, path: String): DataFrame = { // We need the `charset` and `rowTag` before creating the relation. val (charset, rowTag) = { val options = XmlOptions(parameters.toMap) (options.charset, options.rowTag) } val relation = XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0") def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = { val relation = XmlRelation( () => xmlRDD, None, parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } }
Example 152
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
Example 153
Source File: XmlRelation.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import java.io.IOException import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.sources.{PrunedScan, InsertableRelation, BaseRelation, TableScan} import org.apache.spark.sql.types._ import com.databricks.spark.xml.util.{InferSchema, XmlFile} import com.databricks.spark.xml.parsers.StaxXmlParser case class XmlRelation protected[spark] ( baseRDD: () => RDD[String], location: Option[String], parameters: Map[String, String], userSchema: StructType = null)(@transient val sqlContext: SQLContext) extends BaseRelation with InsertableRelation with PrunedScan { private val options = XmlOptions(parameters) override val schema: StructType = { Option(userSchema).getOrElse { InferSchema.infer( baseRDD(), options) } } override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val requiredFields = requiredColumns.map(schema(_)) val requestedSchema = StructType(requiredFields) StaxXmlParser.parse( baseRDD(), requestedSchema, options) } // The function below was borrowed from JSONRelation override def insert(data: DataFrame, overwrite: Boolean): Unit = { val filesystemPath = location match { case Some(p) => new Path(p) case None => throw new IOException(s"Cannot INSERT into table with no path defined") } val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) if (overwrite) { try { fs.delete(filesystemPath, true) } catch { case e: IOException => throw new IOException( s"Unable to clear output directory ${filesystemPath.toString} prior" + s" to INSERT OVERWRITE a XML table:\n${e.toString}") } // Write the data. We assume that schema isn't changed, and we won't update it. XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters) } else { throw new IllegalArgumentException("XML tables only support INSERT OVERWRITE for now.") } } }
Example 154
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted import org.scalactic.Equality import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.{ Dataset, SparkSession } object SparkSuite { lazy val spark: SparkSession = { val session = SparkSession.builder .master("local[*]") .appName("test") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.ui.enabled", false) .config("spark.sql.shuffle.partitions", 4) .getOrCreate() session } lazy val sc: SparkContext = spark.sparkContext lazy val jsc = new JavaSparkContext(sc) def javaSparkContext() = jsc } trait SparkSuite { implicit lazy val spark: SparkSession = SparkSuite.spark implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] { private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size) def areEqual(a: RDD[X], b: Any): Boolean = b match { case s: Seq[_] => toCounts(a.collect) == toCounts(s) case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect) } } implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] { def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b) } implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] { def areEqual(a: Dataset[X], b: Any): Boolean = b match { case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd) case x => rddEq.areEqual(a.rdd, x) } } }
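A short sketch of how the implicit Equality instances above would be used from a ScalaTest suite; the test class and data are illustrative, and the exact FunSuite flavour depends on the ScalaTest version the project targets:

import org.scalatest.FunSuite
import com.tresata.spark.sorted.SparkSuite

class RddEqualitySpec extends FunSuite with SparkSuite {
  test("an RDD compares equal to a Seq with the same multiset of elements") {
    val rdd = sc.parallelize(Seq(1, 2, 2, 3))
    // rddEq ignores ordering but respects element counts
    assert(rdd === Seq(3, 2, 1, 2))
  }
}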
Example 155
Source File: BinaryClassifierEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import org.apache.spark.rdd.RDD def evaluate(predictions: RDD[Boolean], actuals: RDD[Boolean]): BinaryClassificationMetrics = { predictions.zip(actuals).map { case (pred, actual) => val tp = if (pred && actual) 1d else 0d val fp = if (pred && !actual) 1d else 0d val tn = if (!pred && !actual) 1d else 0d val fn = if (!pred && actual) 1d else 0d BinaryClassificationMetrics(tp, fp, tn, fn) }.reduce(_ merge _) } }
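The evaluator above reduces per-record confusion-matrix counts with zip and reduce. Here is a standalone sketch of that same pattern using only Spark core; the Confusion case class is illustrative and is not KeystoneML's BinaryClassificationMetrics.

import org.apache.spark.{SparkConf, SparkContext}

object ConfusionSketch {
  case class Confusion(tp: Long, fp: Long, tn: Long, fn: Long) {
    def merge(other: Confusion): Confusion =
      Confusion(tp + other.tp, fp + other.fp, tn + other.tn, fn + other.fn)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ConfusionSketch"))
    val predictions = sc.parallelize(Seq(true, true, false, false), 2)
    val actuals     = sc.parallelize(Seq(true, false, false, true), 2)
    // zip assumes both RDDs are partitioned identically, as in the evaluator above
    val cm = predictions.zip(actuals).map { case (p, a) =>
      Confusion(if (p && a) 1 else 0, if (p && !a) 1 else 0, if (!p && !a) 1 else 0, if (!p && a) 1 else 0)
    }.reduce(_ merge _)
    println(s"precision = ${cm.tp.toDouble / (cm.tp + cm.fp)}, recall = ${cm.tp.toDouble / (cm.tp + cm.fn)}")
    sc.stop()
  }
}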
Example 156
Source File: AugmentedExamplesEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg._ import keystoneml.nodes.util.MaxClassifier import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object AggregationPolicyType extends Enumeration { type AggregationPolicyType = Value val average, borda = Value } class AugmentedExamplesEvaluator[T : ClassTag]( names: RDD[T], numClasses: Int, policy: AggregationPolicyType.Value = AggregationPolicyType.average) extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable { def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { preds.reduce(_ + _) :/ preds.size.toDouble } def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { val ranks = preds.map { vec => val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2) val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble)) rank } ranks.reduceLeft(_ + _) } def evaluate( predicted: RDD[DenseVector[Double]], actualLabels: RDD[Int]): MulticlassMetrics = { val aggFunc = policy match { case AggregationPolicyType.borda => bordaPolicy _ case _ => averagePolicy _ } // associate a name with each predicted, actual val namedPreds = names.zip(predicted.zip(actualLabels)) // group by name to get all the predicted values for a name val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) => val predActuals = iter.toArray // this is a array of tuples val predsForName = predActuals.map(_._1) assert(predActuals.map(_._2).distinct.size == 1) val actualForName: Int = predActuals.map(_._2).head (predsForName, actualForName) }.cache() // Averaging policy val finalPred = groupedPreds.map(x => (aggFunc(x._1), x._2) ) val finalPredictedLabels = MaxClassifier(finalPred.map(_._1)) val finalActualLabels = finalPred.map(_._2) val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels) groupedPreds.unpersist() ret } }
Example 157
Source File: MeanAveragePrecisionEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ private def getAP(precisions: Array[Double], recalls: Array[Double]) = { var ap = 0.0 val levels = (0 to 10).map(x => x / 10.0) levels.foreach { t => // Find where recalls are greater than t and precision values at those indices val px = recalls.toSeq.zipWithIndex.filter(x => x._1 >= t).map(x => precisions(x._2)) val p = if (px.isEmpty) { 0.0 } else { px.max } ap = ap + p / 11.0 } ap } }
Example 158
Source File: Stats.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils import java.util.{Random => JRandom} import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import breeze.stats.distributions._ import keystoneml.nodes.util.TopKClassifier import org.apache.spark.rdd.RDD object Stats extends Serializable { def normalizeRows(mat: DenseMatrix[Double], alpha: Double = 1.0): DenseMatrix[Double] = { // FIXME: This currently must convert the matrices to double due to breeze implicits // TODO: Could optimize, use way fewer copies val rowMeans: DenseVector[Double] = mean(mat(*, ::)).map(x => if (x.isNaN) 0 else x) val variances: DenseVector[Double] = sum((mat(::, *) - rowMeans) :^= 2.0, Axis._1) :/= (mat.cols.toDouble - 1.0) val sds: DenseVector[Double] = sqrt(variances + alpha.toDouble).map(x => if (x.isNaN) math.sqrt(alpha) else x) val out = mat(::, *) - rowMeans out(::, *) /= sds out } }
Example 159
Source File: GatherTransformerOperator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.rdd.RDD private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator { override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { inputs.map(_.get.asInstanceOf[T]) } override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { inputs.map(_.get.asInstanceOf[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => { x.zip(y).map(z => z._1 ++ z._2) }) } }
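GatherTransformerOperator's batchTransform zips several equally-partitioned RDDs element by element into one RDD of Seqs. A minimal standalone sketch of that zip-and-concatenate step; the object name and data are illustrative:

import org.apache.spark.{SparkConf, SparkContext}

object GatherSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("GatherSketch"))
    val a = sc.parallelize(Seq(1, 2, 3), 2)
    val b = sc.parallelize(Seq(10, 20, 30), 2)
    val c = sc.parallelize(Seq(100, 200, 300), 2)
    // Wrap each element in a Seq, then zip pairwise and concatenate, mirroring batchTransform.
    val gathered = Seq(a, b, c)
      .map(_.map(x => Seq(x)))
      .reduceLeft((x, y) => x.zip(y).map { case (l, r) => l ++ r })
    gathered.collect().foreach(println) // List(1, 10, 100), List(2, 20, 200), List(3, 30, 300)
    sc.stop()
  }
}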
Example 160
Source File: PipelineDataset.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.workflow import org.apache.spark.rdd.RDD class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId) extends PipelineResult[RDD[T]]( executor, sink) object PipelineDataset { private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = { val emptyGraph = Graph(Set(), Map(), Map(), Map()) val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq()) val (graph, sinkId) = graphWithDataset.addSink(nodeId) new PipelineDataset[T](new GraphExecutor(graph), sinkId) } }
Example 161
Source File: KernelMatrix.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import scala.collection.mutable.HashMap import scala.reflect.ClassTag import breeze.linalg._ import org.apache.spark.rdd.RDD import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.{Transformer, LabelEstimator} class BlockKernelMatrix[T: ClassTag]( val kernelGen: KernelTransformer[T], val data: RDD[T], val cacheKernel: Boolean) extends KernelMatrix { val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]] val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]] def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = { if (colBlockCache.contains(colIdxs)) { colBlockCache(colIdxs) } else { val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs) if (cacheKernel) { colBlockCache += (colIdxs -> kBlock) diagBlockCache += (colIdxs -> diagBlock) } kBlock } } def unpersist(colIdxs: Seq[Int]): Unit = { if (colBlockCache.contains(colIdxs) && !cacheKernel) { colBlockCache(colIdxs).unpersist(true) } } def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = { if (!diagBlockCache.contains(idxs)) { val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs) if (cacheKernel) { colBlockCache += (idxs -> kBlock) diagBlockCache += (idxs -> diagBlock) } diagBlock } else { diagBlockCache(idxs) } } }
Example 162
Source File: LinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import edu.berkeley.cs.amplab.mlmatrix.{NormalEquations, RowPartitionedMatrix} import keystoneml.nodes.stats.{StandardScaler, StandardScalerModel} import keystoneml.nodes.util.Densify import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.{LabelEstimator, Transformer} object LinearMapEstimator extends Serializable { def apply(lambda: Option[Double] = None) = new LinearMapEstimator(lambda) def computeCost( trainingFeatures: RDD[DenseVector[Double]], trainingLabels: RDD[DenseVector[Double]], lambda: Double, x: DenseMatrix[Double], bOpt: Option[DenseVector[Double]]): Double = { val nTrain = trainingLabels.count val modelBroadcast = trainingLabels.context.broadcast(x) val bBroadcast = trainingLabels.context.broadcast(bOpt) val axb = trainingFeatures.mapPartitions(rows => { MatrixUtils.rowsToMatrixIter(rows).flatMap { rMat => val mat = rMat * modelBroadcast.value val out = bBroadcast.value.map { b => mat(*, ::) :+= b mat }.getOrElse(mat) MatrixUtils.matrixToRowArray(out).iterator } }) val cost = axb.zip(trainingLabels).map { part => val axb = part._1 val labels = part._2 val out = axb - labels math.pow(norm(out), 2) }.reduce(_ + _) if (lambda == 0) { cost/(2.0*nTrain.toDouble) } else { val wNorm = math.pow(norm(x.toDenseVector), 2) cost/(2.0*nTrain.toDouble) + lambda/2.0 * wNorm } } }
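A hedged usage sketch of LinearMapEstimator.computeCost on a toy dataset where the model fits the labels exactly, so the unregularized cost comes out as 0.0; the data and model are made up for illustration and assume KeystoneML is on the classpath.

import breeze.linalg.{DenseMatrix, DenseVector}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.nodes.learning.LinearMapEstimator

object ComputeCostSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ComputeCostSketch"))
    val features = sc.parallelize(Seq(DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)), 1)
    val labels   = sc.parallelize(Seq(DenseVector(1.0), DenseVector(0.0)), 1)
    // A 2x1 model that copies the first feature straight through to the label.
    val model = new DenseMatrix(2, 1, Array(1.0, 0.0))
    // Arguments: features, labels, lambda = 0.0, model x, no intercept (bOpt = None)
    val cost = LinearMapEstimator.computeCost(features, labels, 0.0, model, None)
    println(cost) // 0.0 for this perfectly fitted toy example
    sc.stop()
  }
}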
Example 163
Source File: LocalLeastSquaresEstimator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats._ import keystoneml.nodes.stats.StandardScalerModel import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.LabelEstimator def trainWithL2( trainingFeatures: RDD[DenseVector[Double]], trainingLabels: RDD[DenseVector[Double]], lambda: Double): LinearMapper[DenseVector[Double]] = { val A_parts = trainingFeatures.mapPartitions { x => MatrixUtils.rowsToMatrixIter(x) }.collect() val b_parts = trainingLabels.mapPartitions { x => MatrixUtils.rowsToMatrixIter(x) }.collect() val A_local = DenseMatrix.vertcat(A_parts:_*) val b_local = DenseMatrix.vertcat(b_parts:_*) val featuresMean = mean(A_local(::, *)).t val labelsMean = mean(b_local(::, *)).t val A_zm = A_local(*, ::) - featuresMean val b_zm = b_local(*, ::) - labelsMean val AAt = A_zm * A_zm.t val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm ) LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None))) } }
Example 164
Source File: LinearDiscriminantAnalysis.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.stats._ import org.apache.spark.rdd.RDD import keystoneml.utils.MatrixUtils import keystoneml.workflow.LabelEstimator override def fit(data: RDD[DenseVector[Double]], labels: RDD[Int]): LinearMapper[DenseVector[Double]] = { val sample = labels.zip(data).collect() computeLDA(sample) } def computeLDA(dataAndLabels: Array[(Int, DenseVector[Double])]): LinearMapper[DenseVector[Double]] = { val featuresByClass = dataAndLabels.groupBy(_._1).values.map(x => MatrixUtils.rowsToMatrix(x.map(_._2))) val meanByClass = featuresByClass.map(f => mean(f(::, *))) // each mean is a row vector, not col val sW = featuresByClass.zip(meanByClass).map(f => { val featuresMinusMean = f._1(*, ::) - f._2.t // row vector, not column featuresMinusMean.t * featuresMinusMean }).reduce(_+_) val numByClass = featuresByClass.map(_.rows : Double) val features = MatrixUtils.rowsToMatrix(dataAndLabels.map(_._2)) val totalMean = mean(features(::, *)) // A row-vector, not a column-vector val sB = meanByClass.zip(numByClass).map { case (classMean, classNum) => { val m = classMean - totalMean (m.t * m) :* classNum } }.reduce(_+_) val eigen = eig((inv(sW): DenseMatrix[Double]) * sB) val eigenvectors = (0 until eigen.eigenvectors.cols).map(eigen.eigenvectors(::, _).toDenseMatrix.t) val topEigenvectors = eigenvectors.zip(eigen.eigenvalues.toArray).sortBy(x => -math.abs(x._2)).map(_._1).take(numDimensions) val W = DenseMatrix.horzcat(topEigenvectors:_*) new LinearMapper(W) } }
Example 165
Source File: LeastSquaresEstimator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import keystoneml.nodes.util.{Densify, Sparsify} import org.apache.spark.rdd.RDD import keystoneml.pipelines.Logging import keystoneml.workflow._ import scala.reflect._ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag]( lambda: Double = 0, numMachines: Option[Int] = None, cpuWeight: Double = 3.8e-4, memWeight: Double = 2.9e-1, networkWeight: Double = 1.32) extends OptimizableLabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode with Logging { val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq( { val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20) (solver, solver) }, { val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20) (solver, TransformerLabelEstimatorChain(Sparsify(), solver)) }, { val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda) (solver, TransformerLabelEstimatorChain(Densify(), solver)) }, { val solver = new LinearMapEstimator(Some(lambda)) (solver, TransformerLabelEstimatorChain(Densify(), solver)) } ) override val default: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode = { new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20) } override def optimize( sample: RDD[T], sampleLabels: RDD[DenseVector[Double]], numPerPartition: Map[Int, Int]) : LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = { val n = numPerPartition.values.map(_.toLong).sum val d = sample.first().length val k = sampleLabels.first().length val sparsity = sample.map(x => x.activeSize.toDouble / x.length).sum() / sample.count() val realNumMachines = numMachines.getOrElse { if (sample.sparkContext.getExecutorStorageStatus.length == 1) { 1 } else { sample.sparkContext.getExecutorStorageStatus.length - 1 } } logDebug(s"Optimizable Param n is $n") logDebug(s"Optimizable Param d is $d") logDebug(s"Optimizable Param k is $k") logDebug(s"Optimizable Param sparsity is $sparsity") logDebug(s"Optimizable Param numMachines is $realNumMachines") options.minBy(_._1.cost(n, d, k, sparsity, realNumMachines, cpuWeight, memWeight, networkWeight))._2 } override val weight: Int = default.weight }
Example 166
Source File: SparseLinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import org.apache.spark.rdd.RDD import keystoneml.workflow.Transformer override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = { val modelBroadcast = in.context.broadcast(x) val bBroadcast = in.context.broadcast(bOpt) in.map(row => { val out = modelBroadcast.value.t * row bBroadcast.value.foreach { b => out :+= b } out }) } }
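The apply method above broadcasts a dense model and multiplies it against each sparse input row. A standalone sketch of that broadcast-and-multiply pattern with Breeze types; the model and data are illustrative and this is not SparseLinearMapper itself:

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector}
import org.apache.spark.{SparkConf, SparkContext}

object SparseApplySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("SparseApplySketch"))
    val model = DenseMatrix((1.0, 0.0), (0.0, 2.0)) // 2x2 weight matrix
    val data = sc.parallelize(Seq(SparseVector(2)(0 -> 3.0), SparseVector(2)(1 -> 4.0)))
    val modelBC = sc.broadcast(model)
    // Multiply the broadcast model against every sparse row, yielding dense outputs.
    val out = data.map(row => modelBC.value.t * row)
    out.collect().foreach(println) // DenseVector(3.0, 0.0), DenseVector(0.0, 8.0)
    sc.stop()
  }
}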
Example 167
Source File: ApproximatePCA.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import breeze.stats.distributions.{Gaussian, ThreadLocalRandomGenerator, RandBasis} import com.github.fommil.netlib.LAPACK._ import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils import org.apache.commons.math3.random.MersenneTwister import org.apache.spark.rdd.RDD import org.netlib.util.intW import keystoneml.pipelines.Logging import keystoneml.workflow.Estimator def approximateQ(A: DenseMatrix[Double], l: Int, q: Int, seed: Int = 0): DenseMatrix[Double] = { val d = A.cols val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed))) val omega = DenseMatrix.rand(d, l, Gaussian(0,1)(randBasis)) //cpu: d*l, mem: d*l val y0 = A*omega //cpu: n*d*l, mem: n*l var Q = QRUtils.qrQR(y0)._1 //cpu: n*l**2 for (i <- 1 to q) { val YHat = Q.t * A //cpu: l*n*d, mem: l*d val Qh = QRUtils.qrQR(YHat.t)._1 //cpu: d*l^2, mem: d*l val Yj = A * Qh //cpu: n*d*l, mem: n*l Q = QRUtils.qrQR(Yj)._1 //cpu: n*l^2, mem: n*l } Q } }
Example 168
Source File: DistributedPCA.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg._ import breeze.numerics._ import breeze.stats._ import com.github.fommil.netlib.LAPACK.{getInstance => lapack} import org.apache.spark.rdd.RDD import org.netlib.util.intW import keystoneml.pipelines._ import keystoneml.utils.MatrixUtils import keystoneml.workflow.{Transformer, Estimator} import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR} def fit(samples: RDD[DenseVector[Float]]): PCATransformer = { new PCATransformer(computePCA(samples, dims)) } def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = { val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part => val dblIter = part.map(x => convert(x, Double)) MatrixUtils.rowsToMatrixIter(dblIter).map(RowPartition(_)) }) val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble val meansBC = dataMat.context.broadcast(means) val zeroMeanMat = new RowPartitionedMatrix(mat.rdd.map { part => RowPartition(part.mat(*, ::) - meansBC.value) }) val rPart = new TSQR().qrR(zeroMeanMat) val svd.SVD(u, s, pcaT) = svd(rPart) val pca = convert(pcaT.t, Float) val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca) // Return a subset of the columns. matlabConventionPCA(::, 0 until dims) } override def cost( n: Long, d: Int, k: Int, sparsity: Double, numMachines: Int, cpuWeight: Double, memWeight: Double, networkWeight: Double): Double = { val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0) val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines val bytesScanned = n.toDouble * d val network = d.toDouble * d * log2NumMachines math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network } }
Example 169
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Wrappers import SparkER.DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import scala.collection.mutable.MutableList def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = { val attributes: MutableList[KeyValue] = new MutableList() for (i <- 0 to row.size - 1) { try { val value = row(i) val attributeKey = columnNames(i) if (value != null) { value match { case listOfAttributes: Iterable[Any] => listOfAttributes map { attributeValue => attributes += KeyValue(attributeKey, attributeValue.toString) } case stringAttribute: String => if (explodeInnerFields) { stringAttribute.split(innerSeparator) map { attributeValue => attributes += KeyValue(attributeKey, attributeValue) } } else { attributes += KeyValue(attributeKey, stringAttribute) } case singleAttribute => attributes += KeyValue(attributeKey, singleAttribute.toString) } } } catch { case e: Throwable => println(e) } } attributes } }
Example 170
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Wrappers import java.io.{IOException, _} import SparkER.DataStructures.Profile import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def loadSerializedObject(fileName: String): Any = { var `object`: Any = null try { val file: InputStream = new FileInputStream(fileName) val buffer: InputStream = new BufferedInputStream(file) val input: ObjectInput = new ObjectInputStream(buffer) try { `object` = input.readObject } finally { input.close } } catch { case cnfEx: ClassNotFoundException => { System.err.println(fileName) cnfEx.printStackTrace } case ioex: IOException => { System.err.println(fileName) ioex.printStackTrace } } return `object` } }
Example 171
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Utilities import SparkER.BlockBuildingMethods.TokenBlocking import org.apache.spark.rdd.RDD import SparkER.DataStructures._ import org.apache.spark.partial.PartialResult def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = { val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks => val profileID = profileWithBlocks.profileID profileWithBlocks.blocks map { BlockWithSize => (BlockWithSize.blockID, profileID) } } val blocks = blockIDProfileID.groupByKey().map { block => val blockID = block._1 val profilesID = block._2.toSet if (separatorIDs.isEmpty) { BlockDirty(blockID, Array(profilesID)) } else { BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs)) } } blocks.filter(_.getComparisonSize() > 0).map(x => x) } }
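Stripped of the project's data structures, profilesBlockToBlocks is an inversion of (block, profile) assignments via flatMap and groupByKey, keeping only blocks that allow at least one comparison. A bare-bones sketch of that step with plain tuples; the IDs are made up:

import org.apache.spark.{SparkConf, SparkContext}

object BlockInversionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("BlockInversionSketch"))
    // (blockID, profileID) pairs, as produced by the flatMap above
    val assignments = sc.parallelize(Seq((1L, 10L), (1L, 11L), (2L, 10L), (2L, 12L), (2L, 13L)))
    // group the profile IDs of each block and drop blocks with a single profile
    val blocks = assignments.groupByKey()
      .map { case (blockId, profileIds) => (blockId, profileIds.toSet) }
      .filter { case (_, profileIds) => profileIds.size > 1 }
    blocks.collect().foreach(println) // (1,Set(10, 11)), (2,Set(10, 12, 13))
    sc.stop()
  }
}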
Example 172
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.BlockRefinementMethods import SparkER.DataStructures.{BlockWithComparisonSize, ProfileBlocks} import SparkER.Utilities.BoundedPriorityQueue import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = { profilesWithBlocks map { profileWithBlocks => val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons) val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt val threshold = blocksSortedByComparisons(blocksToKeep-1).comparisons ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet) } } }
Example 173
Source File: SerializedObjectLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object SerializedObjectLoader extends WrapperTrait { def loadProfiles(filePath: String, startIDFrom: Long = 0, realFieldID: String = "", sourceId: Int = 0): RDD[Profile] = { @transient lazy val log = org.apache.log4j.LogManager.getRootLogger log.info("SPARKER - Start to loading entities") val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath) log.info("SPARKER - Loading ended") log.info("SPARKER - Start to generate profiles") val profiles: Array[Profile] = new Array(entities.size()) for (i <- 0 until entities.size()) { val profile = Profile(id = i + startIDFrom, originalID = i + "", sourceId = sourceId) val entity = entities.get(i) val it = entity.getAttributes.iterator() while (it.hasNext) { val attribute = it.next() profile.addAttribute(KeyValue(attribute.getName, attribute.getValue)) } profiles.update(i, profile) } log.info("SPARKER - Ended to loading profiles") log.info("SPARKER - Start to parallelize profiles") val sc = SparkContext.getOrCreate() sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray) } def loadGroundtruth(filePath: String): RDD[MatchingEntities] = { val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath) val matchingEntitites: Array[MatchingEntities] = new Array(groundtruth.size()) var i = 0 val it = groundtruth.iterator while (it.hasNext) { val matching = it.next() matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString)) i += 1 } val sc = SparkContext.getOrCreate() sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray) } }
Example 174
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import scala.collection.mutable.MutableList def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = { val attributes: MutableList[KeyValue] = new MutableList() for (i <- 0 to row.size - 1) { try { val value = row(i) val attributeKey = columnNames(i) if (value != null) { value match { case listOfAttributes: Iterable[Any] => listOfAttributes map { attributeValue => attributes += KeyValue(attributeKey, attributeValue.toString) } case stringAttribute: String => if (explodeInnerFields) { stringAttribute.split(innerSeparator) map { attributeValue => attributes += KeyValue(attributeKey, attributeValue) } } else { attributes += KeyValue(attributeKey, stringAttribute) } case singleAttribute => attributes += KeyValue(attributeKey, singleAttribute.toString) } } } catch { case e: Throwable => println(e) } } attributes } }
Example 175
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object SerializedProfilesLoader {

  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(fileName)
        cnfEx.printStackTrace()
      case ioex: IOException =>
        System.err.println(fileName)
        ioex.printStackTrace()
    }
    `object`
  }
}
Example 176
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {
    // Emit a (blockID, profileID) pair for every block each profile appears in
    val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks =>
      val profileID = profileWithBlocks.profileID
      profileWithBlocks.blocks map { blockWithSize =>
        (blockWithSize.blockID, profileID)
      }
    }

    // Group the profiles by block and rebuild dirty/clean blocks
    val blocks = blockIDProfileID.groupByKey().map { block =>
      val blockID = block._1
      val profilesID = block._2.toSet

      if (separatorIDs.isEmpty) {
        BlockDirty(blockID, Array(profilesID))
      } else {
        BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
      }
    }

    // Keep only blocks that generate at least one comparison
    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 177
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      // Sort each profile's blocks by their number of comparisons (ascending)
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      // Retain the r-fraction of blocks with the fewest comparisons
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
Example 178
Source File: SerializedObjectLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers import DataStructures.{KeyValue, MatchingEntities, Profile} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object SerializedObjectLoader extends WrapperTrait{ def loadProfiles(filePath : String, startIDFrom : Long = 0, realFieldID : String = "") : RDD[Profile] = { @transient lazy val log = org.apache.log4j.LogManager.getRootLogger log.info("SPARKER - Start to loading entities") val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath) log.info("SPARKER - Loading ended") log.info("SPARKER - Start to generate profiles") val profiles : Array[Profile] = new Array(entities.size()) for(i <- 0 to entities.size()-1){ val profile = Profile(id = i+startIDFrom, originalID = i+"") val entity = entities.get(i) val it = entity.getAttributes.iterator() while(it.hasNext){ val attribute = it.next() profile.addAttribute(KeyValue(attribute.getName, attribute.getValue)) } profiles.update(i, profile) } log.info("SPARKER - Ended to loading profiles") log.info("SPARKER - Start to parallelize profiles") val sc = SparkContext.getOrCreate() sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray) } def loadGroundtruth(filePath : String) : RDD[MatchingEntities] = { val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath) val matchingEntitites : Array[MatchingEntities] = new Array(groundtruth.size()) var i = 0 val it = groundtruth.iterator while(it.hasNext){ val matching = it.next() matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString)) i+=1 } val sc = SparkContext.getOrCreate() sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray) } }
Example 179
Source File: WrapperTrait.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

trait WrapperTrait {

  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 until row.size) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)
        if (value != null) {
          value match {
            case listOfAttributes: Iterable[Any] =>
              listOfAttributes map { attributeValue =>
                attributes += KeyValue(attributeKey, attributeValue.toString)
              }
            case stringAttribute: String =>
              if (explodeInnerFields) {
                stringAttribute.split(innerSeparator) map { attributeValue =>
                  attributes += KeyValue(attributeKey, attributeValue)
                }
              } else {
                attributes += KeyValue(attributeKey, stringAttribute)
              }
            case singleAttribute =>
              attributes += KeyValue(attributeKey, singleAttribute.toString)
          }
        }
      } catch {
        case e: Throwable => println(e)
      }
    }
    attributes
  }
}
Example 180
Source File: SerializedProfilesLoader.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object SerializedProfilesLoader {

  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(fileName)
        cnfEx.printStackTrace()
      case ioex: IOException =>
        System.err.println(fileName)
        ioex.printStackTrace()
    }
    `object`
  }
}
Example 181
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorID: Long = -1): RDD[BlockAbstract] = {
    // Emit a (blockID, profileID) pair for every block each profile appears in
    val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks =>
      val profileID = profileWithBlocks.profileID
      profileWithBlocks.blocks map { blockWithSize =>
        (blockWithSize.blockID, profileID)
      }
    }

    // Group the profiles by block and rebuild dirty/clean blocks
    val blocks = blockIDProfileID.groupByKey().map { block =>
      val blockID = block._1
      val profilesID = block._2.toSet

      if (separatorID < 0) {
        BlockDirty(blockID, (profilesID, Set.empty))
      } else {
        BlockClean(blockID, profilesID.partition(_ <= separatorID))
      }
    }

    // Keep only blocks that generate at least one comparison
    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 182
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      // Sort each profile's blocks by their number of comparisons (ascending)
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      // Retain the r-fraction of blocks with the fewest comparisons
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
Example 183
Source File: CNNModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package CNN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

class CNNModel extends Serializable { // constructor parameters are collapsed in this excerpt

  def Loss(predict: RDD[PredictCNNLabel]): Double = {
    // error and loss: accumulate the squared error and the sample count in a single pass
    val predict1 = predict.map(f => f.error)
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (error sum, count), v: error matrix of one sample
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (error sum, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = (loss2 / counte.toDouble) * 0.5
    Loss
  }
}
Example 184
Source File: NeuralNetModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package NN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

class NeuralNetModel extends Serializable { // constructor parameters are collapsed in this excerpt

  def Loss(predict: RDD[PredictNNLabel]): Double = {
    // error and loss: accumulate the squared error and the sample count in a single pass
    val predict1 = predict.map(f => f.error)
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (error sum, count), v: error matrix of one sample
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (error sum, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = loss2 / counte.toDouble
    Loss * 0.5
  }
}
Example 185
Source File: DBNModel.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package DBN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

class DBNModel(
    val config: DBNConfig,
    val dbn_W: Array[BDM[Double]],
    val dbn_b: Array[BDM[Double]],
    val dbn_c: Array[BDM[Double]]) extends Serializable {

  def dbnunfoldtonn(outputsize: Int): (Array[Int], Int, Array[BDM[Double]]) = {
    // 1. extend the layer sizes with the requested output layer
    val size = if (outputsize > 0) {
      val size1 = config.size
      val size2 = ArrayBuffer[Int]()
      size2 ++= size1
      size2 += outputsize
      size2.toArray
    } else config.size
    val layer = if (outputsize > 0) config.layer + 1 else config.layer

    // 2. merge the bias terms dbn_c with the weights dbn_W to form the initial NN weights
    var initW = ArrayBuffer[BDM[Double]]()
    for (i <- 0 to dbn_W.length - 1) {
      initW += BDM.horzcat(dbn_c(i), dbn_W(i))
    }
    (size, layer, initW.toArray)
  }
}
Example 186
Source File: StringKeyRDD.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import java.nio.charset.StandardCharsets.UTF_8 import com.kakao.mango.concurrent._ import com.kakao.mango.couchbase.Couchbase import com.kakao.mango.hbase.HBase import com.kakao.mango.json._ import com.kakao.mango.util.Retry import org.apache.spark.rdd.RDD import scala.concurrent.duration._ class StringKeyRDD[T](rdd: RDD[(String, T)]) extends SaveToES(rdd) { def saveToCouchbase(nodes: Seq[String], bucket: String, expiry: Int = 0, maxRate: Double = 1e7, password: String = null): Unit = { // rate per executor val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size rdd.foreachPartition { partition => // BackPressureException may happen, so retry 10 times // if that fails, Spark task scheduler may retry again. val cluster = Couchbase(nodes: _*) val client = cluster.bucket(bucket, password) val converted = partition.map { case (key, value: Array[Byte]) => (key, new String(value, UTF_8)) case (key, value: String) => (key, value) case (key, value) => (key, toJson(value)) } for (group <- converted.grouped(1000)) { Retry(10, 100.millis) { client.putAll(group, rate, expiry).sync() } } cluster.disconnect() } } def saveToHBase(quorum: String, table: String, family: String, qualifier: String, maxRate: Double = 1e7): Unit = { // rate per executor val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size rdd.foreachPartition { partition => val hbase = HBase(quorum) val column = hbase.column(table, family, qualifier) val converted = partition.map { case (key, value: Array[Byte]) => (key.getBytes(UTF_8), value) case (key, value: String) => (key.getBytes(UTF_8), value.getBytes(UTF_8)) case (key, value) => (key.getBytes(UTF_8), serialize(value)) } for (group <- converted.grouped(1000)) { Retry(10, 100.millis) { column.putAllBytes(group, rate).sync() } } } } }
Example 187
Source File: HBaseReaders.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import com.kakao.mango.util.Conversions._ import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import scala.collection.JavaConversions._ trait HBaseReaders { val sc: SparkContext def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = { hbaseTableBinary(quorum, table).map { case (rowkey, ((family, qualifier), (timestamp, value))) => (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string))) } } def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = { hbaseTableBinary(quorum, table).collect { case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell) } } def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = { hbaseTable(quorum, table).collect { case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell) } } }
Example 188
Source File: JoinableRDD.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions) def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = { val partitioner = new HashPartitioner(numPartitions) val grouped = rdd cogroup other val left = grouped.flatMap{ case (k, (vs, ws)) => vs.zipWithIndex.map { case (v, idx) => ((k, idx), v) } }.partitionBy(partitioner) val right = grouped.flatMap { case (k, (vs, ws)) => ws.map { w => ((k, w.hashCode()), (w, vs.size)) } }.partitionBy(partitioner).flatMap { case ((k, r), (w, size)) => (0 until size).map(i => ((k, w), i)) }.map { case ((k, w), idx) => ((k, idx), w) } (left join right).map { case ((k, idx), (v, w)) => (k, (v, w)) } } }
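A minimal usage sketch for the fastJoin helper above; the object name JoinableRDDExample and the sample pair RDDs are illustrative, and in the real project an implicit conversion may wrap the pair RDD instead of the explicit constructor call shown here.
import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.JoinableRDD

object JoinableRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("fastJoin-sketch"))

    val clicks    = sc.parallelize(Seq("u1" -> "home", "u1" -> "search", "u2" -> "home"))
    val purchases = sc.parallelize(Seq("u1" -> 42.0, "u2" -> 7.5))

    // Joins by key while spreading each key's values across partitions to reduce skew
    val joined = new JoinableRDD(clicks).fastJoin(purchases, numPartitions = 4)
    joined.collect().foreach(println) // e.g. (u1,(home,42.0)), (u1,(search,42.0)), (u2,(home,7.5))

    sc.stop()
  }
}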
Example 189
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}

import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

// The enclosing abstract class is collapsed in this excerpt; its signature below is
// inferred from the concrete subclasses that follow.
abstract class SavingStream[T](@transient stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends Serializable {
  import SavingStream._

  // Each subclass defines how an RDD of its element type becomes a DataFrame
  def toDF(rdd: RDD[T]): DataFrame

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }
}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
}
Example 190
Source File: MemsqlRDD.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, PreparedStatement, ResultSet} import com.memsql.spark.SQLGen.VariableList import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} case class MemsqlRDD(query: String, variables: VariableList, options: MemsqlOptions, schema: StructType, expectedOutput: Seq[Attribute], @transient val sc: SparkContext) extends RDD[Row](sc, Nil) { override protected def getPartitions: Array[Partition] = MemsqlQueryHelpers.GetPartitions(options, query, variables) override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = { var closed = false var rs: ResultSet = null var stmt: PreparedStatement = null var conn: Connection = null var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition] def tryClose(name: String, what: AutoCloseable): Unit = { try { if (what != null) { what.close() } } catch { case e: Exception => logWarning(s"Exception closing $name", e) } } def close(): Unit = { if (closed) { return } tryClose("resultset", rs) tryClose("statement", stmt) tryClose("connection", conn) closed = true } context.addTaskCompletionListener { context => close() } conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)() stmt = conn.prepareStatement(partition.query) JdbcHelpers.fillStatement(stmt, partition.variables) rs = stmt.executeQuery() var rowsIter = JdbcUtils.resultSetToRows(rs, schema) if (expectedOutput.nonEmpty) { val schemaDatatypes = schema.map(_.dataType) val expectedDatatypes = expectedOutput.map(_.dataType) if (schemaDatatypes != expectedDatatypes) { val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map { case ((_: StringType, _: NullType), _) => ((_: Row) => null) case ((_: ShortType, _: BooleanType), i) => ((r: Row) => r.getShort(i) != 0) case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0) case ((_: LongType, _: BooleanType), i) => ((r: Row) => r.getLong(i) != 0) case ((l, r), i) => { options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}") ((r: Row) => r.get(i)) } } rowsIter = rowsIter .map(row => Row.fromSeq(columnEncoders.map(_(row)))) } } CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close) } }
Example 191
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 192
Source File: LRUtils.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr.Utils import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object LRUtils { def bytes2Int (buffer: Array[Byte], pos: Int): (Int, Int) = { var result: Int = 0 var position: Int = pos var byte = buffer(pos) var shiftNum = 0 while ((byte & 0x80) != 0) { result = result | ((byte & 0x7F)<<shiftNum) position += 1 byte = buffer(position) shiftNum += 7 } result = result | ((byte & 0x7F)<<shiftNum) (result, position) } //featureId cached in X is localId def loadFileAsMatrix( sc: SparkContext, path: String, minPartitions: Int): RDD[(Array[Double], Matrix)] = { val lines = sc.textFile(path, minPartitions) .map(_.trim) .filter(line => !(line.isEmpty || line.startsWith("#"))) val data = lines.mapPartitions { samples => val labels = new PrimitiveVector[Double]() val builder = new MatrixBuilder() samples.foreach { line => val items = line.split(' ') labels += items.head.toDouble val featureIdAndValues = items.tail.filter(_.nonEmpty) val indices = new PrimitiveVector[Int]() val values = new PrimitiveVector[Float]() featureIdAndValues.foreach { item => val featureAndValue = item.split(":") indices += featureAndValue(0).toInt val value = featureAndValue(1).toFloat values += value } builder.add(new SparseVector(indices.trim.array, values.trim.array)) } Iterator((labels.trim.array, builder.toMatrix)) } data } }
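A small sketch of the variable-length integer decoding done by bytes2Int above; the object name VarIntDecodeExample and the hand-built buffer are illustrative.
import org.apache.spark.mllib.sparselr.Utils.LRUtils

object VarIntDecodeExample {
  def main(args: Array[String]): Unit = {
    // 300 is stored as two 7-bit groups, least-significant first, with the high bit marking "more bytes":
    // 300 = 0b10_0101100  ->  0xAC (0b1010_1100), then 0x02
    val buffer = Array[Byte](0xAC.toByte, 0x02)
    val (value, lastPos) = LRUtils.bytes2Int(buffer, 0)
    println(s"decoded value = $value, last byte index = $lastPos") // decoded value = 300, last byte index = 1
  }
}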
Example 193
Source File: LogisticRegression.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap import org.apache.spark.mllib.sparselr.Utils._ import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object LogisticRegression { def train(input: RDD[(Array[Double], Matrix)], optimizer: Optimizer ): (Array[Int], Array[Double]) = { val hdfsIndex2global = new Int2IntOpenHashMap() var index = 0 input.map { point => point._2 match { case x: CompressedSparseMatrix => println("x.length" + x.mappings.length) case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.count val global2hdfsIndex = input.map { point => point._2 match { case x: CompressedSparseMatrix => x.mappings case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.collect().flatMap(t => t).distinct global2hdfsIndex.foreach{value => hdfsIndex2global.put(value, index) index += 1 } val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global) val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache() val numTraining = examples.count() println(s"Training: $numTraining.") SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true) val examplesTest = examples.mapPartitions(_.flatMap { case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)}) val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size)) val newWeights = optimizer.optimize(examplesTest, weights) ((global2hdfsIndex, newWeights.toArray)) } //globalId to localId for mappings in Matrix def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap]) (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = { val hdfsIndex2global = bchdfsIndex2global.value partition._2 match { case x: CompressedSparseMatrix => val local2hdfsIndex = x.mappings for (i <- 0 until local2hdfsIndex.length) { local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i)) } case _ => throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.") } partition } }
Example 194
Source File: OneWayANOVA.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package main.ANOVA

import org.apache.commons.math3.distribution.FDistribution
import org.apache.spark.rdd.RDD

object OneWayANOVA {

  def anovaPValue(categoryData: Iterable[RDD[Double]]): Double = {
    val anovaStats = getAnovaStats(categoryData)
    // pass a null rng to avoid unneeded overhead, as we never sample from this distribution
    val fdist: FDistribution = new FDistribution(null, anovaStats.dfbg, anovaStats.dfwg)
    1.0 - fdist.cumulativeProbability(anovaStats.F)
  }

  private case class ANOVAStats(dfbg: Double, dfwg: Double, F: Double)

  private def getAnovaStats(categoryData: Iterable[RDD[Double]]): ANOVAStats = {
    var dfwg: Long = 0
    var sswg: Double = 0
    var totsum: Double = 0
    var totsumsq: Double = 0
    var totnum: Long = 0
    for (data <- categoryData) {
      val sum: Double = data.sum()
      val sumsq: Double = data.map(i => i * i).sum()
      val num = data.count()
      totnum += num
      totsum += sum
      totsumsq += sumsq
      dfwg += num - 1
      val ss: Double = sumsq - ((sum * sum) / num)
      sswg += ss
    }
    val sst: Double = totsumsq - ((totsum * totsum) / totnum)
    val ssbg: Double = sst - sswg
    val dfbg: Int = categoryData.size - 1
    val msbg: Double = ssbg / dfbg
    val mswg: Double = sswg / dfwg
    val F: Double = msbg / mswg
    ANOVAStats(dfbg, dfwg, F)
  }
}
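A minimal usage sketch for the anovaPValue helper above; the object name AnovaExample, the local[*] master, and the sample values are illustrative and not part of the original project.
import org.apache.spark.{SparkConf, SparkContext}
import main.ANOVA.OneWayANOVA

object AnovaExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("anova-sketch"))

    // Three groups with clearly different means should give a small p-value.
    val groupA = sc.parallelize(Seq(1.0, 1.2, 0.9, 1.1))
    val groupB = sc.parallelize(Seq(2.0, 2.1, 1.9, 2.2))
    val groupC = sc.parallelize(Seq(3.1, 2.9, 3.0, 3.2))

    val p = OneWayANOVA.anovaPValue(Seq(groupA, groupB, groupC))
    println(s"one-way ANOVA p-value: $p")

    sc.stop()
  }
}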
Example 195
Source File: TwoSampleIndependentTTest.scala From StatisticsOnSpark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.rdd.RDD

class TwoSampleIndependentTTest {

  def tTest(sample1: RDD[Double], sample2: RDD[Double]): Double = {
    val n1 = sample1.count()
    val n2 = sample2.count()
    val m1 = sample1.sum() / n1
    val m2 = sample2.sum() / n2
    val v1 = sample1.map(d => (d - m1) * (d - m1)).sum() / (n1 - 1)
    val v2 = sample2.map(d => (d - m2) * (d - m2)).sum() / (n2 - 1)
    val t: Double = math.abs((m1 - m2) / FastMath.sqrt((v1 / n1) + (v2 / n2)))
    // Welch-Satterthwaite approximation of the degrees of freedom
    val degreesOfFreedom: Double = (((v1 / n1) + (v2 / n2)) * ((v1 / n1) + (v2 / n2))) /
      ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / (n2 * n2 * (n2 - 1d)))
    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val distribution: TDistribution = new TDistribution(null, degreesOfFreedom)
    2.0 * distribution.cumulativeProbability(-t)
  }
}
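A minimal usage sketch for the tTest helper above, assuming it is on the classpath as shown; the object name TTestExample and the sample values are illustrative.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.TwoSampleIndependentTTest

object TTestExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ttest-sketch"))

    val control   = sc.parallelize(Seq(4.9, 5.1, 5.0, 4.8, 5.2))
    val treatment = sc.parallelize(Seq(5.4, 5.6, 5.3, 5.5, 5.7))

    // Two-sided p-value; a small value suggests the group means differ
    val p = new TwoSampleIndependentTTest().tTest(control, treatment)
    println(s"two-sided p-value: $p")

    sc.stop()
  }
}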
Example 196
Source File: EtlProcessor.scala From etl-light with MIT License | 5 votes |
package yamrcraft.etlite.processors import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata import kafka.serializer.DefaultDecoder import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.kafka._ import org.slf4j.LoggerFactory import yamrcraft.etlite.Settings import yamrcraft.etlite.state.{KafkaOffsetsState, KafkaStateManager} import yamrcraft.etlite.transformers.InboundMessage object EtlProcessor { val logger = LoggerFactory.getLogger(this.getClass) def run(settings: Settings) = { val context = createContext(settings) val stateManager = new KafkaStateManager(settings.etl.state) val lastState = stateManager.readState logger.info(s"last persisted state: $lastState") val currState = stateManager.fetchNextState(lastState, settings) logger.info(s"batch working state: $currState") val rdd = createRDD(context, currState, settings) processRDD(rdd, currState.jobId, settings) logger.info("committing state") stateManager.commitState(currState) } private def createContext(settings: Settings) = { val sparkConf = new SparkConf() .setAppName(settings.spark.appName) .setAll(settings.spark.conf) new SparkContext(sparkConf) } private def createRDD(context: SparkContext, state: KafkaOffsetsState, settings: Settings): RDD[InboundMessage] = { KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, InboundMessage]( context, settings.kafka.properties, state.ranges.toArray, Map[TopicAndPartition, Broker](), (msgAndMeta: MessageAndMetadata[Array[Byte], Array[Byte]]) => { InboundMessage(msgAndMeta.topic, msgAndMeta.key(), msgAndMeta.message()) } ) } private def processRDD(kafkaRDD: RDD[InboundMessage], jobId: Long, settings: Settings) = { // passed to remote workers val etlSettings = settings.etl logger.info(s"RDD processing started [rdd=${kafkaRDD.id}, jobId=$jobId]") val rdd = settings.etl.maxNumOfOutputFiles.map(kafkaRDD.coalesce(_)).getOrElse(kafkaRDD) rdd.foreachPartition { partition => // executed at the worker new PartitionProcessor(jobId, TaskContext.get.partitionId(), etlSettings) .processPartition(partition) } logger.info(s"RDD processing ended [rdd=${kafkaRDD.id}, jobId=$jobId]") } }
Example 197
Source File: YahooParser.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.parsers import com.cloudera.sparkts.TimeSeries import com.cloudera.sparkts.TimeSeries._ import java.time._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD object YahooParser { def yahooStringToTimeSeries( text: String, keyPrefix: String = "", zone: ZoneId = ZoneId.systemDefault()) : TimeSeries[String] = { val lines = text.split('\n') val labels = lines(0).split(',').tail.map(keyPrefix + _) val samples = lines.tail.map { line => val tokens = line.split(',') val dt = LocalDate.parse(tokens.head).atStartOfDay(zone) (dt, tokens.tail.map(_.toDouble)) }.reverse timeSeriesFromIrregularSamples(samples, labels, zone) } def yahooFiles( dir: String, sc: SparkContext, zone: ZoneId = ZoneId.systemDefault()) : RDD[TimeSeries[String]] = { sc.wholeTextFiles(dir).map { case (path, text) => YahooParser.yahooStringToTimeSeries(text, path.split('/').last, zone) } } }
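A minimal usage sketch for yahooStringToTimeSeries above, assuming the spark-timeseries library is on the classpath; the object name YahooParserExample and the tiny CSV string are illustrative.
import java.time.ZoneId
import com.cloudera.sparkts.parsers.YahooParser

object YahooParserExample {
  def main(args: Array[String]): Unit = {
    // A tiny Yahoo-style CSV: header row, then newest-first daily rows.
    val csv =
      """Date,Close
        |2015-01-02,105.0
        |2015-01-01,100.0""".stripMargin

    val ts = YahooParser.yahooStringToTimeSeries(csv, keyPrefix = "AAPL_", zone = ZoneId.of("UTC"))
    println(ts) // a TimeSeries[String] keyed by "AAPL_Close", with samples ordered oldest first
  }
}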
Example 198
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.streaming.datasource.config.ParametersUtils import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator} import org.apache.spark.{Logging, Partition, TaskContext} private[datasource] class DatasourceRDD( @transient sqlContext: SQLContext, inputSentences: InputSentences, datasourceParams: Map[String, String] ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils { private var totalCalculated: Option[Long] = None private val InitTableName = "initTable" private val LimitedTableName = "limitedTable" private val TempInitQuery = s"select * from $InitTableName" val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset => val parsedQuery = parseInitialQuery val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery) val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty) val limitSentence = inputSentences.extractLimitSentence sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence) } private def parseInitialQuery: String = { if (inputSentences.query.toUpperCase.contains("WHERE") || inputSentences.query.toUpperCase.contains("ORDER") || inputSentences.query.toUpperCase.contains("LIMIT") ) { sqlContext.sql(inputSentences.query).registerTempTable(InitTableName) TempInitQuery } else inputSentences.query } def progressInputSentences: InputSentences = { if (!dataFrame.rdd.isEmpty()) { inputSentences.offsetConditions.fold(inputSentences) { case offset => val offsetValue = if (offset.limitRecords.isEmpty) dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) else { dataFrame.registerTempTable(LimitedTableName) val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " + s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1" sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) } inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy( value = Option(offsetValue), operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator))))) } } else inputSentences } override def isEmpty(): Boolean = { totalCalculated.fold { withScope { partitions.length == 0 || take(1).length == 0 } } { total => total == 0L } } override def getPartitions: Array[Partition] = dataFrame.rdd.partitions override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context) override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart) }
Example 199
Source File: JsonInputStreamQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import scala.collection.mutable.SynchronizedQueue import scala.io.Source import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} object JsonInputStreamQuery { def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val streamSqlContext = new StreamSQLContext(ssc, new SQLContext(sc)) import streamSqlContext._ // Here we read data line by line from a given file and then put it into a queue DStream. // You can replace any kind of String type DStream here including kafka DStream. val queue = new SynchronizedQueue[RDD[String]]() Source.fromFile("src/main/resources/student.json").getLines().foreach(msg => queue.enqueue(sc.parallelize(List(msg)))) val queueDStream = ssc.queueStream[String](queue) // We can infer the schema of json automatically by using inferJsonSchema val schema = streamSqlContext.inferJsonSchema("src/main/resources/student.json") streamSqlContext.registerDStreamAsTable( streamSqlContext.jsonDStream(queueDStream, schema), "jsonTable") sql("SELECT * FROM jsonTable").print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 200
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }