com.holdenkarau.spark.testing.SharedSparkContext Scala Examples
The following examples show how to use com.holdenkarau.spark.testing.SharedSparkContext.
Each example lists its source file, the project it comes from, and that project's license, so you can trace it back to the original code.
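All of the suites below follow the same basic pattern: mix SharedSparkContext into a ScalaTest suite and use the SparkContext it exposes as sc inside each test. The trait starts a single local SparkContext before the first test of a suite and stops it after the last one, so the tests avoid paying the context start-up cost repeatedly. The minimal sketch below illustrates the pattern; the suite name and the word-count logic are invented for this illustration and are not taken from any of the projects listed here.

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

// Minimal illustrative suite: SharedSparkContext supplies `sc`, a local
// SparkContext shared by every test in the suite.
class WordCountSpec extends FunSuite with SharedSparkContext {

  test("counts words in a small RDD") {
    val lines = sc.parallelize(Seq("hello world", "hello spark"))
    val counts = lines
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collectAsMap()

    assert(counts("hello") === 2)
    assert(counts("world") === 1)
  }
}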
Example 1
Source File: JoinOrderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.rangejoins import java.io.{OutputStreamWriter, PrintWriter} import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType } import org.bdgenomics.utils.instrumentation.{ Metrics, MetricsListener, RecordedMetrics } import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.scalatest.{BeforeAndAfter, FunSuite} class JoinOrderTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { val schema = StructType( Seq(StructField("chr", StringType), StructField("start", IntegerType), StructField("end", IntegerType))) val metricsListener = new MetricsListener(new RecordedMetrics()) val writer = new PrintWriter(new OutputStreamWriter(System.out)) before { System.setSecurityManager(null) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim( spark) :: Nil Metrics.initialize(sc) val rdd1 = sc .textFile(getClass.getResource("/refFlat.txt.bz2").getPath) .map(r => r.split('\t')) .map( r => Row( r(2).toString, r(4).toInt, r(5).toInt )) val ref = spark.createDataFrame(rdd1, schema) ref.createOrReplaceTempView("ref") val rdd2 = sc .textFile(getClass.getResource("/snp150Flagged.txt.bz2").getPath) .map(r => r.split('\t')) .map( r => Row( r(1).toString, r(2).toInt, r(3).toInt )) val snp = spark .createDataFrame(rdd2, schema) snp.createOrReplaceTempView("snp") } test("Join order - broadcasting snp table") { spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true") val query = s""" |SELECT snp.*,ref.* FROM ref JOIN snp |ON (ref.chr=snp.chr AND snp.end>=ref.start AND snp.start<=ref.end) """.stripMargin assert(spark.sql(query).count === 616404L) } test("Join order - broadcasting ref table") { spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true") val query = s""" |SELECT snp.*,ref.* FROM snp JOIN ref |ON (ref.chr=snp.chr AND snp.end>=ref.start AND snp.start<=ref.end) """.stripMargin assert(spark.sql(query).count === 616404L) } after { Metrics.print(writer, Some(metricsListener.metrics.sparkMetrics.stageTimes)) writer.flush() Metrics.stopRecording() } }
Example 2
Source File: PackageSpec.scala From Spark2Elasticsearch with Apache License 2.0
package com.github.jparkie.spark.elasticsearch.sql

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.scalatest.{ MustMatchers, WordSpec }

class PackageSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.elasticsearch.sql" must {
    "lift DataFrame into SparkEsDataFrameFunctions" in {
      val sqlContext = new SQLContext(sc)

      val inputData = Seq(
        ("TEST_VALUE_1", 1),
        ("TEST_VALUE_2", 2),
        ("TEST_VALUE_3", 3)
      )

      val outputDataFrame = sqlContext.createDataFrame(inputData)
        .toDF("key", "value")

      // If sparkContext is available, DataFrame was lifted into SparkEsDataFrameFunctions.
      outputDataFrame.sparkContext
    }
  }
}
Example 3
Source File: AnomalyDetection$Test.scala From spark-anomaly-detection with MIT License
package com.micvog.ml import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalactic.Equality import org.scalatest.{FlatSpec, FunSuite, Matchers} class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext { { val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062)) val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362)) val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878)) "probFunction" should "return correct product value" in { val p = AnomalyDetection.probFunction(point, means, variances) assert(p === 0.0769984879544 +- 0.0001) } "predict" should "predict the anomaly" in { assert(!AnomalyDetection.predict(point, means, variances, 0.05)) } "predict" should "predict non anomaly" in { assert(AnomalyDetection.predict(point, means, variances, 0.08)) } } private def vectorequality() = { new Equality[Vector] { def areEqual(a: Vector, b: Any): Boolean = b match { case v: Vector => v.toArray.zip(a.toArray).map(pair => pair._1 === pair._2 +- 0.001).reduce((a, b) => a && b) case _ => false } } } def trainModel(): AnomalyDetectionModel = { val trainingExamplesFilePath = "./src/test/resources/training.csv" val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache() val trainingRdd = FeaturesParser.parseFeatures(trainingData) new AnomalyDetection().run(trainingRdd) } "run" should "return model with correct mean and variance" in { val model: AnomalyDetectionModel = trainModel() //use scalactic's more relaxing equality implicit val vectorEq = vectorequality() assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755))) assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375))) } "optimize" should "calculate epsilon and F1 score" in { val cvFilePath = "./src/test/resources/cross_val.csv" val cvData = sc.textFile(cvFilePath, 2).cache() val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val model = trainModel() val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model) assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001) } }
Example 4
Source File: BEDBaseTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.base

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

class BEDBaseTestSuite extends FunSuite with DataFrameSuiteBase with SharedSparkContext with BeforeAndAfter {

  val bedPath: String = getClass.getResource("/bed/test.bed").getPath
  val tableNameBED = "targets"
  val bedSimplePath: String = getClass.getResource("/bed/simple.bed").getPath
  val tableNameSimpleBED = "simple_targets"

  before {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameBED")
    spark.sql(s"""
                 |CREATE TABLE $tableNameBED
                 |USING org.biodatageeks.sequila.datasources.BED.BEDDataSource
                 |OPTIONS(path "$bedPath")
                 |
      """.stripMargin)

    spark.sql(s"DROP TABLE IF EXISTS $tableNameSimpleBED")
    spark.sql(s"""
                 |CREATE TABLE $tableNameSimpleBED
                 |USING org.biodatageeks.sequila.datasources.BED.BEDDataSource
                 |OPTIONS(path "$bedSimplePath")
                 |
      """.stripMargin)
  }

  // BeforeAndAfter cleanup hook: drop the test tables after each test.
  after {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameBED")
    spark.sql(s"DROP TABLE IF EXISTS $tableNameSimpleBED")
  }
}
Example 5
Source File: BEDReaderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.BEDBaseTestSuite
import org.biodatageeks.sequila.utils.SequilaRegister

class BEDReaderTestSuite extends BEDBaseTestSuite with SharedSparkContext {

  test("Read BED file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameBED}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "22")
    assert(res.getInt(1) === 1000 + 1) //test 1-based
    assert(res.getInt(2) === 5000)
    assert(res.getString(5) === "+")
    assert(res.getAs[Array[Int]](10) === Array(567, 488))
  }

  test("Read Simple BED file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameSimpleBED}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "11")
    assert(res.getInt(1) === 1000 + 1) //test 1-based
    assert(res.getInt(2) === 5000)
    assert(res.getString(3) === null)
  }
}
Example 6
Source File: FASTQReaderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.FASTQBaseTestSuite
import org.biodatageeks.sequila.utils.SequilaRegister

class FASTQReaderTestSuite extends FASTQBaseTestSuite with SharedSparkContext {

  test("Read FASTQ file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameFASTQ}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "NA12988")
    assert(res.getBoolean(8) === false)
    assert(res.getString(11) == "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT")
    assert(res.getString(12) == "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65")
  }
}
Example 7
Source File: VCFDataSourceTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.biodatageeks.sequila.utils.Columns
import org.scalatest.{BeforeAndAfter, FunSuite}

class VCFDataSourceTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext {

  val vcfPath: String = getClass.getResource("/vcf/test.vcf").getPath
  val tableNameVCF = "variants"

  before {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameVCF")
    spark.sql(s"""
                 |CREATE TABLE $tableNameVCF
                 |USING org.biodatageeks.sequila.datasources.VCF.VCFDataSource
                 |OPTIONS(path "$vcfPath")
                 |
      """.stripMargin)
  }

  test("VCF - Row count VCFDataSource") {
    val query = s"SELECT * FROM $tableNameVCF"
    spark
      .sql(query)
      .printSchema()

    assert(
      spark
        .sql(query)
        .first()
        .getString(0) === "20")

    assert(spark.sql(query).count() === 7L)
  }

  after {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameVCF")
  }
}
Example 8
Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.pileup import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType} import org.scalatest.{BeforeAndAfter, FunSuite} class PileupTestBase extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext{ val sampleId = "NA12878.multichrom.md" val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath val tableName = "reads_bam" val tableNameCRAM = "reads_cram" val schema: StructType = StructType( List( StructField("contig", StringType, nullable = true), StructField("position", IntegerType, nullable = true), StructField("reference", StringType, nullable = true), StructField("coverage", ShortType, nullable = true), StructField("pileup", StringType, nullable = true), StructField("quality", StringType, nullable = true) ) ) before { System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator") spark .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql( s""" |CREATE TABLE $tableName |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM") spark.sql( s""" |CREATE TABLE $tableNameCRAM |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource |OPTIONS(path "$cramPath", refPath "$referencePath" ) | """.stripMargin) val mapToString = (map: Map[Byte, Short]) => { if (map == null) "null" else map.map({ case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":") } val byteToString = ((byte: Byte) => byte.toString) spark.udf.register("mapToString", mapToString) spark.udf.register("byteToString", byteToString) } }
Example 9
Source File: FeatureCountsTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.rangejoins import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import htsjdk.samtools.ValidationStringency import org.apache.hadoop.io.LongWritable import org.biodatageeks.sequila.apps.FeatureCounts.Region import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs} import org.scalatest.{BeforeAndAfter, FunSuite} import org.seqdoop.hadoop_bam.util.SAMHeaderReader import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable} class FeatureCountsTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { before { System.setSecurityManager(null) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim( spark) :: Nil } test("Feature counts for chr1:20138-20294") { val query = s""" | SELECT count(*),targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | FROM reads JOIN targets |ON ( | targets.${Columns.CONTIG}=reads.${Columns.CONTIG} | AND | reads.${Columns.END} >= targets.${Columns.START} | AND | reads.${Columns.START} <= targets.${Columns.END} |) | GROUP BY targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | HAVING ${Columns.CONTIG}='1' AND ${Columns.START} = 20138 AND ${Columns.END} = 20294""".stripMargin spark.sparkContext.hadoopConfiguration.set( SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) val alignments = spark.sparkContext .newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]( getClass.getResource("/NA12878.slice.bam").getPath) .map(_._2.get) .map(r => Region(DataQualityFuncs.cleanContig(r.getContig), r.getStart, r.getEnd)) val reads = spark.sqlContext .createDataFrame(alignments) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) reads.createOrReplaceTempView("reads") val targets = spark.sqlContext .createDataFrame(Array(Region("1", 20138, 20294))) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) targets.createOrReplaceTempView("targets") spark.sql(query).explain(false) assert(spark.sql(query).first().getLong(0) === 1484L) } }
Example 10
Source File: SparkEsBulkWriterSpec.scala From Spark2Elasticsearch with Apache License 2.0
package com.github.jparkie.spark.elasticsearch

import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.sql.{ SparkEsDataFrameMapper, SparkEsDataFrameSerializer }
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }
import org.scalatest.{ MustMatchers, WordSpec }

class SparkEsBulkWriterSpec extends WordSpec with MustMatchers with SharedSparkContext {
  val esServer = new ElasticSearchServer()

  override def beforeAll(): Unit = {
    super.beforeAll()

    esServer.start()
  }

  override def afterAll(): Unit = {
    esServer.stop()

    super.afterAll()
  }

  "SparkEsBulkWriter" must {
    "execute write() successfully" in {
      esServer.createAndWaitForIndex("test_index")

      val sqlContext = new SQLContext(sc)

      val inputSparkEsWriteConf = SparkEsWriteConf(
        bulkActions = 10,
        bulkSizeInMB = 1,
        concurrentRequests = 0,
        flushTimeoutInSeconds = 1
      )
      val inputMapperConf = SparkEsMapperConf(
        esMappingId = Some("id"),
        esMappingParent = None,
        esMappingVersion = None,
        esMappingVersionType = None,
        esMappingRouting = None,
        esMappingTTLInMillis = None,
        esMappingTimestamp = None
      )
      val inputSchema = StructType(
        Array(
          StructField("id", StringType, true),
          StructField("parent", StringType, true),
          StructField("version", LongType, true),
          StructField("routing", StringType, true),
          StructField("ttl", LongType, true),
          StructField("timestamp", StringType, true),
          StructField("value", LongType, true)
        )
      )
      val inputData = sc.parallelize {
        Array(
          Row("TEST_ID_1", "TEST_PARENT_1", 1L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 1L),
          Row("TEST_ID_1", "TEST_PARENT_2", 2L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 2L),
          Row("TEST_ID_1", "TEST_PARENT_3", 3L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 3L),
          Row("TEST_ID_1", "TEST_PARENT_4", 4L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 4L),
          Row("TEST_ID_1", "TEST_PARENT_5", 5L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 5L),
          Row("TEST_ID_5", "TEST_PARENT_6", 6L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 6L),
          Row("TEST_ID_6", "TEST_PARENT_7", 7L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 7L),
          Row("TEST_ID_7", "TEST_PARENT_8", 8L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 8L),
          Row("TEST_ID_8", "TEST_PARENT_9", 9L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 9L),
          Row("TEST_ID_9", "TEST_PARENT_10", 10L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 10L),
          Row("TEST_ID_10", "TEST_PARENT_11", 11L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 11L)
        )
      }
      val inputDataFrame = sqlContext.createDataFrame(inputData, inputSchema)
      val inputDataIterator = inputDataFrame.rdd.toLocalIterator
      val inputSparkEsBulkWriter = new SparkEsBulkWriter[Row](
        esIndex = "test_index",
        esType = "test_type",
        esClient = () => esServer.client,
        sparkEsSerializer = new SparkEsDataFrameSerializer(inputSchema),
        sparkEsMapper = new SparkEsDataFrameMapper(inputMapperConf),
        sparkEsWriteConf = inputSparkEsWriteConf
      )

      inputSparkEsBulkWriter.write(null, inputDataIterator)

      val outputGetResponse = esServer.client.prepareGet("test_index", "test_type", "TEST_ID_1").get()

      outputGetResponse.isExists mustEqual true
      outputGetResponse.getSource.get("parent").asInstanceOf[String] mustEqual "TEST_PARENT_5"
      outputGetResponse.getSource.get("version").asInstanceOf[Integer] mustEqual 5
      outputGetResponse.getSource.get("routing").asInstanceOf[String] mustEqual "TEST_ROUTING_1"
      outputGetResponse.getSource.get("ttl").asInstanceOf[Integer] mustEqual 86400000
      outputGetResponse.getSource.get("timestamp").asInstanceOf[String] mustEqual "TEST_TIMESTAMP_1"
      outputGetResponse.getSource.get("value").asInstanceOf[Integer] mustEqual 5
    }
  }
}
Example 11
Source File: LongReadsTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.coverage import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{SequilaSession, SparkSession} import org.biodatageeks.sequila.utils.{Columns, InternalParams, SequilaRegister} import org.scalatest.{BeforeAndAfter, FunSuite} class LongReadsTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { val bamPath: String = getClass.getResource("/nanopore_guppy_slice.bam").getPath val splitSize = 30000 val tableNameBAM = "reads" before { System.setSecurityManager(null) spark.sql(s"DROP TABLE IF EXISTS $tableNameBAM") spark.sql(s""" |CREATE TABLE $tableNameBAM |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) } test("BAM - Nanopore with guppy basecaller") { val session: SparkSession = SequilaSession(spark) SequilaRegister.register(session) session.sparkContext .setLogLevel("WARN") val bdg = session.sql(s"SELECT * FROM ${tableNameBAM}") assert(bdg.count() === 150) } test("BAM - coverage - Nanopore with guppy basecaller") { spark.sqlContext.setConf(InternalParams.InputSplitSize, (splitSize * 10).toString) val session2: SparkSession = SequilaSession(spark) SequilaRegister.register(session2) val query = s"""SELECT ${Columns.CONTIG}, ${Columns.START}, ${Columns.COVERAGE} FROM bdg_coverage('$tableNameBAM','nanopore_guppy_slice','bases') order by ${Columns.CONTIG},${Columns.START},${Columns.END} """.stripMargin val covMultiPartitionDF = session2.sql(query) //covMultiPartitionDF.coalesce(1).write.mode("overwrite").option("delimiter", "\t").csv("/Users/aga/workplace/multiPart.csv") assert(covMultiPartitionDF.count() == 45620) // total count check 45620<---> 45842 assert(covMultiPartitionDF.filter(s"${Columns.COVERAGE}== 0").count == 0) assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5010515") .first() .getShort(2) == 1) // value check [first element] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5022667") .first() .getShort(2) == 15) // value check [partition boundary] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5036398") .first() .getShort(2) == 14) // value check [partition boundary] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5056356") .first() .getShort(2) == 1) // value check [last element] } }
Example 12
Source File: SequilaDatasourceStrategyTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.optimizations

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.BAMBaseTestSuite
import org.biodatageeks.sequila.utils.{Columns, SequilaRegister}

class SequilaDatasourceStrategyTestSuite extends BAMBaseTestSuite with SharedSparkContext {

  test("Test query with distinct sample optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    assert(
      ss.sql(s"SELECT distinct ${Columns.SAMPLE} FROM $tableNameBAM LIMIT 10")
        .count() === 1)
    assert(
      ss.sql(s"SELECT distinct ${Columns.SAMPLE} FROM $tableNameBAM LIMIT 10")
        .first()
        .getString(0) === "NA12878")
  }

  test("TEST query all columns with LIMIT optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    ss.sparkContext.setLogLevel("INFO")
    val sqlText = s"SELECT * FROM $tableNameBAM LIMIT 10"
    ss.time {
      ss
        .sql(sqlText)
        .show
    }
  }

  test("TEST query subset columns with LIMIT optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    ss.sparkContext.setLogLevel("INFO")
    val sqlText =
      s"SELECT ${Columns.QNAME},${Columns.SEQUENCE},${Columns.BASEQ} FROM $tableNameBAM LIMIT 10"
    ss.time {
      ss
        .sql(sqlText)
        .show
    }
  }
}
Example 13
Source File: TransformationTestWithSparkTestingBase.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter16.SparkTesting

import org.scalatest.Assertions._
import org.apache.spark.rdd.RDD
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class TransformationTestWithSparkTestingBase extends FunSuite with SharedSparkContext {
  def tokenize(line: RDD[String]) = {
    line.map(x => x.split(' ')).collect()
  }

  test("works, obviously!") {
    assert(1 == 1)
  }

  test("Words counting") {
    assert(sc.parallelize("Hello world My name is Reza".split("\\W")).map(_ + 1).count == 6)
  }

  test("Testing RDD transformations using a shared Spark Context") {
    val input = List("Testing", "RDD transformations", "using a shared", "Spark Context")
    val expected = Array(Array("Testing"), Array("RDD", "transformations"),
      Array("using", "a", "shared"), Array("Spark", "Context"))
    val transformed = tokenize(sc.parallelize(input))
    assert(transformed === expected)
  }
}
Example 14
Source File: LagDstrFactorySuite.scala From lagraph with Apache License 2.0
package com.ibm.lagraph.impl // TODO get rid of printlns // scalastyle:off println import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite import org.scalatest.Matchers import scala.reflect.ClassTag import scala.collection.mutable.{Map => MMap} import com.ibm.lagraph._ class LagDstrFactorySuite extends FunSuite with Matchers with SharedSparkContext { val DEBUG = false val denseGraphSizes = List(1 << 4, 1 << 5) // val sparseGraphSizes = List(1 << 16, 1 << 17, 1 << 29, 1 << 30) val sparseGraphSizes = List(1 << 16, 1 << 17, 1 << 26, 1 << 27) val nblocks = List(1 << 0, 1 << 1, 1 << 2, 1 << 3) test("test initializing spark context") { val hc: LagContext = LagContext.getLagDstrContext(sc, 1 << 3, 1) val list = nblocks val rdd = sc.parallelize(list) assert(rdd.count === list.length) } test("LagDstrContext.vIndices") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.vIndices", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val start = 2 val end = start + hc.graphSize val v = hc.vIndices(start) val vRes = hc.vToVector(v) assert(v.size == hc.graphSize) assert(vRes.size == (end - start)) (start until end.toInt).map { r => assert(vRes(r - start) == r) } } } } test("LagDstrContext.mIndices") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.mIndices", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val start = (2L, 2L) val m = hc.mIndices(start) val (mResMap, sparseValue) = hc.mToMap(m) val mRes = LagContext.vectorOfVectorFromMap(mResMap, sparseValue, m.size) val end = (start._1 + graphSize, start._2 + graphSize) assert(mRes.size == (end._1 - start._1)) mRes.zipWithIndex.map { case (vr, r) => { assert(vr.size == (end._2 - start._2)) vr.zipWithIndex.map { case (vc, c) => assert(vc == (start._1 + r, start._2 + c)) } } } } } } test("LagDstrContext.mReplicate") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.mReplicate", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val singleValue: Double = 99.0 val m = hc.mReplicate(singleValue) val (mResMap, sparseValue) = hc.mToMap(m) val mRes = LagContext.vectorOfVectorFromMap(mResMap, sparseValue, m.size) mRes.zipWithIndex.map { case (vr, r) => { assert(vr.size == graphSize) vr.zipWithIndex.map { case (vc, c) => assert(vc == singleValue) } } } } } } } // scalastyle:on println
Example 15
Source File: FilmsTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.example

import com.bloomberg.sparkflow.example.FilmsPipeline.FilmMain
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class FilmsTest extends FunSuite with SharedSparkContext {

  private def testFile(fileName: String): String = {
    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
  }

  test("pipeline") {
    val filmPipe = new FilmMain
    filmPipe.filmRows.getDF(sc).show()
    filmPipe.topActors.get(sc).foreach(println)
    filmPipe.filmsWithTopActors.getDataset(sc).show()
    println(filmPipe.filmsWithTopActors.count.get(sc))
  }
}
Example 16
Source File: HashingTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.serialization

import org.scalatest._
import com.bloomberg.sparkflow._
import com.bloomberg.sparkflow.serialization.HashingSample
import com.bloomberg.sparkflow.serialization.ClassExploration._
import com.bloomberg.sparkflow.serialization.Hashing._
import com.holdenkarau.spark.testing.SharedSparkContext

class HashingTest extends FunSuite with SharedSparkContext with ShouldMatchers {

  test("functionHashing") {
    var param = 7
    val input = 5

    val another = (x: Int) => x * 2
    val nested = (x: Int) => x * 4 + param + another(x)
    val g = (x: Int) => nested(x) + param

    val initialOutput = g(input)
    val initialGHash = hashClass(g)

    assert(initialGHash != hashClass(nested))
    assert(initialGHash != hashClass(another))
    assert(initialGHash == hashClass(g))

    param = 10
    assert(initialGHash != hashClass(g))
    assert(initialOutput != g(input))
  }

  test("dcHashing") {
    val numbers = parallelize(1 to 10)
    val filtered = numbers.filter(_ < 6)
    val doubled = filtered.map(_ * 2)
    val after = doubled.map(SomeFunctions.func4)

    val allSignatures = Set(numbers.getSignature,
      filtered.getSignature,
      doubled.getSignature,
      after.getSignature)

    assert(allSignatures.size == 4)
  }

  test("caseHashing") {
    // println(s"fieldObjects: ${getFieldObjects(HashingSample.result)}")
    // println(s"result: ${HashingSample.result.getSignature}")
    assert(HashingSample.result.getSignature.length > 0)
  }
}
Example 17
Source File: SecondaryPairDCFunctionsTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow._
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.{ShouldMatchers, FunSuite}

class SecondaryPairDCFunctionsTest extends FunSuite with SharedSparkContext with ShouldMatchers {

  test("testRepartAndSort") {
    val input = parallelize(Seq(
      (("a",3), 0),
      (("b",2), 0),
      (("b",1), 0),
      (("b",3), 0),
      (("a",2), 0),
      (("a",1), 0)))

    val sortAndRepart = input.repartitionAndSecondarySortWithinPartitions(2)
    val result = sortAndRepart.mapPartitions(it => Iterator(it.toList))

    val expected = Seq(
      List(
        (("a",1), 0),
        (("a",2), 0),
        (("a",3), 0)),
      List(
        (("b",1), 0),
        (("b",2), 0),
        (("b",3), 0)))

    expected should contain theSameElementsAs result.getRDD(sc).collect()
  }
}
Example 18
Source File: DRTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.scalatest._

import scala.util.Random

import com.bloomberg.sparkflow._

// The class declaration and the header of the first test were truncated in this
// listing; the wrapper below (including the first test's name) is a reconstruction.
class DRTest extends FunSuite with SharedSparkContext {

  test("mapToResult with an MLlib model") {
    val randomVecs = parallelize(1 to 100).map(i => Vectors.dense(Seq.fill(10)(Random.nextDouble()).toArray))
    val corpus = randomVecs.zipWithUniqueId().map { case (k, v) => (v, k) }
    val ldaModel = corpus.mapToResult(rdd => new LDA().setK(3).run(rdd))
  }

  test("regularSpark") {
    val numbers: RDD[Int] = sc.parallelize(1 to 10)
    val doubles: RDD[Double] = numbers.map(_.toDouble)
    val sum: Double = doubles.sum()
    val normalized: RDD[Double] = doubles.map(_ / sum)
  }
}
Example 19
Source File: LuceneRDDMoreLikeThisSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import scala.collection.JavaConverters._ import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import scala.io.Source class LuceneRDDMoreLikeThisSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } "LuceneRDD.moreLikeThis" should "return relevant documents" in { val words: Seq[String] = Source.fromFile("src/test/resources/alice.txt") .getLines().map(_.toLowerCase).toSeq val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) val results = luceneRDD .moreLikeThis("_1", "alice adventures wonderland", 1, 1) .collect() results.length > 0 should equal(true) val firstDoc = results.head val x = firstDoc.getString(firstDoc.fieldIndex("_1")) x.contains("alice") && x.contains("wonderland") && x.contains("adventures") should equal(true) val lastDoc = results.last val y = lastDoc.getString(lastDoc.fieldIndex("_1")) y.contains("alice") && !y.contains("wonderland") && !y.contains("adventures") should equal(true) } }
Example 20
Source File: LucenePrimitiveTypesSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} class LucenePrimitiveTypesSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString val array = (1 to 24).map(randomString(_)) var luceneRDD: LuceneRDD[_] = _ override def afterEach() { luceneRDD.close() } "LuceneRDD" should "work with RDD[Array[String]]" in { val array = Array(Array("aaa", "aaa2"), Array("bbb", "bbb2"), Array("ccc", "ccc2"), Array("ddd"), Array("eee")) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[Set[String]]" in { val array = Array(Set("aaa", "aaa2"), Set("bbb", "bbb2"), Set("ccc", "ccc2"), Set("ddd"), Set("eee")) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[String]" in { val array = Array("aaa", "bbb", "ccc", "ddd", "eee") val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[Int]" in { val array = (1 to 22) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Float]" in { val array: IndexedSeq[Float] = (1 to 22).map(_.toFloat) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Double]" in { val array: IndexedSeq[Double] = (1 to 22).map(_.toDouble) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Long]" in { val array: IndexedSeq[Long] = (1 to 22).map(_.toLong) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD" should "work with RDD[Map[String, String]]" in { val maps = List(Map( "a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you")) val rdd = sc.parallelize(maps) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (maps.size) luceneRDD.termQuery("a", "hello").isEmpty() should equal (false) luceneRDD.prefixQuery("b", "wor").isEmpty() should equal (false) luceneRDD.prefixQuery("a", "no").isEmpty() should equal (true) } "LuceneRDD" should "work with RDD[String] and ignore null values" in { val array = Array("aaa", null, "ccc", null, "eee") val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } }
Example 21
Source File: BlockingLinkageSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.lucene.index.Term import org.apache.lucene.search.{Query, TermQuery} import org.apache.spark.SparkConf import org.apache.spark.sql.{Row, SparkSession} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class BlockingLinkageSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD.blockEntityLinkage" should "deduplicate elements on unique elements" in { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val peopleLeft: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val peopleRight: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val leftDF = sc.parallelize(peopleLeft).repartition(2).toDF() val rightDF = sc.parallelize(peopleRight).repartition(3).toDF() // Define a Lucene Term linker val linker: Row => Query = { row => val name = row.getString(row.fieldIndex("name")) val term = new Term("name", name) new TermQuery(term) } val linked = LuceneRDD.blockEntityLinkage(leftDF, rightDF, linker, Array("email"), Array("email")) val linkedCount, dfCount = (linked.count, leftDF.count()) linkedCount should equal(dfCount) // Check for correctness // Age is a unique index linked.collect().foreach { case (row, results) => val leftAge, rightAge = (row.getInt(row.fieldIndex("age")), results.headOption.map(x => x.getInt(x.fieldIndex("age")))) leftAge should equal(rightAge) } } }
Example 22
Source File: LuceneRDDCustomCaseClassImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class LuceneRDDCustomCaseClassImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def afterEach() { luceneRDD.close() } override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) val elem: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => Person(str, index, s"${str}@gmail.com")} "LuceneRDD(case class).count" should "handle nulls properly" in { val elemsWithNulls = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => Person(str, index, null)} val rdd = sc.parallelize(elemsWithNulls) luceneRDD = LuceneRDD(rdd) luceneRDD.count() should equal (elemsWithNulls.length) } "LuceneRDD(case class).count" should "return correct number of elements" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) luceneRDD.count() should equal (elem.length) } "LuceneRDD(case class).fields" should "return all fields" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) luceneRDD.fields().size should equal(3) luceneRDD.fields().contains("name") should equal(true) luceneRDD.fields().contains("age") should equal(true) luceneRDD.fields().contains("email") should equal(true) } "LuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery("name", "water") results.count() should equal(1) } }
Example 23
Source File: ShapeLuceneRDDImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd.spatial.shape.implicits import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.spatial.shape.{ShapeLuceneRDD, _} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils import org.zouzias.spark.lucenerdd._ import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader class ShapeLuceneRDDImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext with ContextLoader with LuceneRDDTestUtils { val Radius: Double = 5D override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "ShapeLuceneRDDImplicits" should "implicitly convert to point" in { val rdd = sc.parallelize(cities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(cities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert to circle" in { val circleCities: Array[(((Double, Double), Double), String)] = cities.map(convertToCircle) val rdd = sc.parallelize(circleCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(circleCities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert to rectangle" in { val rectangleCities = cities.map(convertToRectangle) val rdd = sc.parallelize(rectangleCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(rectangleCities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert POINTS from WKT" in { val sparkSession = SparkSession.builder().getOrCreate() val citiesDF = sparkSession.read.parquet("data/world-cities-points.parquet") import sparkSession.implicits._ val citiesRDD = citiesDF.map(row => (row.getString(2), (row.getString(0), row.getString(1)))) val total = citiesDF.count() total > 0 should equal(true) val shapeRDD = ShapeLuceneRDD(citiesRDD) shapeRDD.count > 0 should equal(true) } "ShapeLuceneRDDImplicits" should "implicitly convert BBOX from WKT" in { val sparkSession = SparkSession.builder().getOrCreate() import sparkSession.implicits._ val countriesDF = sparkSession.read.parquet("data/countries-bbox.parquet") val citiesRDD = countriesDF.map(row => (row.getString(2), (row.getString(0), row.getString(1)))) val total = countriesDF.count() total > 0 should equal(true) val shapeRDD = ShapeLuceneRDD(citiesRDD) shapeRDD.count > 0 should equal(true) } "ShapeLuceneRDDImplicits" should "implicitly convert to polygon" in { val polygonCities = cities.map(convertToPolygon(_, Radius)) val rdd = sc.parallelize(polygonCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(polygonCities.length) } }
Example 24
Source File: LuceneRDDSearchSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils class LuceneRDDSearchSpec extends FlatSpec with Matchers with BeforeAndAfterEach with LuceneRDDTestUtils with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def Radius: Double = 0 override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val First = "_1" val array = List("fear", "death", " apologies", "romance", "tree", "fashion", "fascism") "LuceneRDD.query" should "use phrase query syntax" in { val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty") val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) luceneRDD.query("_1:aadaa").isEmpty() should equal (false) luceneRDD.query("_1:aa*").count() should equal (4) luceneRDD.query("_1:q*").count() should equal (1) } "LuceneRDD.count" should "return correct number of elements" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD.termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count() should equal (1) } "LuceneRDD.prefixQuery" should "correctly search with PrefixQueries" in { val prefices = Array("aaaabcd", "aaadcb", "aaz", "az", "qwerty") val rdd = sc.parallelize(prefices) luceneRDD = LuceneRDD(rdd) luceneRDD.prefixQuery(First, "a").count() should equal (4) luceneRDD.prefixQuery(First, "aa").count() should equal(3) luceneRDD.prefixQuery(First, "aaa").count() should equal (2) luceneRDD.prefixQuery(First, "aaaa").count() should equal (1) } "LuceneRDD.fuzzyQuery" should "correctly search with FuzzyQuery" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.fuzzyQuery(First, "fear", 1).count() should equal (1) luceneRDD.fuzzyQuery(First, "fascsm", 1).count() should equal(1) luceneRDD.fuzzyQuery(First, "dath", 1).count() should equal (1) luceneRDD.fuzzyQuery(First, "tree", 1).count() should equal (1) } "LuceneRDD.phraseQuery" should "correctly search with PhraseQuery" in { val phrases = Array("hello world", "the company name was", "highlight lucene") val rdd = sc.parallelize(phrases) luceneRDD = LuceneRDD(rdd) luceneRDD.phraseQuery(First, "company name", 10).count() should equal (1) luceneRDD.phraseQuery(First, "hello world", 10).count() should equal (1) luceneRDD.phraseQuery(First, "highlight lucene", 10).count() should equal(1) } }
Example 25
Source File: BlockingDedupSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.lucene.index.Term import org.apache.lucene.search.{Query, TermQuery} import org.apache.spark.SparkConf import org.apache.spark.sql.{Row, SparkSession} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class BlockingDedupSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD.blockDedup" should "deduplicate elements on unique elements" in { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val people: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val df = sc.parallelize(people).repartition(2).toDF() val linker: Row => Query = { row => val name = row.getString(row.fieldIndex("name")) val term = new Term("name", name) new TermQuery(term) } val linked = LuceneRDD.blockDedup(df, linker, Array("email")) val linkedCount, dfCount = (linked.count, df.count()) linkedCount should equal(dfCount) // Check for correctness // Age is a unique index linked.collect().foreach { case (row, results) => val leftAge, rightAge = (row.getInt(row.fieldIndex("age")), results.headOption.map(x => x.getInt(x.fieldIndex("age")))) leftAge should equal(rightAge) } } }
Example 26
Source File: LuceneRDDTermVectorsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils class LuceneRDDTermVectorsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with LuceneRDDTestUtils with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def Radius: Double = 0 override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val First = "_1" "LuceneRDD.termVectors" should "return valid terms" in { val words = Array("To smile or not to smile smile", "Don't cry because it's over, smile because it happened", "So many books, so little time", "A room without books is like a body without a soul", "If you tell the truth, you don't have to remember anything") val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) val terms = luceneRDD.termVectors(First).collect() // These terms should exist terms.exists(_.term.compareToIgnoreCase("time") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("room") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("soul") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("smile") == 0) should equal(true) terms.exists(t => (t.term.compareToIgnoreCase("smile") == 0) && t.count == 3) should equal (true) terms.exists(t => (t.term.compareToIgnoreCase("becaus") == 0) && t.count == 2) should equal (true) } }
Example 27
Source File: LuceneRDDTuplesSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{FlatSpec, Matchers} class LuceneRDDTuplesSpec extends FlatSpec with Matchers with SharedSparkContext { val First = "_1" val Second = "_2" val array = List("fear", "death", " apology", "romance", "tree", "fashion", "fascism") override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD" should "work with Tuple2" in { val rdd = sc.parallelize(array).map(x => (x, x)) val luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD" should "work with Tuple3" in { val rdd = sc.parallelize(array).map(x => (x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple4" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple5" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple6" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple7" in { val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array.head) results.count should equal (1) } "LuceneRDD" should "work with Tuple8" in { val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x, 3.4)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count should equal (1) } "LuceneRDD" should "work with mixed types in Tuples" in { val rdd = sc.parallelize(array).map(x => (x, 1, x, 2L, x, 3.0F)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count should equal (1) } }
Example 28
Source File: FacetedLuceneRDDImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd.facets import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator} class FacetedLuceneRDDImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val elem = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => FavoriteCaseClass(str, index, 10L, 12.3F, s"${str}@gmail.com")} "FacetedLuceneRDD(case class).count" should "return correct number of elements" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) luceneRDD.count should equal (elem.size) } "FacetedLuceneRDD(case class).fields" should "return all fields" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) luceneRDD.fields().size should equal(5) luceneRDD.fields().contains("name") should equal(true) luceneRDD.fields().contains("age") should equal(true) luceneRDD.fields().contains("myLong") should equal(true) luceneRDD.fields().contains("myFloat") should equal(true) luceneRDD.fields().contains("email") should equal(true) } "FacetedLuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) val results = luceneRDD.termQuery("name", "water") results.count() should equal(1) } }
Example 29
Source File: SparkCassBulkWriterSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra import com.datastax.driver.core.querybuilder.QueryBuilder import com.datastax.spark.connector.AllColumns import com.datastax.spark.connector.writer.{ RowWriterFactory, SqlRowWriter } import com.github.jparkie.spark.cassandra.client.SparkCassSSTableLoaderClientManager import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf } import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.sql.{ Row, SQLContext } import org.scalatest.{ MustMatchers, WordSpec } import scala.collection.JavaConverters._ class SparkCassBulkWriterSpec extends WordSpec with MustMatchers with CassandraServerSpecLike with SharedSparkContext { val testKeyspace = "test_keyspace" val testTable = "test_table" override def beforeAll(): Unit = { super.beforeAll() getCassandraConnector.withSessionDo { currentSession => createKeyspace(currentSession, testKeyspace) currentSession.execute( s"""CREATE TABLE $testKeyspace.$testTable ( | test_key BIGINT PRIMARY KEY, | test_value VARCHAR |); """.stripMargin ) } } "SparkCassBulkWriter" must { "write() successfully" in { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ implicit val testRowWriterFactory: RowWriterFactory[Row] = SqlRowWriter.Factory val testCassandraConnector = getCassandraConnector val testSparkCassWriteConf = SparkCassWriteConf() val testSparkCassServerConf = SparkCassServerConf( // See https://github.com/jsevellec/cassandra-unit/blob/master/cassandra-unit/src/main/resources/cu-cassandra.yaml storagePort = 7010 ) val testSparkCassBulkWriter = SparkCassBulkWriter( testCassandraConnector, testKeyspace, testTable, AllColumns, testSparkCassWriteConf, testSparkCassServerConf ) val testRDD = sc.parallelize(1 to 25) .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!")) val testDataFrame = testRDD.toDF("test_key", "test_value") sc.runJob(testDataFrame.rdd, testSparkCassBulkWriter.write _) getCassandraConnector.withSessionDo { currentSession => val queryStatement = QueryBuilder.select("test_key", "test_value") .from(testKeyspace, testTable) .limit(25) val resultSet = currentSession.execute(queryStatement) val outputSet = resultSet.all.asScala .map(currentRow => (currentRow.getLong("test_key"), currentRow.getString("test_value"))) .toMap for (currentNumber <- 1 to 25) { val currentKey = currentNumber.toLong outputSet(currentKey) mustEqual s"Hello World: $currentNumber!" } } SparkCassSSTableLoaderClientManager.evictAll() } } }
Example 30
Source File: SparkCassDataFrameFunctionsSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.sql

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.scalatest.{ MustMatchers, WordSpec }

class SparkCassDataFrameFunctionsSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.cassandra.sql" must {
    "lift DataFrame into SparkCassDataFrameFunctions" in {
      val sqlContext = new SQLContext(sc)

      import sqlContext.implicits._

      val testRDD = sc.parallelize(1 to 25)
        .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!"))
      val testDataFrame = testRDD.toDF("test_key", "test_value")

      // If internalSparkContext is available, RDD was lifted.
      testDataFrame.internalSparkContext
    }
  }
}
Example 31
Source File: SparkCassRDDFunctionsSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.rdd

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.{ MustMatchers, WordSpec }

class SparkCassRDDFunctionsSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.cassandra.rdd" must {
    "lift RDD into SparkCassRDDFunctions" in {
      val testRDD = sc.parallelize(1 to 25)
        .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!"))

      // If internalSparkContext is available, RDD was lifted.
      testRDD.internalSparkContext
    }
  }
}
Example 32
Source File: PointRDDExtensionsSpec.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.extensions

import com.holdenkarau.spark.testing.SharedSparkContext
import com.pygmalios.reactiveinflux.Point.Measurement
import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.extensions.PointRDDExtensionsSpec._
import com.pygmalios.reactiveinflux.spark._
import com.pygmalios.reactiveinflux.spark.extensions.PointRDDExtensions
import org.joda.time.{DateTime, DateTimeZone}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.concurrent.duration._

@RunWith(classOf[JUnitRunner])
class PointRDDExtensionsSpec extends FlatSpec with SharedSparkContext with BeforeAndAfter {

  before {
    withInflux(_.create())
  }

  after {
    withInflux(_.drop())
  }

  behavior of "saveToInflux"

  it should "write single point to Influx" in {
    val points = List(point1)
    val rdd = sc.parallelize(points)

    // Execute
    rdd.saveToInflux()

    // Assert
    assert(PointRDDExtensions.totalBatchCount == 1)
    assert(PointRDDExtensions.totalPointCount == 1)

    val result = withInflux(
      _.query(Query(s"SELECT * FROM $measurement1"))
        .result
        .singleSeries)

    assert(result.rows.size == 1)

    val row = result.rows.head
    assert(row.time == point1.time)
    assert(row.values.size == 5)
  }

  it should "write 1000 points to Influx" in {
    val points = (1 to 1000).map { i =>
      Point(
        time = point1.time.plusMinutes(i),
        measurement = point1.measurement,
        tags = point1.tags,
        fields = point1.fields
      )
    }
    val rdd = sc.parallelize(points)

    // Execute
    rdd.saveToInflux()

    // Assert
    assert(PointRDDExtensions.totalBatchCount == 8)
    assert(PointRDDExtensions.totalPointCount == 1000)

    val result = withInflux(
      _.query(Query(s"SELECT * FROM $measurement1"))
        .result
        .singleSeries)

    assert(result.rows.size == 1000)
  }
}

object PointRDDExtensionsSpec {
  implicit val params: ReactiveInfluxDbName = ReactiveInfluxDbName("test")
  implicit val awaitAtMost: Duration = 1.second

  val measurement1: Measurement = "measurement1"
  val point1 = Point(
    time = new DateTime(1983, 1, 10, 7, 43, 10, 3, DateTimeZone.UTC),
    measurement = measurement1,
    tags = Map("tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"),
    fields = Map(
      "fieldKey1" -> StringFieldValue("fieldValue1"),
      "fieldKey2" -> BigDecimalFieldValue(10.7)))
}
Example 33
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class KNNPropSpec extends PropSpec with SharedSparkContext {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(vs =>
        VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10))
}
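KNNPropSpec only defines Arbitrary instances and generators; a concrete spec would combine them with ScalaTest's generator-driven property checks. The class below is a hypothetical usage sketch, not part of spark-tda, and assumes a ScalaTest version that still ships org.scalatest.prop.GeneratorDrivenPropertyChecks.

import org.apache.spark.ml.linalg.DenseVector
import org.scalatest.prop.GeneratorDrivenPropertyChecks

// The implicit Arbitrary[DenseVector] inherited from KNNPropSpec supplies the
// generated input, and the shared sc from SharedSparkContext runs the Spark job.
class DenseVectorPropSpec extends KNNPropSpec with GeneratorDrivenPropertyChecks {
  property("parallelizing generated vectors preserves their count") {
    forAll { (vectors: List[DenseVector]) =>
      assert(sc.parallelize(vectors).count() == vectors.length)
    }
  }
}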
Example 34
Source File: DistributedPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed

import scala.reflect.ClassTag
import org.scalacheck.Gen
import org.scalacheck.Gen.{choose, listOfN}
import org.scalatest.PropSpec
import org.apache.spark.mllib.linalg.DenseVector
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class DistributedPropSpec extends PropSpec with SharedSparkContext {
  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val coordinateMatrixGen = for {
    lrow <- choose(5, 10)
    lcol <- choose(5, 10)
    lvecs <- vectorsOfNM(lrow, lcol, choose(-10.0, 10.0))
    rrow <- choose(5, 10)
    rcol <- choose(5, 10)
    rvecs <- vectorsOfNM(rrow, rcol, choose(-10.0, 10.0))
  } yield (
    new IndexedRowMatrix(sc.parallelize(lvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix,
    new IndexedRowMatrix(sc.parallelize(rvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix
  )
}
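Unlike the KNN base class, DistributedPropSpec exposes an explicit Gen rather than an Arbitrary, so a subclass would pass it to forAll directly. The class below is a hypothetical sketch, not part of spark-tda, and again assumes GeneratorDrivenPropertyChecks is available; the property itself only checks loose bounds implied by the generator.

import org.scalatest.prop.GeneratorDrivenPropertyChecks

// Pass coordinateMatrixGen to forAll explicitly instead of relying on an implicit Arbitrary.
class CoordinateMatrixPropSpec extends DistributedPropSpec with GeneratorDrivenPropertyChecks {
  property("generated coordinate matrix pairs are non-empty and within the sampled bounds") {
    forAll(coordinateMatrixGen) { case (left, right) =>
      assert(left.entries.count() > 0 && right.entries.count() > 0)
      assert(left.numRows() <= 10 && right.numRows() <= 10)
    }
  }
}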
Example 35
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing

import java.io.{
  BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons {

  var hdfsCluster: HDFSCluster = null

  override def beforeAll(): Unit = {
    super.beforeAll()
    hdfsCluster = new HDFSCluster
    hdfsCluster.startHDFS()
  }

  test("get the namenode uri") {
    val nameNodeURI = hdfsCluster.getNameNodeURI()
    assert(nameNodeURI == "hdfs://localhost:8020")
  }

  test("read and write from spark to hdfs") {
    val list = List(1, 2, 3, 4, 5)
    val numRDD: RDD[Int] = sc.parallelize(list)

    val path = hdfsCluster.getNameNodeURI() + "/myRDD"
    numRDD.saveAsTextFile(path)

    val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt)
    assertRDDEquals(numRDD, loadedRDD)
  }

  test("test creating local file to hdfs") {
    val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile")

    val fs = FileSystem.get(path.toUri, new Configuration())

    val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path)))
    val writtenString = "hello, it's me"
    writer.write(writtenString)
    writer.close()

    val reader = new BufferedReader(new InputStreamReader(fs.open(path)))
    val readString = reader.readLine()
    reader.close()

    assert(writtenString == readString)
  }

  override def afterAll() {
    hdfsCluster.shutdownHDFS()
    super.afterAll()
  }
}
Example 36
Source File: WordCountTest.scala From sparkProjectTemplate.g8 with Apache License 2.0 | 5 votes |
package $organization$.$name$

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class WordCountTest extends FunSuite with SharedSparkContext {
  test("word count with Stop Words Removed") {
    val linesRDD = sc.parallelize(Seq(
      "How happy was the panda? You ask.",
      "Panda is the most happy panda in all the#!?ing land!"))

    val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he")
    val splitTokens: Array[Char] = "#%?!. ".toCharArray

    val wordCounts = WordCount.withStopWordsFiltered(
      linesRDD, splitTokens, stopWords)
    val wordCountsAsMap = wordCounts.collectAsMap()

    assert(!wordCountsAsMap.contains("the"))
    assert(!wordCountsAsMap.contains("?"))
    assert(!wordCountsAsMap.contains("#!?ing"))
    assert(wordCountsAsMap.contains("ing"))
    assert(wordCountsAsMap.get("panda").get.equals(3))
  }
}
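The test exercises WordCount.withStopWordsFiltered, whose implementation lives in the sparkProjectTemplate.g8 sources and is not shown here. The object below is a hypothetical sketch that is merely consistent with the assertions above (split on the supplied separator characters, lowercase, drop empty tokens and stop words); the real template code may differ.

import org.apache.spark.rdd.RDD

object WordCount {
  // Hypothetical sketch only: tokenize each line on the given separators, normalize
  // case, filter out empty tokens and stop words, then count occurrences per word.
  def withStopWordsFiltered(
      lines: RDD[String],
      separators: Array[Char],
      stopWords: Set[String]): RDD[(String, Int)] = {
    lines
      .flatMap(line => line.split(separators).map(_.toLowerCase))
      .filter(token => token.nonEmpty && !stopWords.contains(token))
      .map(token => (token, 1))
      .reduceByKey(_ + _)
  }
}

With this sketch, "panda?" and "the#!?ing" are broken apart by the separator characters, so the assertions on "panda" (count 3), "ing", and the absence of "the", "?", and "#!?ing" all hold.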