com.holdenkarau.spark.testing.SharedSparkContext Scala Examples
The following examples show how to use com.holdenkarau.spark.testing.SharedSparkContext.
Each example lists its source file, the project it comes from, and that project's license, so you can trace it back to the original code.
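All of the suites below follow the same basic pattern: mix SharedSparkContext into a ScalaTest suite and use the SparkContext it exposes as sc inside each test. The trait starts a single local SparkContext before the first test of a suite and stops it after the last one, so the tests avoid paying the context start-up cost repeatedly. The minimal sketch below illustrates the pattern; the suite name and the word-count logic are invented for this illustration and are not taken from any of the projects listed here.

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

// Minimal illustrative suite: SharedSparkContext supplies `sc`, a local
// SparkContext shared by every test in the suite.
class WordCountSpec extends FunSuite with SharedSparkContext {

  test("counts words in a small RDD") {
    val lines = sc.parallelize(Seq("hello world", "hello spark"))
    val counts = lines
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collectAsMap()

    assert(counts("hello") === 2)
    assert(counts("world") === 1)
  }
}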
Example 1
Source File: JoinOrderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.rangejoins import java.io.{OutputStreamWriter, PrintWriter} import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType } import org.bdgenomics.utils.instrumentation.{ Metrics, MetricsListener, RecordedMetrics } import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.scalatest.{BeforeAndAfter, FunSuite} class JoinOrderTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { val schema = StructType( Seq(StructField("chr", StringType), StructField("start", IntegerType), StructField("end", IntegerType))) val metricsListener = new MetricsListener(new RecordedMetrics()) val writer = new PrintWriter(new OutputStreamWriter(System.out)) before { System.setSecurityManager(null) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim( spark) :: Nil Metrics.initialize(sc) val rdd1 = sc .textFile(getClass.getResource("/refFlat.txt.bz2").getPath) .map(r => r.split('\t')) .map( r => Row( r(2).toString, r(4).toInt, r(5).toInt )) val ref = spark.createDataFrame(rdd1, schema) ref.createOrReplaceTempView("ref") val rdd2 = sc .textFile(getClass.getResource("/snp150Flagged.txt.bz2").getPath) .map(r => r.split('\t')) .map( r => Row( r(1).toString, r(2).toInt, r(3).toInt )) val snp = spark .createDataFrame(rdd2, schema) snp.createOrReplaceTempView("snp") } test("Join order - broadcasting snp table") { spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true") val query = s""" |SELECT snp.*,ref.* FROM ref JOIN snp |ON (ref.chr=snp.chr AND snp.end>=ref.start AND snp.start<=ref.end) """.stripMargin assert(spark.sql(query).count === 616404L) } test("Join order - broadcasting ref table") { spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true") val query = s""" |SELECT snp.*,ref.* FROM snp JOIN ref |ON (ref.chr=snp.chr AND snp.end>=ref.start AND snp.start<=ref.end) """.stripMargin assert(spark.sql(query).count === 616404L) } after { Metrics.print(writer, Some(metricsListener.metrics.sparkMetrics.stageTimes)) writer.flush() Metrics.stopRecording() } }
Example 2
Source File: PackageSpec.scala From Spark2Elasticsearch with Apache License 2.0
package com.github.jparkie.spark.elasticsearch.sql

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.scalatest.{ MustMatchers, WordSpec }

class PackageSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.elasticsearch.sql" must {
    "lift DataFrame into SparkEsDataFrameFunctions" in {
      val sqlContext = new SQLContext(sc)

      val inputData = Seq(
        ("TEST_VALUE_1", 1),
        ("TEST_VALUE_2", 2),
        ("TEST_VALUE_3", 3)
      )

      val outputDataFrame = sqlContext.createDataFrame(inputData)
        .toDF("key", "value")

      // If sparkContext is available, DataFrame was lifted into SparkEsDataFrameFunctions.
      outputDataFrame.sparkContext
    }
  }
}
Example 3
Source File: AnomalyDetection$Test.scala From spark-anomaly-detection with MIT License
package com.micvog.ml import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalactic.Equality import org.scalatest.{FlatSpec, FunSuite, Matchers} class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext { { val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062)) val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362)) val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878)) "probFunction" should "return correct product value" in { val p = AnomalyDetection.probFunction(point, means, variances) assert(p === 0.0769984879544 +- 0.0001) } "predict" should "predict the anomaly" in { assert(!AnomalyDetection.predict(point, means, variances, 0.05)) } "predict" should "predict non anomaly" in { assert(AnomalyDetection.predict(point, means, variances, 0.08)) } } private def vectorequality() = { new Equality[Vector] { def areEqual(a: Vector, b: Any): Boolean = b match { case v: Vector => v.toArray.zip(a.toArray).map(pair => pair._1 === pair._2 +- 0.001).reduce((a, b) => a && b) case _ => false } } } def trainModel(): AnomalyDetectionModel = { val trainingExamplesFilePath = "./src/test/resources/training.csv" val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache() val trainingRdd = FeaturesParser.parseFeatures(trainingData) new AnomalyDetection().run(trainingRdd) } "run" should "return model with correct mean and variance" in { val model: AnomalyDetectionModel = trainModel() //use scalactic's more relaxing equality implicit val vectorEq = vectorequality() assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755))) assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375))) } "optimize" should "calculate epsilon and F1 score" in { val cvFilePath = "./src/test/resources/cross_val.csv" val cvData = sc.textFile(cvFilePath, 2).cache() val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val model = trainModel() val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model) assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001) } }
Example 4
Source File: BEDBaseTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.base

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

class BEDBaseTestSuite extends FunSuite with DataFrameSuiteBase with SharedSparkContext with BeforeAndAfter {

  val bedPath: String = getClass.getResource("/bed/test.bed").getPath
  val tableNameBED = "targets"
  val bedSimplePath: String = getClass.getResource("/bed/simple.bed").getPath
  val tableNameSimpleBED = "simple_targets"

  before {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameBED")
    spark.sql(s"""
                 |CREATE TABLE $tableNameBED
                 |USING org.biodatageeks.sequila.datasources.BED.BEDDataSource
                 |OPTIONS(path "$bedPath")
                 |
      """.stripMargin)

    spark.sql(s"DROP TABLE IF EXISTS $tableNameSimpleBED")
    spark.sql(s"""
                 |CREATE TABLE $tableNameSimpleBED
                 |USING org.biodatageeks.sequila.datasources.BED.BEDDataSource
                 |OPTIONS(path "$bedSimplePath")
                 |
      """.stripMargin)
  }

  // BeforeAndAfter cleanup hook: drop the test tables after each test.
  after {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameBED")
    spark.sql(s"DROP TABLE IF EXISTS $tableNameSimpleBED")
  }
}
Example 5
Source File: BEDReaderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.BEDBaseTestSuite
import org.biodatageeks.sequila.utils.SequilaRegister

class BEDReaderTestSuite extends BEDBaseTestSuite with SharedSparkContext {

  test("Read BED file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameBED}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "22")
    assert(res.getInt(1) === 1000 + 1) //test 1-based
    assert(res.getInt(2) === 5000)
    assert(res.getString(5) === "+")
    assert(res.getAs[Array[Int]](10) === Array(567, 488))
  }

  test("Read Simple BED file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameSimpleBED}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "11")
    assert(res.getInt(1) === 1000 + 1) //test 1-based
    assert(res.getInt(2) === 5000)
    assert(res.getString(3) === null)
  }
}
Example 6
Source File: FASTQReaderTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.FASTQBaseTestSuite
import org.biodatageeks.sequila.utils.SequilaRegister

class FASTQReaderTestSuite extends FASTQBaseTestSuite with SharedSparkContext {

  test("Read FASTQ file") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    val sqlText = s"SELECT * FROM ${tableNameFASTQ}"
    ss
      .sql(sqlText)
      .show()
    val res = ss
      .sql(sqlText)
      .first()
    assert(res.getString(0) === "NA12988")
    assert(res.getBoolean(8) === false)
    assert(res.getString(11) == "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT")
    assert(res.getString(12) == "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65")
  }
}
Example 7
Source File: VCFDataSourceTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.datasources

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.biodatageeks.sequila.utils.Columns
import org.scalatest.{BeforeAndAfter, FunSuite}

class VCFDataSourceTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext {

  val vcfPath: String = getClass.getResource("/vcf/test.vcf").getPath
  val tableNameVCF = "variants"

  before {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameVCF")
    spark.sql(s"""
                 |CREATE TABLE $tableNameVCF
                 |USING org.biodatageeks.sequila.datasources.VCF.VCFDataSource
                 |OPTIONS(path "$vcfPath")
                 |
      """.stripMargin)
  }

  test("VCF - Row count VCFDataSource") {
    val query = s"SELECT * FROM $tableNameVCF"
    spark
      .sql(query)
      .printSchema()

    assert(
      spark
        .sql(query)
        .first()
        .getString(0) === "20")

    assert(spark.sql(query).count() === 7L)
  }

  after {
    spark.sql(s"DROP TABLE IF EXISTS $tableNameVCF")
  }
}
Example 8
Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.pileup import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType} import org.scalatest.{BeforeAndAfter, FunSuite} class PileupTestBase extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext{ val sampleId = "NA12878.multichrom.md" val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath val tableName = "reads_bam" val tableNameCRAM = "reads_cram" val schema: StructType = StructType( List( StructField("contig", StringType, nullable = true), StructField("position", IntegerType, nullable = true), StructField("reference", StringType, nullable = true), StructField("coverage", ShortType, nullable = true), StructField("pileup", StringType, nullable = true), StructField("quality", StringType, nullable = true) ) ) before { System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator") spark .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql( s""" |CREATE TABLE $tableName |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM") spark.sql( s""" |CREATE TABLE $tableNameCRAM |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource |OPTIONS(path "$cramPath", refPath "$referencePath" ) | """.stripMargin) val mapToString = (map: Map[Byte, Short]) => { if (map == null) "null" else map.map({ case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":") } val byteToString = ((byte: Byte) => byte.toString) spark.udf.register("mapToString", mapToString) spark.udf.register("byteToString", byteToString) } }
Example 9
Source File: FeatureCountsTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.rangejoins import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import htsjdk.samtools.ValidationStringency import org.apache.hadoop.io.LongWritable import org.biodatageeks.sequila.apps.FeatureCounts.Region import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs} import org.scalatest.{BeforeAndAfter, FunSuite} import org.seqdoop.hadoop_bam.util.SAMHeaderReader import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable} class FeatureCountsTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { before { System.setSecurityManager(null) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim( spark) :: Nil } test("Feature counts for chr1:20138-20294") { val query = s""" | SELECT count(*),targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | FROM reads JOIN targets |ON ( | targets.${Columns.CONTIG}=reads.${Columns.CONTIG} | AND | reads.${Columns.END} >= targets.${Columns.START} | AND | reads.${Columns.START} <= targets.${Columns.END} |) | GROUP BY targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | HAVING ${Columns.CONTIG}='1' AND ${Columns.START} = 20138 AND ${Columns.END} = 20294""".stripMargin spark.sparkContext.hadoopConfiguration.set( SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) val alignments = spark.sparkContext .newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]( getClass.getResource("/NA12878.slice.bam").getPath) .map(_._2.get) .map(r => Region(DataQualityFuncs.cleanContig(r.getContig), r.getStart, r.getEnd)) val reads = spark.sqlContext .createDataFrame(alignments) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) reads.createOrReplaceTempView("reads") val targets = spark.sqlContext .createDataFrame(Array(Region("1", 20138, 20294))) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) targets.createOrReplaceTempView("targets") spark.sql(query).explain(false) assert(spark.sql(query).first().getLong(0) === 1484L) } }
Example 10
Source File: SparkEsBulkWriterSpec.scala From Spark2Elasticsearch with Apache License 2.0
package com.github.jparkie.spark.elasticsearch

import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.sql.{ SparkEsDataFrameMapper, SparkEsDataFrameSerializer }
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }
import org.scalatest.{ MustMatchers, WordSpec }

class SparkEsBulkWriterSpec extends WordSpec with MustMatchers with SharedSparkContext {
  val esServer = new ElasticSearchServer()

  override def beforeAll(): Unit = {
    super.beforeAll()

    esServer.start()
  }

  override def afterAll(): Unit = {
    esServer.stop()

    super.afterAll()
  }

  "SparkEsBulkWriter" must {
    "execute write() successfully" in {
      esServer.createAndWaitForIndex("test_index")

      val sqlContext = new SQLContext(sc)

      val inputSparkEsWriteConf = SparkEsWriteConf(
        bulkActions = 10,
        bulkSizeInMB = 1,
        concurrentRequests = 0,
        flushTimeoutInSeconds = 1
      )
      val inputMapperConf = SparkEsMapperConf(
        esMappingId = Some("id"),
        esMappingParent = None,
        esMappingVersion = None,
        esMappingVersionType = None,
        esMappingRouting = None,
        esMappingTTLInMillis = None,
        esMappingTimestamp = None
      )
      val inputSchema = StructType(
        Array(
          StructField("id", StringType, true),
          StructField("parent", StringType, true),
          StructField("version", LongType, true),
          StructField("routing", StringType, true),
          StructField("ttl", LongType, true),
          StructField("timestamp", StringType, true),
          StructField("value", LongType, true)
        )
      )
      val inputData = sc.parallelize {
        Array(
          Row("TEST_ID_1", "TEST_PARENT_1", 1L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 1L),
          Row("TEST_ID_1", "TEST_PARENT_2", 2L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 2L),
          Row("TEST_ID_1", "TEST_PARENT_3", 3L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 3L),
          Row("TEST_ID_1", "TEST_PARENT_4", 4L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 4L),
          Row("TEST_ID_1", "TEST_PARENT_5", 5L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 5L),
          Row("TEST_ID_5", "TEST_PARENT_6", 6L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 6L),
          Row("TEST_ID_6", "TEST_PARENT_7", 7L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 7L),
          Row("TEST_ID_7", "TEST_PARENT_8", 8L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 8L),
          Row("TEST_ID_8", "TEST_PARENT_9", 9L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 9L),
          Row("TEST_ID_9", "TEST_PARENT_10", 10L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 10L),
          Row("TEST_ID_10", "TEST_PARENT_11", 11L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 11L)
        )
      }
      val inputDataFrame = sqlContext.createDataFrame(inputData, inputSchema)
      val inputDataIterator = inputDataFrame.rdd.toLocalIterator
      val inputSparkEsBulkWriter = new SparkEsBulkWriter[Row](
        esIndex = "test_index",
        esType = "test_type",
        esClient = () => esServer.client,
        sparkEsSerializer = new SparkEsDataFrameSerializer(inputSchema),
        sparkEsMapper = new SparkEsDataFrameMapper(inputMapperConf),
        sparkEsWriteConf = inputSparkEsWriteConf
      )

      inputSparkEsBulkWriter.write(null, inputDataIterator)

      val outputGetResponse = esServer.client.prepareGet("test_index", "test_type", "TEST_ID_1").get()

      outputGetResponse.isExists mustEqual true
      outputGetResponse.getSource.get("parent").asInstanceOf[String] mustEqual "TEST_PARENT_5"
      outputGetResponse.getSource.get("version").asInstanceOf[Integer] mustEqual 5
      outputGetResponse.getSource.get("routing").asInstanceOf[String] mustEqual "TEST_ROUTING_1"
      outputGetResponse.getSource.get("ttl").asInstanceOf[Integer] mustEqual 86400000
      outputGetResponse.getSource.get("timestamp").asInstanceOf[String] mustEqual "TEST_TIMESTAMP_1"
      outputGetResponse.getSource.get("value").asInstanceOf[Integer] mustEqual 5
    }
  }
}
Example 11
Source File: LongReadsTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.coverage import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{SequilaSession, SparkSession} import org.biodatageeks.sequila.utils.{Columns, InternalParams, SequilaRegister} import org.scalatest.{BeforeAndAfter, FunSuite} class LongReadsTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { val bamPath: String = getClass.getResource("/nanopore_guppy_slice.bam").getPath val splitSize = 30000 val tableNameBAM = "reads" before { System.setSecurityManager(null) spark.sql(s"DROP TABLE IF EXISTS $tableNameBAM") spark.sql(s""" |CREATE TABLE $tableNameBAM |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) } test("BAM - Nanopore with guppy basecaller") { val session: SparkSession = SequilaSession(spark) SequilaRegister.register(session) session.sparkContext .setLogLevel("WARN") val bdg = session.sql(s"SELECT * FROM ${tableNameBAM}") assert(bdg.count() === 150) } test("BAM - coverage - Nanopore with guppy basecaller") { spark.sqlContext.setConf(InternalParams.InputSplitSize, (splitSize * 10).toString) val session2: SparkSession = SequilaSession(spark) SequilaRegister.register(session2) val query = s"""SELECT ${Columns.CONTIG}, ${Columns.START}, ${Columns.COVERAGE} FROM bdg_coverage('$tableNameBAM','nanopore_guppy_slice','bases') order by ${Columns.CONTIG},${Columns.START},${Columns.END} """.stripMargin val covMultiPartitionDF = session2.sql(query) //covMultiPartitionDF.coalesce(1).write.mode("overwrite").option("delimiter", "\t").csv("/Users/aga/workplace/multiPart.csv") assert(covMultiPartitionDF.count() == 45620) // total count check 45620<---> 45842 assert(covMultiPartitionDF.filter(s"${Columns.COVERAGE}== 0").count == 0) assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5010515") .first() .getShort(2) == 1) // value check [first element] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5022667") .first() .getShort(2) == 15) // value check [partition boundary] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5036398") .first() .getShort(2) == 14) // value check [partition boundary] assert( covMultiPartitionDF .where(s"${Columns.CONTIG}='21' and ${Columns.START} == 5056356") .first() .getShort(2) == 1) // value check [last element] } }
Example 12
Source File: SequilaDatasourceStrategyTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.optimizations

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.sequila.tests.base.BAMBaseTestSuite
import org.biodatageeks.sequila.utils.{Columns, SequilaRegister}

class SequilaDatasourceStrategyTestSuite extends BAMBaseTestSuite with SharedSparkContext {

  test("Test query with distinct sample optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    assert(
      ss.sql(s"SELECT distinct ${Columns.SAMPLE} FROM $tableNameBAM LIMIT 10")
        .count() === 1)
    assert(
      ss.sql(s"SELECT distinct ${Columns.SAMPLE} FROM $tableNameBAM LIMIT 10")
        .first()
        .getString(0) === "NA12878")
  }

  test("TEST query all columns with LIMIT optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    ss.sparkContext.setLogLevel("INFO")
    val sqlText = s"SELECT * FROM $tableNameBAM LIMIT 10"
    ss.time {
      ss
        .sql(sqlText)
        .show
    }
  }

  test("TEST query subset columns with LIMIT optimization") {
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)
    ss.sparkContext.setLogLevel("INFO")
    val sqlText =
      s"SELECT ${Columns.QNAME},${Columns.SEQUENCE},${Columns.BASEQ} FROM $tableNameBAM LIMIT 10"
    ss.time {
      ss
        .sql(sqlText)
        .show
    }
  }
}
Example 13
Source File: TransformationTestWithSparkTestingBase.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter16.SparkTesting

import org.scalatest.Assertions._
import org.apache.spark.rdd.RDD
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class TransformationTestWithSparkTestingBase extends FunSuite with SharedSparkContext {
  def tokenize(line: RDD[String]) = {
    line.map(x => x.split(' ')).collect()
  }

  test("works, obviously!") {
    assert(1 == 1)
  }

  test("Words counting") {
    assert(sc.parallelize("Hello world My name is Reza".split("\\W")).map(_ + 1).count == 6)
  }

  test("Testing RDD transformations using a shared Spark Context") {
    val input = List("Testing", "RDD transformations", "using a shared", "Spark Context")
    val expected = Array(Array("Testing"), Array("RDD", "transformations"),
      Array("using", "a", "shared"), Array("Spark", "Context"))
    val transformed = tokenize(sc.parallelize(input))
    assert(transformed === expected)
  }
}
Example 14
Source File: LagDstrFactorySuite.scala From lagraph with Apache License 2.0
package com.ibm.lagraph.impl // TODO get rid of printlns // scalastyle:off println import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite import org.scalatest.Matchers import scala.reflect.ClassTag import scala.collection.mutable.{Map => MMap} import com.ibm.lagraph._ class LagDstrFactorySuite extends FunSuite with Matchers with SharedSparkContext { val DEBUG = false val denseGraphSizes = List(1 << 4, 1 << 5) // val sparseGraphSizes = List(1 << 16, 1 << 17, 1 << 29, 1 << 30) val sparseGraphSizes = List(1 << 16, 1 << 17, 1 << 26, 1 << 27) val nblocks = List(1 << 0, 1 << 1, 1 << 2, 1 << 3) test("test initializing spark context") { val hc: LagContext = LagContext.getLagDstrContext(sc, 1 << 3, 1) val list = nblocks val rdd = sc.parallelize(list) assert(rdd.count === list.length) } test("LagDstrContext.vIndices") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.vIndices", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val start = 2 val end = start + hc.graphSize val v = hc.vIndices(start) val vRes = hc.vToVector(v) assert(v.size == hc.graphSize) assert(vRes.size == (end - start)) (start until end.toInt).map { r => assert(vRes(r - start) == r) } } } } test("LagDstrContext.mIndices") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.mIndices", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val start = (2L, 2L) val m = hc.mIndices(start) val (mResMap, sparseValue) = hc.mToMap(m) val mRes = LagContext.vectorOfVectorFromMap(mResMap, sparseValue, m.size) val end = (start._1 + graphSize, start._2 + graphSize) assert(mRes.size == (end._1 - start._1)) mRes.zipWithIndex.map { case (vr, r) => { assert(vr.size == (end._2 - start._2)) vr.zipWithIndex.map { case (vc, c) => assert(vc == (start._1 + r, start._2 + c)) } } } } } } test("LagDstrContext.mReplicate") { for (graphSize <- denseGraphSizes) { for (nblock <- nblocks) { if (DEBUG) println("LagDstrContext.mReplicate", graphSize, nblock) val hc: LagContext = LagContext.getLagDstrContext(sc, graphSize, nblock) val singleValue: Double = 99.0 val m = hc.mReplicate(singleValue) val (mResMap, sparseValue) = hc.mToMap(m) val mRes = LagContext.vectorOfVectorFromMap(mResMap, sparseValue, m.size) mRes.zipWithIndex.map { case (vr, r) => { assert(vr.size == graphSize) vr.zipWithIndex.map { case (vc, c) => assert(vc == singleValue) } } } } } } } // scalastyle:on println
Example 15
Source File: FilmsTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.example

import com.bloomberg.sparkflow.example.FilmsPipeline.FilmMain
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class FilmsTest extends FunSuite with SharedSparkContext {

  private def testFile(fileName: String): String = {
    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
  }

  test("pipeline") {
    val filmPipe = new FilmMain
    filmPipe.filmRows.getDF(sc).show()
    filmPipe.topActors.get(sc).foreach(println)
    filmPipe.filmsWithTopActors.getDataset(sc).show()
    println(filmPipe.filmsWithTopActors.count.get(sc))
  }
}
Example 16
Source File: HashingTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.serialization

import org.scalatest._
import com.bloomberg.sparkflow._
import com.bloomberg.sparkflow.serialization.HashingSample
import com.bloomberg.sparkflow.serialization.ClassExploration._
import com.bloomberg.sparkflow.serialization.Hashing._
import com.holdenkarau.spark.testing.SharedSparkContext

class HashingTest extends FunSuite with SharedSparkContext with ShouldMatchers {

  test("functionHashing") {
    var param = 7
    val input = 5

    val another = (x: Int) => x * 2
    val nested = (x: Int) => x * 4 + param + another(x)
    val g = (x: Int) => nested(x) + param

    val initialOutput = g(input)
    val initialGHash = hashClass(g)

    assert(initialGHash != hashClass(nested))
    assert(initialGHash != hashClass(another))
    assert(initialGHash == hashClass(g))

    param = 10
    assert(initialGHash != hashClass(g))
    assert(initialOutput != g(input))
  }

  test("dcHashing") {
    val numbers = parallelize(1 to 10)
    val filtered = numbers.filter(_ < 6)
    val doubled = filtered.map(_ * 2)
    val after = doubled.map(SomeFunctions.func4)

    val allSignatures = Set(numbers.getSignature,
      filtered.getSignature,
      doubled.getSignature,
      after.getSignature)

    assert(allSignatures.size == 4)
  }

  test("caseHashing") {
    // println(s"fieldObjects: ${getFieldObjects(HashingSample.result)}")
    // println(s"result: ${HashingSample.result.getSignature}")
    assert(HashingSample.result.getSignature.length > 0)
  }
}
Example 17
Source File: SecondaryPairDCFunctionsTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow._
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.{ShouldMatchers, FunSuite}

class SecondaryPairDCFunctionsTest extends FunSuite with SharedSparkContext with ShouldMatchers {

  test("testRepartAndSort") {
    val input = parallelize(Seq(
      (("a",3), 0),
      (("b",2), 0),
      (("b",1), 0),
      (("b",3), 0),
      (("a",2), 0),
      (("a",1), 0)))

    val sortAndRepart = input.repartitionAndSecondarySortWithinPartitions(2)
    val result = sortAndRepart.mapPartitions(it => Iterator(it.toList))

    val expected = Seq(
      List(
        (("a",1), 0),
        (("a",2), 0),
        (("a",3), 0)),
      List(
        (("b",1), 0),
        (("b",2), 0),
        (("b",3), 0)))

    expected should contain theSameElementsAs result.getRDD(sc).collect()
  }
}
Example 18
Source File: DRTest.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.scalatest._

import scala.util.Random

import com.bloomberg.sparkflow._

// The class declaration and the header of the first test were truncated in this
// listing; the wrapper below (including the first test's name) is a reconstruction.
class DRTest extends FunSuite with SharedSparkContext {

  test("mapToResult with an MLlib model") {
    val randomVecs = parallelize(1 to 100).map(i => Vectors.dense(Seq.fill(10)(Random.nextDouble()).toArray))
    val corpus = randomVecs.zipWithUniqueId().map { case (k, v) => (v, k) }
    val ldaModel = corpus.mapToResult(rdd => new LDA().setK(3).run(rdd))
  }

  test("regularSpark") {
    val numbers: RDD[Int] = sc.parallelize(1 to 10)
    val doubles: RDD[Double] = numbers.map(_.toDouble)
    val sum: Double = doubles.sum()
    val normalized: RDD[Double] = doubles.map(_ / sum)
  }
}
Example 19
Source File: LuceneRDDMoreLikeThisSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import scala.collection.JavaConverters._ import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import scala.io.Source class LuceneRDDMoreLikeThisSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } "LuceneRDD.moreLikeThis" should "return relevant documents" in { val words: Seq[String] = Source.fromFile("src/test/resources/alice.txt") .getLines().map(_.toLowerCase).toSeq val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) val results = luceneRDD .moreLikeThis("_1", "alice adventures wonderland", 1, 1) .collect() results.length > 0 should equal(true) val firstDoc = results.head val x = firstDoc.getString(firstDoc.fieldIndex("_1")) x.contains("alice") && x.contains("wonderland") && x.contains("adventures") should equal(true) val lastDoc = results.last val y = lastDoc.getString(lastDoc.fieldIndex("_1")) y.contains("alice") && !y.contains("wonderland") && !y.contains("adventures") should equal(true) } }
Example 20
Source File: LucenePrimitiveTypesSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} class LucenePrimitiveTypesSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString val array = (1 to 24).map(randomString(_)) var luceneRDD: LuceneRDD[_] = _ override def afterEach() { luceneRDD.close() } "LuceneRDD" should "work with RDD[Array[String]]" in { val array = Array(Array("aaa", "aaa2"), Array("bbb", "bbb2"), Array("ccc", "ccc2"), Array("ddd"), Array("eee")) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[Set[String]]" in { val array = Array(Set("aaa", "aaa2"), Set("bbb", "bbb2"), Set("ccc", "ccc2"), Set("ddd"), Set("eee")) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[String]" in { val array = Array("aaa", "bbb", "ccc", "ddd", "eee") val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } "LuceneRDD" should "work with RDD[Int]" in { val array = (1 to 22) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Float]" in { val array: IndexedSeq[Float] = (1 to 22).map(_.toFloat) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Double]" in { val array: IndexedSeq[Double] = (1 to 22).map(_.toDouble) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.size) } "LuceneRDD" should "work with RDD[Long]" in { val array: IndexedSeq[Long] = (1 to 22).map(_.toLong) val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD" should "work with RDD[Map[String, String]]" in { val maps = List(Map( "a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you")) val rdd = sc.parallelize(maps) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (maps.size) luceneRDD.termQuery("a", "hello").isEmpty() should equal (false) luceneRDD.prefixQuery("b", "wor").isEmpty() should equal (false) luceneRDD.prefixQuery("a", "no").isEmpty() should equal (true) } "LuceneRDD" should "work with RDD[String] and ignore null values" in { val array = Array("aaa", null, "ccc", null, "eee") val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should be (array.length) } }
Example 21
Source File: BlockingLinkageSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.lucene.index.Term import org.apache.lucene.search.{Query, TermQuery} import org.apache.spark.SparkConf import org.apache.spark.sql.{Row, SparkSession} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class BlockingLinkageSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD.blockEntityLinkage" should "deduplicate elements on unique elements" in { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val peopleLeft: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val peopleRight: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val leftDF = sc.parallelize(peopleLeft).repartition(2).toDF() val rightDF = sc.parallelize(peopleRight).repartition(3).toDF() // Define a Lucene Term linker val linker: Row => Query = { row => val name = row.getString(row.fieldIndex("name")) val term = new Term("name", name) new TermQuery(term) } val linked = LuceneRDD.blockEntityLinkage(leftDF, rightDF, linker, Array("email"), Array("email")) val linkedCount, dfCount = (linked.count, leftDF.count()) linkedCount should equal(dfCount) // Check for correctness // Age is a unique index linked.collect().foreach { case (row, results) => val leftAge, rightAge = (row.getInt(row.fieldIndex("age")), results.headOption.map(x => x.getInt(x.fieldIndex("age")))) leftAge should equal(rightAge) } } }
Example 22
Source File: LuceneRDDCustomCaseClassImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class LuceneRDDCustomCaseClassImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def afterEach() { luceneRDD.close() } override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) val elem: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => Person(str, index, s"${str}@gmail.com")} "LuceneRDD(case class).count" should "handle nulls properly" in { val elemsWithNulls = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => Person(str, index, null)} val rdd = sc.parallelize(elemsWithNulls) luceneRDD = LuceneRDD(rdd) luceneRDD.count() should equal (elemsWithNulls.length) } "LuceneRDD(case class).count" should "return correct number of elements" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) luceneRDD.count() should equal (elem.length) } "LuceneRDD(case class).fields" should "return all fields" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) luceneRDD.fields().size should equal(3) luceneRDD.fields().contains("name") should equal(true) luceneRDD.fields().contains("age") should equal(true) luceneRDD.fields().contains("email") should equal(true) } "LuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(elem) luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery("name", "water") results.count() should equal(1) } }
Example 23
Source File: ShapeLuceneRDDImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd.spatial.shape.implicits import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.spatial.shape.{ShapeLuceneRDD, _} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils import org.zouzias.spark.lucenerdd._ import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader class ShapeLuceneRDDImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext with ContextLoader with LuceneRDDTestUtils { val Radius: Double = 5D override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "ShapeLuceneRDDImplicits" should "implicitly convert to point" in { val rdd = sc.parallelize(cities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(cities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert to circle" in { val circleCities: Array[(((Double, Double), Double), String)] = cities.map(convertToCircle) val rdd = sc.parallelize(circleCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(circleCities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert to rectangle" in { val rectangleCities = cities.map(convertToRectangle) val rdd = sc.parallelize(rectangleCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(rectangleCities.length) } "ShapeLuceneRDDImplicits" should "implicitly convert POINTS from WKT" in { val sparkSession = SparkSession.builder().getOrCreate() val citiesDF = sparkSession.read.parquet("data/world-cities-points.parquet") import sparkSession.implicits._ val citiesRDD = citiesDF.map(row => (row.getString(2), (row.getString(0), row.getString(1)))) val total = citiesDF.count() total > 0 should equal(true) val shapeRDD = ShapeLuceneRDD(citiesRDD) shapeRDD.count > 0 should equal(true) } "ShapeLuceneRDDImplicits" should "implicitly convert BBOX from WKT" in { val sparkSession = SparkSession.builder().getOrCreate() import sparkSession.implicits._ val countriesDF = sparkSession.read.parquet("data/countries-bbox.parquet") val citiesRDD = countriesDF.map(row => (row.getString(2), (row.getString(0), row.getString(1)))) val total = countriesDF.count() total > 0 should equal(true) val shapeRDD = ShapeLuceneRDD(citiesRDD) shapeRDD.count > 0 should equal(true) } "ShapeLuceneRDDImplicits" should "implicitly convert to polygon" in { val polygonCities = cities.map(convertToPolygon(_, Radius)) val rdd = sc.parallelize(polygonCities) val shapeRDD = ShapeLuceneRDD(rdd) shapeRDD.count should equal(polygonCities.length) } }
Example 24
Source File: LuceneRDDSearchSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils class LuceneRDDSearchSpec extends FlatSpec with Matchers with BeforeAndAfterEach with LuceneRDDTestUtils with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def Radius: Double = 0 override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val First = "_1" val array = List("fear", "death", " apologies", "romance", "tree", "fashion", "fascism") "LuceneRDD.query" should "use phrase query syntax" in { val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty") val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) luceneRDD.query("_1:aadaa").isEmpty() should equal (false) luceneRDD.query("_1:aa*").count() should equal (4) luceneRDD.query("_1:q*").count() should equal (1) } "LuceneRDD.count" should "return correct number of elements" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD.termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count() should equal (1) } "LuceneRDD.prefixQuery" should "correctly search with PrefixQueries" in { val prefices = Array("aaaabcd", "aaadcb", "aaz", "az", "qwerty") val rdd = sc.parallelize(prefices) luceneRDD = LuceneRDD(rdd) luceneRDD.prefixQuery(First, "a").count() should equal (4) luceneRDD.prefixQuery(First, "aa").count() should equal(3) luceneRDD.prefixQuery(First, "aaa").count() should equal (2) luceneRDD.prefixQuery(First, "aaaa").count() should equal (1) } "LuceneRDD.fuzzyQuery" should "correctly search with FuzzyQuery" in { val rdd = sc.parallelize(array) luceneRDD = LuceneRDD(rdd) luceneRDD.fuzzyQuery(First, "fear", 1).count() should equal (1) luceneRDD.fuzzyQuery(First, "fascsm", 1).count() should equal(1) luceneRDD.fuzzyQuery(First, "dath", 1).count() should equal (1) luceneRDD.fuzzyQuery(First, "tree", 1).count() should equal (1) } "LuceneRDD.phraseQuery" should "correctly search with PhraseQuery" in { val phrases = Array("hello world", "the company name was", "highlight lucene") val rdd = sc.parallelize(phrases) luceneRDD = LuceneRDD(rdd) luceneRDD.phraseQuery(First, "company name", 10).count() should equal (1) luceneRDD.phraseQuery(First, "hello world", 10).count() should equal (1) luceneRDD.phraseQuery(First, "highlight lucene", 10).count() should equal(1) } }
Example 25
Source File: BlockingDedupSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.lucene.index.Term import org.apache.lucene.search.{Query, TermQuery} import org.apache.spark.SparkConf import org.apache.spark.sql.{Row, SparkSession} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.Person class BlockingDedupSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD.blockDedup" should "deduplicate elements on unique elements" in { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val people: Array[Person] = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map { case (str, index) => val email = if (index % 2 == 0) "[email protected]" else "[email protected]" Person(str, index, email) } val df = sc.parallelize(people).repartition(2).toDF() val linker: Row => Query = { row => val name = row.getString(row.fieldIndex("name")) val term = new Term("name", name) new TermQuery(term) } val linked = LuceneRDD.blockDedup(df, linker, Array("email")) val linkedCount, dfCount = (linked.count, df.count()) linkedCount should equal(dfCount) // Check for correctness // Age is a unique index linked.collect().foreach { case (row, results) => val leftAge, rightAge = (row.getInt(row.fieldIndex("age")), results.headOption.map(x => x.getInt(x.fieldIndex("age")))) leftAge should equal(rightAge) } } }
Example 26
Source File: LuceneRDDTermVectorsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils class LuceneRDDTermVectorsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with LuceneRDDTestUtils with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override def Radius: Double = 0 override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val First = "_1" "LuceneRDD.termVectors" should "return valid terms" in { val words = Array("To smile or not to smile smile", "Don't cry because it's over, smile because it happened", "So many books, so little time", "A room without books is like a body without a soul", "If you tell the truth, you don't have to remember anything") val rdd = sc.parallelize(words) luceneRDD = LuceneRDD(rdd) val terms = luceneRDD.termVectors(First).collect() // These terms should exist terms.exists(_.term.compareToIgnoreCase("time") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("room") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("soul") == 0) should equal(true) terms.exists(_.term.compareToIgnoreCase("smile") == 0) should equal(true) terms.exists(t => (t.term.compareToIgnoreCase("smile") == 0) && t.count == 3) should equal (true) terms.exists(t => (t.term.compareToIgnoreCase("becaus") == 0) && t.count == 2) should equal (true) } }
Example 27
Source File: LuceneRDDTuplesSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.scalatest.{FlatSpec, Matchers} class LuceneRDDTuplesSpec extends FlatSpec with Matchers with SharedSparkContext { val First = "_1" val Second = "_2" val array = List("fear", "death", " apology", "romance", "tree", "fashion", "fascism") override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) "LuceneRDD" should "work with Tuple2" in { val rdd = sc.parallelize(array).map(x => (x, x)) val luceneRDD = LuceneRDD(rdd) luceneRDD.count should equal (array.size) } "LuceneRDD" should "work with Tuple3" in { val rdd = sc.parallelize(array).map(x => (x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple4" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple5" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple6" in { val rdd = sc.parallelize(array).map(x => (x, x, x, x, x, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(Second, array(1)) results.count should equal (1) } "LuceneRDD" should "work with Tuple7" in { val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array.head) results.count should equal (1) } "LuceneRDD" should "work with Tuple8" in { val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x, 3.4)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count should equal (1) } "LuceneRDD" should "work with mixed types in Tuples" in { val rdd = sc.parallelize(array).map(x => (x, 1, x, 2L, x, 3.0F)) val luceneRDD = LuceneRDD(rdd) val results = luceneRDD.termQuery(First, array(1)) results.count should equal (1) } }
Example 28
Source File: FacetedLuceneRDDImplicitsSpec.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd.facets import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator} class FacetedLuceneRDDImplicitsSpec extends FlatSpec with Matchers with BeforeAndAfterEach with SharedSparkContext { var luceneRDD: LuceneRDD[_] = _ override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID)) override def afterEach() { luceneRDD.close() } val elem = Array("fear", "death", "water", "fire", "house") .zipWithIndex.map{ case (str, index) => FavoriteCaseClass(str, index, 10L, 12.3F, s"${str}@gmail.com")} "FacetedLuceneRDD(case class).count" should "return correct number of elements" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) luceneRDD.count should equal (elem.size) } "FacetedLuceneRDD(case class).fields" should "return all fields" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) luceneRDD.fields().size should equal(5) luceneRDD.fields().contains("name") should equal(true) luceneRDD.fields().contains("age") should equal(true) luceneRDD.fields().contains("myLong") should equal(true) luceneRDD.fields().contains("myFloat") should equal(true) luceneRDD.fields().contains("email") should equal(true) } "FacetedLuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { val rdd = sc.parallelize(elem) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val df = rdd.toDF() luceneRDD = FacetedLuceneRDD(df) val results = luceneRDD.termQuery("name", "water") results.count() should equal(1) } }
Example 29
Source File: SparkCassBulkWriterSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra import com.datastax.driver.core.querybuilder.QueryBuilder import com.datastax.spark.connector.AllColumns import com.datastax.spark.connector.writer.{ RowWriterFactory, SqlRowWriter } import com.github.jparkie.spark.cassandra.client.SparkCassSSTableLoaderClientManager import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf } import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.sql.{ Row, SQLContext } import org.scalatest.{ MustMatchers, WordSpec } import scala.collection.JavaConverters._ class SparkCassBulkWriterSpec extends WordSpec with MustMatchers with CassandraServerSpecLike with SharedSparkContext { val testKeyspace = "test_keyspace" val testTable = "test_table" override def beforeAll(): Unit = { super.beforeAll() getCassandraConnector.withSessionDo { currentSession => createKeyspace(currentSession, testKeyspace) currentSession.execute( s"""CREATE TABLE $testKeyspace.$testTable ( | test_key BIGINT PRIMARY KEY, | test_value VARCHAR |); """.stripMargin ) } } "SparkCassBulkWriter" must { "write() successfully" in { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ implicit val testRowWriterFactory: RowWriterFactory[Row] = SqlRowWriter.Factory val testCassandraConnector = getCassandraConnector val testSparkCassWriteConf = SparkCassWriteConf() val testSparkCassServerConf = SparkCassServerConf( // See https://github.com/jsevellec/cassandra-unit/blob/master/cassandra-unit/src/main/resources/cu-cassandra.yaml storagePort = 7010 ) val testSparkCassBulkWriter = SparkCassBulkWriter( testCassandraConnector, testKeyspace, testTable, AllColumns, testSparkCassWriteConf, testSparkCassServerConf ) val testRDD = sc.parallelize(1 to 25) .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!")) val testDataFrame = testRDD.toDF("test_key", "test_value") sc.runJob(testDataFrame.rdd, testSparkCassBulkWriter.write _) getCassandraConnector.withSessionDo { currentSession => val queryStatement = QueryBuilder.select("test_key", "test_value") .from(testKeyspace, testTable) .limit(25) val resultSet = currentSession.execute(queryStatement) val outputSet = resultSet.all.asScala .map(currentRow => (currentRow.getLong("test_key"), currentRow.getString("test_value"))) .toMap for (currentNumber <- 1 to 25) { val currentKey = currentNumber.toLong outputSet(currentKey) mustEqual s"Hello World: $currentNumber!" } } SparkCassSSTableLoaderClientManager.evictAll() } } }
Example 30
Source File: SparkCassDataFrameFunctionsSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.sql

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.scalatest.{ MustMatchers, WordSpec }

class SparkCassDataFrameFunctionsSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.cassandra.sql" must {
    "lift DataFrame into SparkCassDataFrameFunctions" in {
      val sqlContext = new SQLContext(sc)

      import sqlContext.implicits._

      val testRDD = sc.parallelize(1 to 25)
        .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!"))
      val testDataFrame = testRDD.toDF("test_key", "test_value")

      // If internalSparkContext is available, RDD was lifted.
      testDataFrame.internalSparkContext
    }
  }
}
Example 31
Source File: SparkCassRDDFunctionsSpec.scala From Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.rdd

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.{ MustMatchers, WordSpec }

class SparkCassRDDFunctionsSpec extends WordSpec with MustMatchers with SharedSparkContext {
  "Package com.github.jparkie.spark.cassandra.rdd" must {
    "lift RDD into SparkCassRDDFunctions" in {
      val testRDD = sc.parallelize(1 to 25)
        .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!"))

      // If internalSparkContext is available, RDD was lifted.
      testRDD.internalSparkContext
    }
  }
}
Example 32
Source File: PointRDDExtensionsSpec.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.extensions

import com.holdenkarau.spark.testing.SharedSparkContext
import com.pygmalios.reactiveinflux.Point.Measurement
import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.extensions.PointRDDExtensionsSpec._
import com.pygmalios.reactiveinflux.spark._
import com.pygmalios.reactiveinflux.spark.extensions.PointRDDExtensions
import org.joda.time.{DateTime, DateTimeZone}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.concurrent.duration._

@RunWith(classOf[JUnitRunner])
class PointRDDExtensionsSpec extends FlatSpec with SharedSparkContext with BeforeAndAfter {

  before {
    withInflux(_.create())
  }

  after {
    withInflux(_.drop())
  }

  behavior of "saveToInflux"

  it should "write single point to Influx" in {
    val points = List(point1)
    val rdd = sc.parallelize(points)

    // Execute
    rdd.saveToInflux()

    // Assert
    assert(PointRDDExtensions.totalBatchCount == 1)
    assert(PointRDDExtensions.totalPointCount == 1)

    val result = withInflux(
      _.query(Query(s"SELECT * FROM $measurement1"))
        .result
        .singleSeries)

    assert(result.rows.size == 1)

    val row = result.rows.head
    assert(row.time == point1.time)
    assert(row.values.size == 5)
  }

  it should "write 1000 points to Influx" in {
    val points = (1 to 1000).map { i =>
      Point(
        time = point1.time.plusMinutes(i),
        measurement = point1.measurement,
        tags = point1.tags,
        fields = point1.fields
      )
    }
    val rdd = sc.parallelize(points)

    // Execute
    rdd.saveToInflux()

    // Assert
    assert(PointRDDExtensions.totalBatchCount == 8)
    assert(PointRDDExtensions.totalPointCount == 1000)

    val result = withInflux(
      _.query(Query(s"SELECT * FROM $measurement1"))
        .result
        .singleSeries)

    assert(result.rows.size == 1000)
  }
}

object PointRDDExtensionsSpec {
  implicit val params: ReactiveInfluxDbName = ReactiveInfluxDbName("test")
  implicit val awaitAtMost: Duration = 1.second

  val measurement1: Measurement = "measurement1"
  val point1 = Point(
    time = new DateTime(1983, 1, 10, 7, 43, 10, 3, DateTimeZone.UTC),
    measurement = measurement1,
    tags = Map("tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"),
    fields = Map(
      "fieldKey1" -> StringFieldValue("fieldValue1"),
      "fieldKey2" -> BigDecimalFieldValue(10.7)))
}
Example 33
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class KNNPropSpec extends PropSpec with SharedSparkContext {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(vs =>
        VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10))
}
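KNNPropSpec only defines Arbitrary instances and generators; a concrete spec would combine them with ScalaTest's generator-driven property checks. The class below is a hypothetical usage sketch, not part of spark-tda, and assumes a ScalaTest version that still ships org.scalatest.prop.GeneratorDrivenPropertyChecks.

import org.apache.spark.ml.linalg.DenseVector
import org.scalatest.prop.GeneratorDrivenPropertyChecks

// The implicit Arbitrary[DenseVector] inherited from KNNPropSpec supplies the
// generated input, and the shared sc from SharedSparkContext runs the Spark job.
class DenseVectorPropSpec extends KNNPropSpec with GeneratorDrivenPropertyChecks {
  property("parallelizing generated vectors preserves their count") {
    forAll { (vectors: List[DenseVector]) =>
      assert(sc.parallelize(vectors).count() == vectors.length)
    }
  }
}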
Example 34
Source File: DistributedPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed

import scala.reflect.ClassTag
import org.scalacheck.Gen
import org.scalacheck.Gen.{choose, listOfN}
import org.scalatest.PropSpec
import org.apache.spark.mllib.linalg.DenseVector
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class DistributedPropSpec extends PropSpec with SharedSparkContext {
  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val coordinateMatrixGen = for {
    lrow <- choose(5, 10)
    lcol <- choose(5, 10)
    lvecs <- vectorsOfNM(lrow, lcol, choose(-10.0, 10.0))
    rrow <- choose(5, 10)
    rcol <- choose(5, 10)
    rvecs <- vectorsOfNM(rrow, rcol, choose(-10.0, 10.0))
  } yield (
    new IndexedRowMatrix(sc.parallelize(lvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix,
    new IndexedRowMatrix(sc.parallelize(rvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix
  )
}
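Unlike the KNN base class, DistributedPropSpec exposes an explicit Gen rather than an Arbitrary, so a subclass would pass it to forAll directly. The class below is a hypothetical sketch, not part of spark-tda, and again assumes GeneratorDrivenPropertyChecks is available; the property itself only checks loose bounds implied by the generator.

import org.scalatest.prop.GeneratorDrivenPropertyChecks

// Pass coordinateMatrixGen to forAll explicitly instead of relying on an implicit Arbitrary.
class CoordinateMatrixPropSpec extends DistributedPropSpec with GeneratorDrivenPropertyChecks {
  property("generated coordinate matrix pairs are non-empty and within the sampled bounds") {
    forAll(coordinateMatrixGen) { case (left, right) =>
      assert(left.entries.count() > 0 && right.entries.count() > 0)
      assert(left.numRows() <= 10 && right.numRows() <= 10)
    }
  }
}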
Example 35
Source File: HDFSClusterTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing

import java.io.{
  BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

class HDFSClusterTest extends FunSuite with SharedSparkContext with RDDComparisons {

  var hdfsCluster: HDFSCluster = null

  override def beforeAll(): Unit = {
    super.beforeAll()
    hdfsCluster = new HDFSCluster
    hdfsCluster.startHDFS()
  }

  test("get the namenode uri") {
    val nameNodeURI = hdfsCluster.getNameNodeURI()
    assert(nameNodeURI == "hdfs://localhost:8020")
  }

  test("read and write from spark to hdfs") {
    val list = List(1, 2, 3, 4, 5)
    val numRDD: RDD[Int] = sc.parallelize(list)

    val path = hdfsCluster.getNameNodeURI() + "/myRDD"
    numRDD.saveAsTextFile(path)

    val loadedRDD: RDD[Int] = sc.textFile(path).map(_.toInt)
    assertRDDEquals(numRDD, loadedRDD)
  }

  test("test creating local file to hdfs") {
    val path = new Path(hdfsCluster.getNameNodeURI() + "/myfile")

    val fs = FileSystem.get(path.toUri, new Configuration())

    val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path)))
    val writtenString = "hello, it's me"
    writer.write(writtenString)
    writer.close()

    val reader = new BufferedReader(new InputStreamReader(fs.open(path)))
    val readString = reader.readLine()
    reader.close()

    assert(writtenString == readString)
  }

  override def afterAll() {
    hdfsCluster.shutdownHDFS()
    super.afterAll()
  }
}
Example 36
Source File: WordCountTest.scala From sparkProjectTemplate.g8 with Apache License 2.0 | 5 votes |
package $organization$.$name$

import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite

class WordCountTest extends FunSuite with SharedSparkContext {
  test("word count with Stop Words Removed") {
    val linesRDD = sc.parallelize(Seq(
      "How happy was the panda? You ask.",
      "Panda is the most happy panda in all the#!?ing land!"))

    val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he")
    val splitTokens: Array[Char] = "#%?!. ".toCharArray

    val wordCounts = WordCount.withStopWordsFiltered(
      linesRDD, splitTokens, stopWords)
    val wordCountsAsMap = wordCounts.collectAsMap()

    assert(!wordCountsAsMap.contains("the"))
    assert(!wordCountsAsMap.contains("?"))
    assert(!wordCountsAsMap.contains("#!?ing"))
    assert(wordCountsAsMap.contains("ing"))
    assert(wordCountsAsMap.get("panda").get.equals(3))
  }
}
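The test exercises WordCount.withStopWordsFiltered, whose implementation lives in the sparkProjectTemplate.g8 sources and is not shown here. The object below is a hypothetical sketch that is merely consistent with the assertions above (split on the supplied separator characters, lowercase, drop empty tokens and stop words); the real template code may differ.

import org.apache.spark.rdd.RDD

object WordCount {
  // Hypothetical sketch only: tokenize each line on the given separators, normalize
  // case, filter out empty tokens and stop words, then count occurrences per word.
  def withStopWordsFiltered(
      lines: RDD[String],
      separators: Array[Char],
      stopWords: Set[String]): RDD[(String, Int)] = {
    lines
      .flatMap(line => line.split(separators).map(_.toLowerCase))
      .filter(token => token.nonEmpty && !stopWords.contains(token))
      .map(token => (token, 1))
      .reduceByKey(_ + _)
  }
}

With this sketch, "panda?" and "the#!?ing" are broken apart by the separator characters, so the assertions on "panda" (count 3), "ing", and the absence of "the", "?", and "#!?ing" all hold.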