scala.collection.immutable.HashSet Scala Examples
The following examples show how to use scala.collection.immutable.HashSet.
Each example links to the original project and source file.
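Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic immutable-HashSet operations the examples rely on: adding or removing an element always returns a new set and never mutates the original.

import scala.collection.immutable.HashSet

object HashSetBasics extends App {
  val base = HashSet("a", "b", "c")   // build from elements
  val added = base + "d"              // returns a new set; `base` is unchanged
  val removed = added - "a"           // likewise returns a new set
  val merged = base ++ Seq("x", "y")  // bulk add from any collection

  println(base.contains("a"))         // true (hash-based, effectively constant-time lookup)
  println(added.size)                 // 4
  println(removed("a"))               // false (apply is an alias for contains)
  println(merged.size)                // 5
}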
Example 1
Source File: CollectionConvertersSuite.scala From pureconfig with Mozilla Public License 2.0
package pureconfig

import scala.collection.JavaConverters._
import scala.collection.immutable.{ HashSet, ListSet, Queue, TreeSet }

import com.typesafe.config.{ ConfigFactory, ConfigValueFactory, ConfigValueType }

import pureconfig.error.{ ConfigReaderFailures, ConvertFailure, WrongType }

class CollectionConvertersSuite extends BaseSuite {
  implicit override val generatorDrivenConfig = PropertyCheckConfiguration(minSuccessful = 100)

  behavior of "ConfigConvert"

  checkArbitrary[HashSet[String]]

  checkArbitrary[List[Float]]

  checkRead[List[Int]](
    // order of keys maintained
    ConfigValueFactory.fromMap(Map("2" -> 1, "0" -> 2, "1" -> 3).asJava) -> List(2, 3, 1),
    ConfigValueFactory.fromMap(Map("3" -> 2, "1" -> 4).asJava) -> List(4, 2),
    ConfigValueFactory.fromMap(Map("1" -> 1, "a" -> 2).asJava) -> List(1))

  checkFailures[List[Int]](
    ConfigValueFactory.fromMap(Map("b" -> 1, "a" -> 2).asJava) -> ConfigReaderFailures(
      ConvertFailure(WrongType(ConfigValueType.OBJECT, Set(ConfigValueType.LIST)), emptyConfigOrigin, "")),
    ConfigValueFactory.fromMap(Map().asJava) -> ConfigReaderFailures(
      ConvertFailure(WrongType(ConfigValueType.OBJECT, Set(ConfigValueType.LIST)), emptyConfigOrigin, "")))

  checkArbitrary[ListSet[Int]]

  checkArbitrary[Map[String, Int]]

  checkFailures[Map[String, Int]](
    // nested map should fail
    ConfigFactory.parseString("conf.a=1").root() -> ConfigReaderFailures(
      ConvertFailure(WrongType(ConfigValueType.OBJECT, Set(ConfigValueType.NUMBER)), stringConfigOrigin(1), "conf")),
    // wrong value type should fail
    ConfigFactory.parseString("{ a=b }").root() -> ConfigReaderFailures(
      ConvertFailure(WrongType(ConfigValueType.STRING, Set(ConfigValueType.NUMBER)), stringConfigOrigin(1), "a")))

  checkArbitrary[Queue[Boolean]]

  checkArbitrary[Set[Double]]

  checkRead[Set[Int]](
    ConfigValueFactory.fromMap(Map("1" -> 4, "2" -> 5, "3" -> 6).asJava) -> Set(4, 5, 6))

  checkArbitrary[Stream[String]]

  checkArbitrary[TreeSet[Int]]

  checkArbitrary[Vector[Short]]

  checkArbitrary[Option[Int]]

  checkArbitrary[Array[Int]]
}
Example 2
Source File: TestUtils.scala From shc with Apache License 2.0
package org.apache.spark.sql

import java.io.{IOException, File}
import java.nio.ByteBuffer
import java.util

import scala.collection.immutable.HashSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import com.google.common.io.Files
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.SQLContext

object TestUtils {

  def generateRandomByteBuffer(rand: Random, size: Int): ByteBuffer = {
    val bb = ByteBuffer.allocate(size)
    val arrayOfBytes = new Array[Byte](size)
    rand.nextBytes(arrayOfBytes)
    bb.put(arrayOfBytes)
  }

  def generateRandomMap(rand: Random, size: Int): java.util.Map[String, Int] = {
    val jMap = new util.HashMap[String, Int]()
    for (i <- 0 until size) {
      jMap.put(rand.nextString(5), i)
    }
    jMap
  }

  def generateRandomArray(rand: Random, size: Int): util.ArrayList[Boolean] = {
    val vec = new util.ArrayList[Boolean]()
    for (i <- 0 until size) {
      vec.add(rand.nextBoolean())
    }
    vec
  }
}
Example 3
Source File: ColumnarTestUtils.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericMutableRow} import org.apache.spark.sql.catalyst.util.{GenericArrayData, ArrayBasedMapData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
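The HashSet-specific part of this utility (repeated in the other ColumnarTestUtils variants below) is makeUniqueRandomValues, which grows an immutable HashSet with Iterator.iterate until it holds count distinct values. A stripped-down sketch of the same pattern with plain Ints (names here are illustrative, not from the project):

import scala.collection.immutable.HashSet
import scala.util.Random

object UniqueRandomValues extends App {
  // Produce `count` distinct random values; the value space must be larger
  // than `count`, otherwise the inner iterator never terminates.
  def makeUniqueRandomInts(count: Int): Seq[Int] =
    Iterator.iterate(HashSet.empty[Int]) { set =>
      // keep drawing until we hit a value not already in the set
      set + Iterator.continually(Random.nextInt(1000)).filterNot(set.contains).next()
    }.drop(count).next().toSeq   // the count-th set holds exactly `count` elements

  val xs = makeUniqueRandomInts(10)
  println(xs.distinct.size == 10)  // true: all values are unique
}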
Example 4
Source File: IpRangeTest.scala From ipaddr with Apache License 2.0
package com.risksense.ipaddr import scala.collection.immutable.HashSet // scalastyle:off multiple.string.literals magic.number class IpRangeTest extends UnitSpec { private val addr1 = "192.168.1.200" private val addr2 = "192.168.1.230" private val range = IpRange(addr1, addr2) private val range2 = IpRange("192.168.1.210", "192.168.1.220") private val range3 = IpRange("192.168.1.100", "192.168.1.210") private val range4 = IpRange("192.168.1.220", "192.168.1.240") "Creating an IpRange" should "result in failure if addresses are invalid" in { // first address invalid an[IpaddrException] should be thrownBy IpRange("1.2.300.20", "1.2.3.2") // second address invalid an[IpaddrException] should be thrownBy IpRange("192.168.1.200", "192.168.1.256") // first address > second address an[IpaddrException] should be thrownBy IpRange("192.168.1.230", "192.168.1.229") } it should "succeed if addresses are valid" in { IpRange("10.2.10.12", "10.2.10.15") shouldBe a[IpRange] IpRange("10.2.10.230", "10.2.10.230") shouldBe a[IpRange] } "An IpRange object" should "perform all range operations" in { range.toString() should be(addr1 + "-" + addr2) range.first should be(3232235976L) range.last should be(3232236006L) range.key should be((4, 3232235976L, 3232236006L)) range.sortKey should be((4, 3232235976L, 27)) } it should "perform contains operation" in { // Check range edge addresses range.contains(addr1) should be(true) range.contains(addr2) should be(true) range.contains(range2) should be(true) range.contains(range3) should be(false) range.contains(range4) should be(false) val net = IpNetwork("10.4.10.100/30") val rightRange = "10.4.10.105" val r1 = IpRange("10.4.10.101", "10.4.10.102") val r2 = IpRange("10.4.10.99", rightRange) val r3 = IpRange("10.4.10.100", rightRange) val r4 = IpRange("10.4.10.101", rightRange) r1.contains(net) should be(false) r2.contains(net) should be(true) r3.contains(net) should be(true) r4.contains(net) should be(false) an[IpaddrException] should be thrownBy r4.contains("1.2.3") // address is bad } it should "perform cidrs operation" in { val net1 = IpNetwork("192.168.1.200/29") val net2 = IpNetwork("192.168.1.208/28") val net3 = IpNetwork("192.168.1.224/30") val net4 = IpNetwork("192.168.1.228/31") val net5 = IpNetwork("192.168.1.230/32") val netList = List(net1, net2, net3, net4, net5) range.cidrs should be(netList) } it should "check for equality" in { val hs = HashSet(range2, range, range3) range should be(IpRange(addr1, addr2)) range == range2 should be(false) range.equals(range) should be(true) range.equals(addr1) should be(false) hs.contains(range) should be(true) hs.contains(range4) should be(false) } }
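The last test above works because HashSet.contains looks elements up via hashCode and equals, so IpRange's equality definition decides membership. A small standalone sketch of the same idea with an ordinary case class (which gets structural equals and hashCode for free):

import scala.collection.immutable.HashSet

object StructuralLookup extends App {
  case class IpSpan(first: Long, last: Long)

  val spans = HashSet(IpSpan(1, 10), IpSpan(20, 30))

  // A freshly constructed but structurally equal value is found...
  println(spans.contains(IpSpan(1, 10)))   // true
  // ...while an unequal one is not.
  println(spans.contains(IpSpan(1, 11)))   // false
}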
Example 5
Source File: ColumnarTestUtils.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 6
Source File: TwoSum.scala From coding-interview-questions-scala with Apache License 2.0
package org.questions.arrays

import scala.annotation.tailrec
import scala.collection.immutable.HashSet

class TwoSum {
  def findPairSum(seq: Seq[Int], sum: Int): Option[(Int, Int)] = {
    @tailrec
    def inner(seq: Seq[Int], hash: HashSet[Int]): Option[(Int, Int)] = seq match {
      case Nil => None
      case head :: tail =>
        val lookingFor = sum - head
        if (hash.contains(lookingFor))
          Some(head -> lookingFor)
        else
          inner(tail, hash + head)
    }

    inner(seq, HashSet.empty[Int])
  }
}
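A quick usage sketch, assuming the TwoSum class above is on the classpath; note that the returned pair is (later element, its complement), because the match succeeds when the complement is already in the HashSet:

import org.questions.arrays.TwoSum

object TwoSumDemo extends App {
  val solver = new TwoSum

  // 2 is added to the set first; when 7 is reached, 9 - 7 = 2 is found in the set.
  println(solver.findPairSum(List(2, 7, 11, 15), 9))   // Some((7,2))

  // No two elements add up to 100.
  println(solver.findPairSum(List(1, 2, 3), 100))      // None
}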
Example 7
Source File: ColumnarTestUtils.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.types.{DataType, Decimal, AtomicType}
import org.apache.spark.unsafe.types.UTF8String

// Column test utilities
object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericMutableRow = {
    val row = new GenericMutableRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  // Generate a random value for the given column type
  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case DATE => Random.nextInt()
      case LONG => Random.nextLong()
      case TIMESTAMP => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case _ =>
        // Using a random one-element map instead of an arbitrary object
        Map(Random.nextInt() -> Random.nextString(Random.nextInt(32)))
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  // Generate `count` distinct random values
  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {
    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericMutableRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  // Generate unique values and corresponding single-value rows
  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = {

    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericMutableRow(1)
      row(0) = value
      row
    }

    (values, rows)
  }
}
Example 8
Source File: OptimizeInSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import scala.collection.immutable.HashSet

import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.types._

// For implicit conversions
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.dsl.expressions._

class OptimizeInSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("AnalysisNodes", Once,
        EliminateSubQueries) ::
      Batch("ConstantFolding", Once,
        ConstantFolding,
        BooleanSimplification,
        OptimizeIn) :: Nil
  }

  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)

  test("OptimizedIn test: In clause not optimized to InSet when less than 10 items") {
    val originalQuery =
      testRelation
        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2))))
        .analyze

    val optimized = Optimize.execute(originalQuery.analyze)
    comparePlans(optimized, originalQuery)
  }

  test("OptimizedIn test: In clause optimized to InSet when more than 10 items") {
    val originalQuery =
      testRelation
        .where(In(UnresolvedAttribute("a"), (1 to 11).map(Literal(_))))
        .analyze

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer =
      testRelation
        .where(InSet(UnresolvedAttribute("a"), (1 to 11).toSet))
        .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("OptimizedIn test: In clause not optimized in case filter has attributes") {
    val originalQuery =
      testRelation
        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b"))))
        .analyze

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer =
      testRelation
        .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b"))))
        .analyze

    comparePlans(optimized, correctAnswer)
  }
}
Example 9
Source File: MutableSet.scala From spark1.52 with Apache License 2.0
package scalaDemo

object MutableSet {
  def main(args: Array[String]): Unit = {
    // Mutable set
    import scala.collection.mutable.Set
    val movieSet = Set("Hitch", "Poltergeist")
    movieSet += "Shrek"
    println(movieSet)

    // Immutable set
    import scala.collection.immutable.HashSet
    val hashSet = HashSet("Tomatoes", "Chilies")
    println(hashSet + "Coriander")
  }
}
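The example mixes the two set flavours; the sketch below makes the difference explicit. With mutable.Set, += changes the set in place, while on an immutable HashSet + returns a new set and the original is untouched (the printed expression in the example above likewise leaves hashSet unchanged).

import scala.collection.immutable.HashSet
import scala.collection.mutable

object SetFlavours extends App {
  val m = mutable.Set("Hitch", "Poltergeist")
  m += "Shrek"                       // mutates `m` in place
  println(m.size)                    // 3

  val h = HashSet("Tomatoes", "Chilies")
  val h2 = h + "Coriander"           // builds a new set
  println(h.size)                    // 2 -- the original is unchanged
  println(h2.size)                   // 3
}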
Example 10
Source File: ColumnarTestUtils.scala From iolap with Apache License 2.0
package org.apache.spark.sql.columnar import java.sql.Timestamp import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types.{UTF8String, DataType, Decimal, AtomicType} object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[T <: DataType, JvmType](columnType: ColumnType[T, JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case STRING => UTF8String(Random.nextString(Random.nextInt(32))) case BOOLEAN => Random.nextBoolean() case BINARY => randomBytes(Random.nextInt(32)) case DATE => Random.nextInt() case TIMESTAMP => val timestamp = new Timestamp(Random.nextLong()) timestamp.setNanos(Random.nextInt(999999999)) timestamp case _ => // Using a random one-element map instead of an arbitrary object Map(Random.nextInt() -> Random.nextString(Random.nextInt(32))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_ <: DataType, _], tail: ColumnType[_ <: DataType, _]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[T <: DataType, JvmType]( columnType: ColumnType[T, JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_ <: DataType, _], tail: ColumnType[_ <: DataType, _]*): Row = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Row = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
Example 11
Source File: OptimizeInSuite.scala From iolap with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer import scala.collection.immutable.HashSet import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types._ // For implicit conversions import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.dsl.expressions._ class OptimizeInSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("AnalysisNodes", Once, EliminateSubQueries) :: Batch("ConstantFolding", Once, ConstantFolding, BooleanSimplification, OptimizeIn) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) test("OptimizedIn test: In clause optimized to InSet") { val originalQuery = testRelation .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2)))) .analyze val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = testRelation .where(InSet(UnresolvedAttribute("a"), HashSet[Any]() + 1 + 2)) .analyze comparePlans(optimized, correctAnswer) } test("OptimizedIn test: In clause not optimized in case filter has attributes") { val originalQuery = testRelation .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b")))) .analyze val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = testRelation .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), UnresolvedAttribute("b")))) .analyze comparePlans(optimized, correctAnswer) } }
Example 12
Source File: ColumnarTestUtils.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 13
Source File: InsertOrdSet.scala From daml with Apache License 2.0
// Copyright (c) 2020 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package com.daml.lf.data

import scala.collection.immutable.{HashSet, Set, Queue}
import scala.collection.{SetLike, AbstractSet}
import scala.collection.generic.{
  ImmutableSetFactory,
  GenericCompanion,
  CanBuildFrom,
  GenericSetTemplate
}

final class InsertOrdSet[T] private (_items: Queue[T], _hashSet: HashSet[T])
    extends AbstractSet[T]
    with Set[T]
    with SetLike[T, InsertOrdSet[T]]
    with GenericSetTemplate[T, InsertOrdSet]
    with Serializable {
  override def empty: InsertOrdSet[T] = InsertOrdSet.empty
  override def size: Int = _hashSet.size

  def iterator: Iterator[T] =
    _items.reverseIterator

  override def contains(elem: T): Boolean =
    _hashSet.contains(elem)

  override def +(elem: T): InsertOrdSet[T] =
    if (_hashSet.contains(elem))
      this
    else
      new InsertOrdSet(
        elem +: _items,
        _hashSet + elem
      )

  override def -(elem: T): InsertOrdSet[T] =
    new InsertOrdSet(
      _items.filter(elem2 => elem != elem2),
      _hashSet - elem
    )

  override def companion: GenericCompanion[InsertOrdSet] = InsertOrdSet
}

object InsertOrdSet extends ImmutableSetFactory[InsertOrdSet] {
  private val Empty = new InsertOrdSet(Queue.empty, HashSet.empty)
  override def empty[T] = Empty.asInstanceOf[InsertOrdSet[T]]
  def emptyInstance: InsertOrdSet[Any] = empty[Any]

  def fromSeq[T](s: Seq[T]): InsertOrdSet[T] =
    new InsertOrdSet(Queue(s.reverse: _*), HashSet(s: _*))

  implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, InsertOrdSet[A]] =
    setCanBuildFrom[A]
}
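The class above exists because HashSet iterates in an order derived from element hashes, not insertion order; InsertOrdSet keeps a Queue for ordering and a HashSet purely for constant-time contains checks. A small standalone sketch of the behaviour that motivates it (the output order of the HashSet line is unspecified):

import scala.collection.immutable.HashSet

object IterationOrder extends App {
  val inserted = Seq("delta", "alpha", "charlie", "bravo")

  // HashSet: membership is fast, but iteration order follows the hash layout.
  val hs = HashSet(inserted: _*)
  println(hs.toList)        // some hash-determined order, generally not the insertion order

  // Keeping a parallel ordered structure restores insertion order,
  // which is exactly what InsertOrdSet above does with its Queue.
  val ordered = inserted.foldLeft((Vector.empty[String], HashSet.empty[String])) {
    case ((order, seen), x) => if (seen(x)) (order, seen) else (order :+ x, seen + x)
  }._1
  println(ordered)          // Vector(delta, alpha, charlie, bravo)
}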
Example 14
Source File: ClusterListener.scala From akka-cluster-load-balancing with MIT License
package kamkor.actor

import scala.collection.immutable.HashSet
import scala.concurrent.duration.DurationInt

import akka.actor.{ Actor, Props }
import akka.cluster.Cluster
import akka.cluster.ClusterEvent.MemberUp
import akka.cluster.metrics.{ ClusterMetricsChanged, ClusterMetricsExtension, NodeMetrics }
import akka.cluster.metrics.StandardMetrics.HeapMemory

import kamkor.{ ConsumerApp }
import kamkor.metrics.{ ClusterHeapMetrics, MetricsLogger }

class ClusterListener(metricsIntervalSeconds: Int) extends Actor {

  import context.dispatcher
  context.system.scheduler.schedule(
    metricsIntervalSeconds.seconds, metricsIntervalSeconds.seconds, self, "logConsumersHeapUse")

  private[this] val cluster = Cluster(context.system)
  private[this] val metricsLogger =
    new MetricsLogger(name = cluster.selfAddress.port.getOrElse(0).toString())
  private[this] val clusterHeapMetrics = new ClusterHeapMetrics()

  private var consumers: Set[String] = HashSet.empty

  override def preStart(): Unit = {
    ClusterMetricsExtension(context.system).subscribe(self)
    cluster.subscribe(self, classOf[MemberUp])
  }

  override def postStop(): Unit = {
    ClusterMetricsExtension(context.system).unsubscribe(self)
    Cluster(context.system).unsubscribe(self)
  }

  def receive: Receive = {
    case MemberUp(m) if m.roles.contains(ConsumerApp.clusterRole) =>
      consumers += m.address.hostPort
    case ClusterMetricsChanged(clusterMetrics) =>
      clusterMetrics
        .filter(nm => consumers.contains(nm.address.hostPort))
        .foreach(updateHeapUse(_))
    case "logConsumersHeapUse" => {
      metricsLogger.log(clusterHeapMetrics.calculateAverages)
      clusterHeapMetrics.clear()
    }
  }

  private[this] def updateHeapUse(nodeMetrics: NodeMetrics) {
    nodeMetrics match {
      case HeapMemory(address, timestamp, used, committed, max) => {
        val usedMB = Math.round(used.doubleValue / 1024 / 1024)
        clusterHeapMetrics.update(address.hostPort, usedMB)
      }
      case _ => // no heap info
    }
  }

}

object ClusterListener {

  def props(metricsIntervalSeconds: Int): Props =
    Props(new ClusterListener(metricsIntervalSeconds: Int))

}
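ClusterListener tracks members with a var holding an immutable HashSet: consumers += x desugars to consumers = consumers + x, so the actor swaps in a new set on every MemberUp instead of mutating shared state. A minimal sketch of the same pattern outside Akka (the class name is illustrative, not from the project):

import scala.collection.immutable.HashSet

class MemberTracker {
  // A var pointing at an immutable HashSet: updates replace the reference,
  // they never mutate a set that someone else may still be reading.
  private var consumers: Set[String] = HashSet.empty

  def memberUp(hostPort: String): Unit = consumers += hostPort     // consumers = consumers + hostPort
  def memberDown(hostPort: String): Unit = consumers -= hostPort   // consumers = consumers - hostPort
  def isConsumer(hostPort: String): Boolean = consumers.contains(hostPort)
}

object MemberTrackerDemo extends App {
  val t = new MemberTracker
  t.memberUp("host-a:2552")
  println(t.isConsumer("host-a:2552"))  // true
  println(t.isConsumer("host-b:2552"))  // false
}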
Example 15
Source File: TestEntityConstraints.scala From eidos with Apache License 2.0
package org.clulab.wm.eidos.entities

import org.clulab.wm.eidos.extraction.EntityConstraints
import org.clulab.wm.eidos.test.TestUtils._

import scala.collection.immutable.HashSet

class TestEntityConstraints extends Test {

  def matchBrackets(text: String) =
    EntityConstraints.matchingBrackets(text.split(' ').toSeq)

  behavior of "EntityConstraints"

  it should "approve of properly nested parentheses" in {
    matchBrackets("This has none.") should be (true)
    matchBrackets("This has ( one pair ) .") should be (true)
    matchBrackets("This has ( ( nested pairs ) ) .") should be (true)
    matchBrackets("This has ( [ { } ] ) mixed pairs .") should be (true)
    matchBrackets("This has ( { ) } intermixed pairs .") should be (true)
    matchBrackets("This has -LRB- one strange pair -RRB- .") should be (true)
    matchBrackets("This has ( double ( nesting ) ( of ) parens ) .") should be (true)
  }

  it should "disapprove of improperly nested parentheses" in {
    matchBrackets("This starts with ) a reversed pair ( .") should be (false)
    matchBrackets("This has ( one normal pair ) and ) a reversed pair ( .") should be (false)
    matchBrackets("This count is just uneven ( in this one .") should be (false)
    matchBrackets("and this ) too") should be (false)
  }
}
Example 16
Source File: ColumnarTestUtils.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 17
Source File: ColumnarTestUtils.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 18
Source File: SetCreateAccessBench.scala From abc with Apache License 2.0
package com.rklaehn.abc import cats.kernel.instances.all._ import ichi.bench.Thyme import ichi.bench.Thyme.HowWarm import scala.collection.immutable.{HashSet, SortedSet} object SetCreateAccessBench extends App { val th = Thyme.warmed(verbose = println, warmth = HowWarm.BenchOff) val ns = Array(1, 10, 100, 1000, 10000, 100000) def createInt(): Unit = { for (n ← ns) { val elements = (0 until n).toArray def s0 = HashSet(elements:_*) def s1 = SortedSet(elements:_*) def s2 = ArraySet(elements:_*) th.pbenchOffWarm(s"Create HashSet[Int] vs ArraySet[Int] $n")( th.Warm(s0.asInstanceOf[AnyRef]))( th.Warm(s2.asInstanceOf[AnyRef])) th.pbenchOffWarm(s"Create SortedSet[Int] vs ArraySet[Int] $n")( th.Warm(s1.asInstanceOf[AnyRef]))( th.Warm(s2.asInstanceOf[AnyRef])) } } def accessInt(): Unit = { for (n ← ns) { val elements = (0 until n).toArray val s0 = HashSet(elements:_*) val s1 = SortedSet(elements:_*) val s2 = ArraySet(elements:_*) val x = 0 th.pbenchOffWarm(s"Access HashSet[Int] vs ArraySet[Int] $n")( th.Warm(s0(x)))( th.Warm(s2(x))) th.pbenchOffWarm(s"Access SortedSet[Int] vs ArraySet[Int] $n")( th.Warm(s1(x)))( th.Warm(s2(x))) } } createInt() accessInt() }
Example 19
Source File: SetSetBench.scala From abc with Apache License 2.0
package com.rklaehn.abc import java.util.concurrent.TimeUnit import org.openjdk.jmh.annotations._ import org.openjdk.jmh.infra.Blackhole import cats.kernel.instances.all._ import ScalaCollectionConverters._ import scala.collection.immutable.{HashSet, SortedSet} sealed trait SetSetBenchOps { def union: Any def intersect: Any def diff: Any def subsetOf: Boolean def filter(f: Int => Boolean): Any } object SetSetBenchOps extends BenchUtil { def apply(a: Seq[Int], b: Seq[Int], kind: String) = { val a1 = a.map(mix) val b1 = b.map(mix) kind match { case "hashset" => ScalaCollectionBench(HashSet(a1: _*), HashSet(b1: _*)) case "sortedset" => ScalaCollectionBench(SortedSet(a1: _*), SortedSet(b1: _*)) case "arrayset" => TypeClassBench(ArraySet(a1: _*), ArraySet(b1: _*)) case "arrayset2" => ScalaCollectionBench(ArraySet(a1: _*).asCollection, ArraySet(b1: _*).asCollection) } } private final case class ScalaCollectionBench(a: Set[Int], b: Set[Int]) extends SetSetBenchOps { override def union: Any = a union b override def diff: Any = a diff b override def subsetOf: Boolean = a subsetOf b override def intersect: Any = a intersect b override def filter(f: (Int) => Boolean): Any = a filter f } private final case class TypeClassBench(a: ArraySet[Int], b: ArraySet[Int]) extends SetSetBenchOps { override def union: Any = a union b override def diff: Any = a diff b override def subsetOf: Boolean = a subsetOf b override def intersect: Any = a intersect b override def filter(f: (Int) => Boolean): Any = a filter f } } @BenchmarkMode(Array(Mode.AverageTime)) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Thread) class SetSetBench { @Param(Array("1", "10", "100", "1000", "10000", "100000")) var size = 0 @Param(Array("0.0", "0.5", "1.0")) // @Param(Array("0.5")) var offset = 0.0 @Param(Array("arrayset", "hashset", "sortedset")) //, "arrayset2")) var kind = "" var k: Int = 0 var bench: SetSetBenchOps = _ val shift = 1000000 // so we don't get the cached java.lang.Integer instances @Setup def setup(): Unit = { k = (offset * size).toInt bench = SetSetBenchOps(shift until (shift + size), (shift + k) until (shift + k + size), kind) } @Benchmark def union(x: Blackhole): Unit = { x.consume(bench.union) } @Benchmark def intersect(x: Blackhole): Unit = { x.consume(bench.intersect) } @Benchmark def diff(x: Blackhole): Unit = { x.consume(bench.diff) } @Benchmark def subsetOf(x: Blackhole): Unit = { x.consume(bench.subsetOf) } @Benchmark def filter(x: Blackhole): Unit = { x.consume(bench.filter(_ < k + shift)) } }
Example 20
Source File: SetElementBench.scala From abc with Apache License 2.0
package com.rklaehn.abc import java.util.concurrent.TimeUnit import org.openjdk.jmh.annotations._ import org.openjdk.jmh.infra.Blackhole import cats.kernel.instances.all._ import ScalaCollectionConverters._ import scala.collection.immutable.{HashSet, SortedSet} sealed trait SetElementBenchOps { def containsTrue: Any def containsFalse: Any } object SetElementBenchOps extends BenchUtil { def apply(a: Seq[Int], c: Int, n: Int, kind: String) = { val a1 = a.map(mix) val c1 = mix(c) val n1 = mix(n) require(a1.length == a.length) kind match { case "hashset" => ScalaCollectionBench(HashSet(a1: _*), c1, n1) case "sortedset" => ScalaCollectionBench(SortedSet(a1: _*), c1, n1) case "arrayset" => TypeClassBench(ArraySet(a1: _*), c1, n1) case "arrayset2" => ScalaCollectionBench(ArraySet(a1: _*).asCollection, c1, n1) } } private final case class ScalaCollectionBench(a: Set[Int], c: Int, n: Int) extends SetElementBenchOps { override def containsTrue: Boolean = a.contains(c) override def containsFalse: Boolean = a.contains(n) } private final case class TypeClassBench(a: ArraySet[Int], c: Int, n: Int) extends SetElementBenchOps { override def containsTrue: Boolean = a.contains(c) override def containsFalse: Boolean = a.contains(n) } } @BenchmarkMode(Array(Mode.AverageTime)) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Thread) class SetElementBench { @Param(Array("1", "10", "100", "1000", "10000", "100000")) var size = 0 @Param(Array("arrayset", "hashset", "sortedset")) //, "arrayset2")) var kind = "" var k: Int = 0 var bench: SetElementBenchOps = _ @Setup def setup(): Unit = { val c = (0.3 * size).toInt // a value that is contained in the set val n = (1.3 * size).toInt // a value that is not contained in the set bench = SetElementBenchOps(0 until size, c, n, kind) } @Benchmark def containsFalse(x: Blackhole): Unit = x.consume(bench.containsFalse) @Benchmark def containsTrue(x: Blackhole): Unit = x.consume(bench.containsTrue) }
Example 21
Source File: SetCreateBench.scala From abc with Apache License 2.0
package com.rklaehn.abc import java.util.concurrent.TimeUnit import org.openjdk.jmh.annotations._ import org.openjdk.jmh.infra.Blackhole import cats.kernel.instances.all._ import scala.collection.immutable.{HashSet, SortedSet} sealed trait SetCreateBenchOps { def createBulk: Any def createElements: Any } object SetCreateBenchOps extends BenchUtil { def apply(a: Seq[Int], kind: String) = { val a1 = a.map(mix).toArray require(a1.length == a.length) kind match { case "hashset" => ScalaCollectionBench(a1, x => HashSet.apply(x: _*)) case "sortedset" => ScalaCollectionBench(a1, x => SortedSet.apply(x: _*)) case "arrayset" => TypeClassBench(a1) case "arrayset2" => ??? } } private final case class ScalaCollectionBench(a: Array[Int], f: Array[Int] => Any) extends SetCreateBenchOps { override def createBulk: Any = f(a) override def createElements: Any = f(a) } private final case class TypeClassBench(a: Array[Int]) extends SetCreateBenchOps { override def createBulk: Any = { ArraySet(a: _*) } override def createElements: Any = { a.foldLeft(ArraySet.empty[Int])(_ + _) } } } @BenchmarkMode(Array(Mode.AverageTime)) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Thread) class SetCreateBench { @Param(Array("1", "10", "100", "1000", "10000", "100000")) var size = 0 @Param(Array("arrayset", "hashset", "sortedset")) //, "arrayset2")) var kind = "" var bench: SetCreateBenchOps = _ @Setup def setup(): Unit = { bench = SetCreateBenchOps(0 until size, kind) } @Benchmark def createBulk(x: Blackhole): Unit = x.consume(bench.createBulk) @Benchmark def createElements(x: Blackhole): Unit = x.consume(bench.createElements) }
Example 22
Source File: CreateSizeList.scala From abc with Apache License 2.0
package com.rklaehn.abc import org.github.jamm.MemoryMeter import cats.kernel.instances.all._ import scala.collection.immutable.{SortedSet, HashSet} object CreateSizeList extends App { lazy val mm = new MemoryMeter() lazy val overhead = mm.measure(new java.lang.Object) lazy val pointerSize = (mm.measure(new Array[java.lang.Object](256)) - mm.measure(new Array[java.lang.Object](128))) / 128 println("|n|ArraySet|HashSet|SortedSet|") println("|--:|--:|--:|--:|") for(n <- Seq(1, 10, 100, 1000, 10000, 100000)) { val xs = Array(1 to n: _*) val as = ArraySet[Int](xs: _*) val hs = HashSet[Int](xs: _*) val ss = SortedSet[Int](xs: _*) val ass = mm.measureDeep(as) val hss = mm.measureDeep(hs) val sss = mm.measureDeep(ss) println(s"| $n| $ass| $hss| $sss|") } println() println("|n|ArrayMap|HashMap|SortedMap|") println("|--:|--:|--:|--:|") for(n <- Seq(1, 10, 100, 1000, 10000, 100000)) { val xs = Array(1 to n: _*) val entries = xs.map(x => x -> x) val as = ArrayMap(entries: _*) val hs = HashSet(entries: _*) val ss = SortedSet(entries: _*) val ass = mm.measureDeep(as) val hss = mm.measureDeep(hs) val sss = mm.measureDeep(ss) println(s"| $n| $ass| $hss| $sss|") } println() println("|n|ArraySeq|Vector|List|") println("|--:|--:|--:|--:|") for(n <- Seq(1, 10, 100, 1000, 10000, 100000)) { val xs = Array(1 to n: _*) val as = ArraySeq(xs: _*) val hs = Vector(xs: _*) val ss = List(xs: _*) val ass = mm.measureDeep(as) val hss = mm.measureDeep(hs) val sss = mm.measureDeep(ss) println(s"| $n| $ass| $hss| $sss|") } }
Example 23
Source File: DeltaPushFilter.scala From connectors with Apache License 2.0
package org.apache.spark.sql.delta

import scala.collection.immutable.HashSet
import scala.collection.JavaConverters._

import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, SerializationUtilities}
import org.apache.hadoop.hive.ql.lib._
import org.apache.hadoop.hive.ql.parse.SemanticException
import org.apache.hadoop.hive.ql.plan.{ExprNodeColumnDesc, ExprNodeConstantDesc, ExprNodeGenericFuncDesc}
import org.apache.hadoop.hive.ql.udf.generic._

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{And, EqualNullSafe, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, InSet, LessThan, LessThanOrEqual, Like, Literal, Not}

object DeltaPushFilter extends Logging {
  lazy val supportedPushDownUDFs = Array(
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS",
    "org.apache.hadoop.hive.ql.udf.UDFLike",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"
  )

  def partitionFilterConverter(hiveFilterExprSeriablized: String): Seq[Expression] = {
    if (hiveFilterExprSeriablized != null) {
      val filterExpr = SerializationUtilities.deserializeExpression(hiveFilterExprSeriablized)
      val opRules = new java.util.LinkedHashMap[Rule, NodeProcessor]()
      val nodeProcessor = new NodeProcessor() {
        @throws[SemanticException]
        def process(nd: Node, stack: java.util.Stack[Node],
            procCtx: NodeProcessorCtx, nodeOutputs: Object*): Object = {
          nd match {
            case e: ExprNodeGenericFuncDesc if FunctionRegistry.isOpAnd(e) =>
              nodeOutputs.map(_.asInstanceOf[Expression]).reduce(And)
            case e: ExprNodeGenericFuncDesc =>
              val (columnDesc, constantDesc) =
                if (nd.getChildren.get(0).isInstanceOf[ExprNodeColumnDesc]) {
                  (nd.getChildren.get(0), nd.getChildren.get(1))
                } else { (nd.getChildren.get(1), nd.getChildren.get(0)) }

              val columnAttr = UnresolvedAttribute(
                columnDesc.asInstanceOf[ExprNodeColumnDesc].getColumn)
              val constantVal = Literal(constantDesc.asInstanceOf[ExprNodeConstantDesc].getValue)
              nd.asInstanceOf[ExprNodeGenericFuncDesc].getGenericUDF match {
                case f: GenericUDFOPNotEqualNS =>
                  Not(EqualNullSafe(columnAttr, constantVal))
                case f: GenericUDFOPNotEqual =>
                  Not(EqualTo(columnAttr, constantVal))
                case f: GenericUDFOPEqualNS =>
                  EqualNullSafe(columnAttr, constantVal)
                case f: GenericUDFOPEqual =>
                  EqualTo(columnAttr, constantVal)
                case f: GenericUDFOPGreaterThan =>
                  GreaterThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrGreaterThan =>
                  GreaterThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFOPLessThan =>
                  LessThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrLessThan =>
                  LessThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFBridge if f.getUdfName.equals("like") =>
                  Like(columnAttr, constantVal)
                case f: GenericUDFIn =>
                  val inConstantVals = nd.getChildren.asScala
                    .filter(_.isInstanceOf[ExprNodeConstantDesc])
                    .map(_.asInstanceOf[ExprNodeConstantDesc].getValue)
                    .map(Literal(_)).toSet
                  InSet(columnAttr, HashSet() ++ inConstantVals)
                case _ =>
                  throw new RuntimeException(s"Unsupported func(${nd.getName}) " +
                    s"which can not be pushed down to delta")
              }
            case _ => null
          }
        }
      }

      val disp = new DefaultRuleDispatcher(nodeProcessor, opRules, null)
      val ogw = new DefaultGraphWalker(disp)
      val topNodes = new java.util.ArrayList[Node]()
      topNodes.add(filterExpr)
      val nodeOutput = new java.util.HashMap[Node, Object]()
      try {
        ogw.startWalking(topNodes, nodeOutput)
      } catch {
        case ex: Exception =>
          throw new RuntimeException(ex)
      }
      logInfo(s"converted partition filter expr:" +
        s"${nodeOutput.get(filterExpr).asInstanceOf[Expression].toJSON}")
      Seq(nodeOutput.get(filterExpr).asInstanceOf[Expression])
    } else Seq.empty[org.apache.spark.sql.catalyst.expressions.Expression]
  }
}
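Two HashSet details in this file are worth calling out: InSet is fed a set built with HashSet() ++ literals, and supportedPushDownUDFs is an Array that is only ever useful for membership-style checks. The hedged sketch below (standalone, not Delta code) shows how a HashSet would serve both purposes with average constant-time lookups:

import scala.collection.immutable.HashSet

object PushDownLookup extends App {
  // Membership table of supported UDF class names: contains is O(1) on average.
  val supportedUdfs: HashSet[String] = HashSet(
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"
  )
  println(supportedUdfs.contains("org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"))  // true

  // Building a set of heterogeneous constants with ++, as done for InSet above.
  val constants: Seq[Any] = Seq(1, 2L, "three")
  val inSetValues: Set[Any] = HashSet() ++ constants
  println(inSetValues.contains(2L))   // true
}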
Example 24
Source File: ColumnarTestUtils.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }