org.apache.spark.sql.sources.Filter Scala Examples

The following examples show how to use org.apache.spark.sql.sources.Filter.
Example 1
Source File: CarbonBoundReference.scala (from carbondata, Apache License 2.0; 5 votes)
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter

/**
 * A [[Filter]] that carries a Catalyst [[Expression]] (`expr`).
 *
 * NOTE(review): the name suggests the wrapped expression is a cast — confirm against callers.
 */
case class CastExpr(expr: Expression) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}

/**
 * A parameterless [[Filter]] marker.
 *
 * NOTE(review): presumably stands for an always-false predicate — confirm against callers.
 */
case class FalseExpr() extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}

/**
 * A [[Filter]] that carries a Catalyst [[Expression]] (`expr`).
 *
 * NOTE(review): presumably wraps an ends-with predicate — confirm against callers.
 */
case class CarbonEndsWith(expr: Expression) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}

/**
 * A [[Filter]] that carries a Catalyst [[Expression]] (`expr`).
 *
 * NOTE(review): presumably wraps a contains predicate — confirm against callers.
 */
case class CarbonContainsWith(expr: Expression) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}
Example 2
Source File: DataFrameReaderFunctions.scala (from couchbase-spark-connector, Apache License 2.0; 5 votes)
package com.couchbase.spark.sql

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {

  private def buildFrame(options: Map[String, String] = null, schema: StructType = null,
    schemaFilter: Option[Filter] = null): DataFrame = {
    val builder = dfr

    val filter =
    if (filter.isDefined) {
      builder.option("schemaFilter", filter.get)

    if (options != null) {


Example 3
Source File: HiveAcidRelation.scala (from spark-acid, Apache License 2.0; 5 votes)
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
    extends BaseRelation
    with InsertableRelation
    with PrunedFilteredScan
    with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
    hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
  } else {

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
   // sql insert into and overwrite
    if (overwrite) {
    } else {

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)

  def delete(condition: Column): Unit = {
  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause,
      notMatched, sourceAlias, targetAlias)

  def getHiveAcidTable(): HiveAcidTable = {

  // FIXME: should it be true / false. Recommendation seems to
  //  be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
Example 4
Source File: BEDRelation.scala (from bdg-sequila, Apache License 2.0; 5 votes)
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)
  override def schema: org.apache.spark.sql.types.StructType = Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  /**
   * Returns the value of column `colName` taken from the fields `r` of one record.
   *
   * Contig, start and end come from fields 0-2 and are read unconditionally; start is
   * shifted by one to convert the interval to 1-based. Every later column is wrapped in
   * an `Option` that is `None` when the record has too few fields.
   *
   * @param colName column to extract, matched against the `Columns` constants
   * @param r       fields of a single record
   * @return the column value; `Option`-wrapped for fields 3 and beyond
   * @throws Exception if `colName` is not one of the known columns
   */
  private def getValueFromColumn(colName:String, r:Array[String]): Any = {
    // Field `idx` when the record is long enough, None otherwise.
    def field(idx: Int): Option[String] = r.lift(idx)
    def intField(idx: Int): Option[Int] = field(idx).map(_.toInt)
    // Comma-separated list of integers stored in a single field.
    def intListField(idx: Int): Option[Array[Int]] = field(idx).map(_.split(",").map(_.toInt))

    colName match {
      case Columns.CONTIG       =>  DataQualityFuncs.cleanContig(r(0) )
      case Columns.START        =>  r(1).toInt + 1 //Convert interval to 1-based
      case Columns.END          =>  r(2).toInt
      case Columns.NAME         =>  field(3)
      case Columns.SCORE        =>  intField(4)
      case Columns.STRAND       =>  field(5)
      case Columns.THICK_START  =>  intField(6)
      case Columns.THICK_END    =>  intField(7)
      case Columns.ITEM_RGB     =>  intListField(8)
      case Columns.BLOCK_COUNT  =>  intField(9)
      case Columns.BLOCK_SIZES  =>  intListField(10)
      case Columns.BLOCK_STARTS =>  intListField(11)
      case _                    =>  throw new Exception(s"Unknown column found: ${colName}")
    }
  }
  override def buildScan(requiredColumns:Array[String], filters:Array[Filter]): RDD[Row] = {
              val record = new Array[Any](requiredColumns.length)
              for (i <- 0 to requiredColumns.length - 1) {
                record(i) = getValueFromColumn(requiredColumns(i), r)



Example 5
Source File: MetastoreIndexSuite.scala (from parquet-index, Apache License 2.0; 5 votes)
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import com.github.lightcopy.testutil.UnitTestSuite
import com.github.lightcopy.testutil.implicits._

/**
 * Test catalog to check internal methods.
 *
 * Remembers the filters handed to `setIndexFilters` and returns them from `indexFilters`.
 * Every other member is left unimplemented (`???`) and is meant to be overridden by the
 * test that needs it.
 */
private[datasources] class TestIndex extends MetastoreIndex {
  // Last value passed to `setIndexFilters`; empty until it is called.
  private var internalIndexFilters: Seq[Filter] = Nil
  override def tablePath(): Path = ???
  override def partitionSchema: StructType = ???
  override def indexSchema: StructType = ???
  override def dataSchema: StructType = ???
  override def setIndexFilters(filters: Seq[Filter]): Unit = {
    internalIndexFilters = filters
  }
  override def indexFilters: Seq[Filter] = internalIndexFilters
  override def listFilesWithIndexSupport(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression],
      indexFilters: Seq[Filter]): Seq[PartitionDirectory] = ???
  override def inputFiles: Array[String] = ???
  override def sizeInBytes: Long = ???
}

class MetastoreIndexSuite extends UnitTestSuite {
  test("provide sequence of path based on table path") {
    val catalog = new TestIndex() {
      override def tablePath(): Path = new Path("test")

    catalog.rootPaths should be (Seq(new Path("test")))

  test("when using listFiles directly supply empty index filter") {
    var indexSeq: Seq[Filter] = null
    var filterSeq: Seq[Expression] = null
    val catalog = new TestIndex() {
      override def listFilesWithIndexSupport(
          partitionFilters: Seq[Expression],
          dataFilters: Seq[Expression],
          indexFilters: Seq[Filter]): Seq[PartitionDirectory] = {
        indexSeq = indexFilters
        filterSeq = partitionFilters

    catalog.listFiles(Seq.empty, Seq.empty)
    indexSeq should be (Nil)
    filterSeq should be (Nil)

  test("refresh should be no-op by default") {
    val catalog = new TestIndex()
Example 6
Source File: MongodbRDDIterator.scala (from Spark-MongoDB, Apache License 2.0; 5 votes)
package com.stratio.datasource.mongodb.rdd

import com.mongodb.casbah.Imports._
import com.stratio.datasource.mongodb.query.FilterSection
import com.stratio.datasource.mongodb.reader.MongodbReader
import com.stratio.datasource.util.Config
import org.apache.spark._
import org.apache.spark.sql.sources.Filter

class MongodbRDDIterator(
  taskContext: TaskContext,
  partition: Partition,
  config: Config,
  requiredColumns: Array[String],
  filters: FilterSection)
  extends Iterator[DBObject] {

  private var closed = false
  private var initialized = false

  lazy val reader = {
    initialized = true

  // Register an on-task-completion callback to close the input stream.
  taskContext.addTaskCompletionListener((context: TaskContext) => closeIfNeeded())

  override def hasNext: Boolean = {
    !closed && reader.hasNext

  override def next(): DBObject = {
    if (!hasNext) {
      throw new NoSuchElementException("End of stream")

  def closeIfNeeded(): Unit = {
    if (!closed) {
      closed = true

  protected def close(): Unit = {
    if (initialized) {
      initialized = false

  def initReader() = {
    val reader = new MongodbReader(config, requiredColumns, filters)
Example 7
Source File: customFilters.scala (from Spark-MongoDB, Apache License 2.0; 5 votes)
package com.stratio.datasource.mongodb.sources

import org.apache.spark.sql.sources.Filter

/**
 * Common shape of the geo filters: an attribute name plus an optional maximum distance.
 */
trait GeoFilter extends Filter {
  /** Name of the attribute the filter refers to. */
  val attribute: String

  /** Optional distance limit. NOTE(review): unit is not visible here — confirm. */
  val maxDistance: Option[Double]
}

/**
 * [[GeoFilter]] built from an attribute, a pair of coordinates and an optional distance limit.
 *
 * @param attribute   name of the attribute the filter refers to
 * @param x           first coordinate (NOTE(review): presumably of the query point — confirm)
 * @param y           second coordinate
 * @param maxDistance optional distance limit; defaults to `None`
 */
case class Near(
    attribute: String,
    x: Double,
    y: Double,
    maxDistance: Option[Double] = None)
  extends GeoFilter

/**
 * [[GeoFilter]] built from an attribute, a longitude/latitude pair and an optional
 * distance limit.
 *
 * @param attribute   name of the attribute the filter refers to
 * @param longitude   longitude component of the pair
 * @param latitude    latitude component of the pair
 * @param maxDistance optional distance limit; defaults to `None`
 */
case class NearSphere(
    attribute: String,
    longitude: Double,
    latitude: Double,
    maxDistance: Option[Double] = None)
  extends GeoFilter
Example 8
Source File: DeltaSourceUtils.scala (from delta, Apache License 2.0; 5 votes)

import java.util.Locale

import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources
import org.apache.spark.sql.sources.Filter

/** Data source names, option keys and filter translation for the Delta source. */
object DeltaSourceUtils {
  val NAME = "delta"
  val ALT_NAME = "delta"

  // Batch relations don't pass partitioning columns to `CreatableRelationProvider`s, therefore
  // as a hack, we pass in the partitioning columns among the options.
  val PARTITIONING_COLUMNS_KEY = "__partition_columns"

  /**
   * Returns true when `name`, lower-cased with the ROOT locale, equals `NAME` or `ALT_NAME`.
   */
  def isDeltaDataSourceName(name: String): Boolean = {
    val normalized = name.toLowerCase(Locale.ROOT)
    normalized == NAME || normalized == ALT_NAME
  }

  /**
   * Translates public data source filters into a single Catalyst expression.
   *
   * Each filter is translated on its own and the results are AND-ed together; an empty
   * array yields the `true` literal, the neutral element of AND.
   *
   * @param filters filters to translate
   * @return the conjunction of the translated filters
   * @throws MatchError if a filter type has no translation below
   */
  def translateFilters(filters: Array[Filter]): Expression = {
    filters
      .map(translateFilter)
      .reduceOption[Expression](expressions.And(_, _))
      .getOrElse(expressions.Literal.TrueLiteral)
  }

  /** Translates one filter; compound filters (`Not`, `And`, `Or`) recurse into their children. */
  private def translateFilter(filter: Filter): Expression = filter match {
    case sources.EqualTo(attribute, value) =>
      expressions.EqualTo(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.EqualNullSafe(attribute, value) =>
      expressions.EqualNullSafe(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.GreaterThan(attribute, value) =>
      expressions.GreaterThan(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.GreaterThanOrEqual(attribute, value) =>
      expressions.GreaterThanOrEqual(
        UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.LessThan(attribute, value) =>
      // Strict comparison: translating this to `<=` would also match the boundary value.
      expressions.LessThan(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.LessThanOrEqual(attribute, value) =>
      expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.In(attribute, values) =>
      expressions.In(
        UnresolvedAttribute(attribute),
        values.toSeq.map(v => expressions.Literal.create(v)))
    case sources.IsNull(attribute) => expressions.IsNull(UnresolvedAttribute(attribute))
    case sources.IsNotNull(attribute) => expressions.IsNotNull(UnresolvedAttribute(attribute))
    case sources.Not(otherFilter) => expressions.Not(translateFilter(otherFilter))
    case sources.And(filter1, filter2) =>
      expressions.And(translateFilter(filter1), translateFilter(filter2))
    case sources.Or(filter1, filter2) =>
      expressions.Or(translateFilter(filter1), translateFilter(filter2))
    // NOTE(review): the three LIKE patterns below embed `value` as-is, so a `%` or `_`
    // inside `value` acts as a wildcard — confirm whether escaping is needed.
    case sources.StringStartsWith(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"${value}%"))
    case sources.StringEndsWith(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}"))
    case sources.StringContains(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}%"))
    case sources.AlwaysTrue() => expressions.Literal.TrueLiteral
    case sources.AlwaysFalse() => expressions.Literal.FalseLiteral
  }
}
Example 9
Source File: TableIndexConnector.scala (from spark-dynamodb, Apache License 2.0; 5 votes)
package com.audienceproject.spark.dynamodb.connector

import{ItemCollection, ScanOutcome}
import org.apache.spark.sql.sources.Filter

import scala.collection.JavaConverters._

private[dynamodb] class TableIndexConnector(tableName: String, indexName: String, parallelism: Int, parameters: Map[String, String])
    extends DynamoConnector with Serializable {

    private val consistentRead = parameters.getOrElse("stronglyConsistentReads", "false").toBoolean
    private val filterPushdown = parameters.getOrElse("filterPushdown", "true").toBoolean
    private val region = parameters.get("region")
    private val roleArn = parameters.get("roleArn")

    override val filterPushdownEnabled: Boolean = filterPushdown

    override val (keySchema, readLimit, itemLimit, totalSegments) = {
        val table = getDynamoDB(region, roleArn).getTable(tableName)
        val indexDesc = table.describe().getGlobalSecondaryIndexes.asScala.find(_.getIndexName == indexName).get

        // Key schema.
        val keySchema = KeySchema.fromDescription(indexDesc.getKeySchema.asScala)

        // User parameters.
        val bytesPerRCU = parameters.getOrElse("bytesPerRCU", "4000").toInt
        val maxPartitionBytes = parameters.getOrElse("maxpartitionbytes", "128000000").toInt
        val targetCapacity = parameters.getOrElse("targetCapacity", "1").toDouble
        val readFactor = if (consistentRead) 1 else 2

        // Table parameters.
        val indexSize = indexDesc.getIndexSizeBytes
        val itemCount = indexDesc.getItemCount

        // Partitioning calculation.
        val numPartitions = parameters.get("readpartitions").map(_.toInt).getOrElse({
            val sizeBased = (indexSize / maxPartitionBytes).toInt max 1
            val remainder = sizeBased % parallelism
            if (remainder > 0) sizeBased + (parallelism - remainder)
            else sizeBased

        // Provisioned or on-demand throughput.
        val readThroughput = parameters.getOrElse("throughput", Option(indexDesc.getProvisionedThroughput.getReadCapacityUnits)
            .filter(_ > 0).map(_.longValue().toString)

        // Rate limit calculation.
        val avgItemSize = indexSize.toDouble / itemCount
        val readCapacity = readThroughput * targetCapacity

        val rateLimit = readCapacity / parallelism
        val itemLimit = ((bytesPerRCU / avgItemSize * rateLimit).toInt * readFactor) max 1

        (keySchema, rateLimit, itemLimit, numPartitions)

    override def scan(segmentNum: Int, columns: Seq[String], filters: Seq[Filter]): ItemCollection[ScanOutcome] = {
        val scanSpec = new ScanSpec()

        if (columns.nonEmpty) {
            val xspec = new ExpressionSpecBuilder().addProjections(columns: _*)

            if (filters.nonEmpty && filterPushdown) {


        getDynamoDB(region, roleArn).getTable(tableName).getIndex(indexName).scan(scanSpec)

Example 10
Source File: ScanPartition.scala (from spark-dynamodb, Apache License 2.0; 5 votes)
package com.audienceproject.spark.dynamodb.datasource

import com.audienceproject.spark.dynamodb.connector.DynamoConnector
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader}
import org.apache.spark.sql.types.{StructField, StructType}

import scala.collection.JavaConverters._

class ScanPartition(schema: StructType,
                    partitionIndex: Int,
                    connector: DynamoConnector,
                    filters: Array[Filter])
    extends InputPartition[InternalRow] {

    private val requiredColumns =

    private lazy val typeConversions = schema.collect({
        case StructField(name, dataType, _, _) => name -> TypeConversion(name, dataType)

    override def createPartitionReader(): InputPartitionReader[InternalRow] = {
        if (connector.isEmpty) new EmptyReader
        else new PartitionReader

    private class EmptyReader extends InputPartitionReader[InternalRow] {
        override def next(): Boolean = false

        override def get(): InternalRow = throw new IllegalStateException("Unable to call get() on empty iterator")

        override def close(): Unit = {}

    private class PartitionReader extends InputPartitionReader[InternalRow] {

        private val pageIterator = connector.scan(partitionIndex, requiredColumns, filters).pages().iterator().asScala
        private val rateLimiter = RateLimiter.create(connector.readLimit)

        private var innerIterator: Iterator[InternalRow] = Iterator.empty

        private var currentRow: InternalRow = _
        private var proceed = false

        override def next(): Boolean = {
            proceed = true
            innerIterator.hasNext || {
                if (pageIterator.hasNext) {
                else false

        override def get(): InternalRow = {
            if (proceed) {
                currentRow =
                proceed = false

        override def close(): Unit = {}

        private def nextPage(): Unit = {
            val page =
            val result = page.getLowLevelResult
            Option(result.getScanResult.getConsumedCapacity).foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1))
            innerIterator = result.getItems.iterator()


    private def itemToRow(requiredColumns: Seq[String])(item: Item): InternalRow =
        if (requiredColumns.nonEmpty) InternalRow.fromSeq( => typeConversions(columnName)(item)))
        else InternalRow.fromSeq(item.asMap()

Example 11
Source File: TextMatchUDF.scala (from carbondata, Apache License 2.0; 5 votes)
package org.apache.carbondata.index

import org.apache.spark.sql.sources.Filter

import org.apache.carbondata.common.annotations.InterfaceAudience

/**
 * Serializable function from a string to a boolean: true exactly when the string is
 * non-empty.
 */
class TextMatchUDF extends ((String) => Boolean) with Serializable {
  override def apply(v1: String): Boolean = {
    v1.length > 0
  }
}

/**
 * Serializable two-argument function: true exactly when the string argument is non-empty.
 * The integer argument `v2` is accepted but not used in the result.
 */
class TextMatchMaxDocUDF extends ((String, Int) => Boolean) with Serializable {
  override def apply(v1: String, v2: Int): Boolean = {
    v1.length > 0
  }
}

/**
 * A [[Filter]] that carries a query string.
 *
 * @param queryString the query text (NOTE(review): syntax is not visible here — confirm)
 */
case class TextMatch(queryString: String) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}

/**
 * A [[Filter]] that carries a query string together with a document limit.
 *
 * @param queryString the query text (NOTE(review): syntax is not visible here — confirm)
 * @param maxDoc      limit passed as a string (NOTE(review): presumably numeric — confirm)
 */
case class TextMatchLimit(queryString: String, maxDoc: String) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}
Example 12
Source File: InPolygonUDF.scala (from carbondata, Apache License 2.0; 5 votes)
package org.apache.carbondata.geo

import org.apache.spark.sql.sources.Filter

import org.apache.carbondata.common.annotations.InterfaceAudience

/**
 * Serializable function from a string to a boolean that returns true for every input.
 */
class InPolygonUDF extends (String => Boolean) with Serializable {
  override def apply(v1: String): Boolean = {
    true // Carbon applies the filter, so Spark does not have to apply it again.
  }
}

/**
 * A [[Filter]] that carries a query string.
 *
 * @param queryString the query text (NOTE(review): presumably a polygon definition — confirm)
 */
case class InPolygon(queryString: String) extends Filter {
  // NOTE(review): deliberately kept as null; any code that iterates `references`
  // without a null check will throw — confirm callers never do that.
  override def references: Array[String] = null
}
Example 13
Source File: RiakRelation.scala (from spark-riak-connector, Apache License 2.0; 5 votes)
package org.apache.spark.sql.riak

import com.basho.riak.spark._
import scala.reflect._
import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector}
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import com.basho.riak.spark.util.TSConversionUtil
import com.basho.riak.spark.writer.WriteConf
import com.basho.riak.spark.writer.mapper.SqlDataMapper
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.collection.convert.decorateAsScala._
import com.basho.riak.spark.query.QueryBucketDef

/** Factory methods for `RiakRelation`. */
object RiakRelation {
  /**
   * Builds a relation for `bucket`.
   *
   * When `connector` is empty, a connector is created from the Spark configuration of
   * `sqlContext`.
   */
  def apply(bucket: String,
            sqlContext: SQLContext,
            schema: Option[StructType] = None,
            connector: Option[RiakConnector] = None,
            readConf: ReadConf,
            writeConf: WriteConf): RiakRelation = {

    new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)),
      readConf, writeConf, sqlContext, schema)
  }

  /**
   * Builds a relation from data source `parameters`.
   *
   * The bucket is read from the `DefaultSource.RiakBucketProperty` entry of `parameters`
   * (the lookup fails if the entry is absent). The connector, read configuration and
   * write configuration are each derived from the Spark configuration combined with
   * `parameters`.
   */
  def apply(sqlContext: SQLContext, parameters: Map[String, String], schema: Option[StructType]): RiakRelation = {
    val existingConf = sqlContext.sparkContext.getConf
    val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None)
    val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters))
    val readConf = ReadConf(existingConf, parameters)
    val writeConf = WriteConf(existingConf, parameters)
    RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf)
  }
}
Example 14
Source File: SequoiadbRDD.scala (from spark-sequoiadb, Apache License 2.0; 5 votes)
package com.sequoiadb.spark.rdd

import org.apache.spark.SparkContext
import com.sequoiadb.spark.partitioner._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.Filter
import org.apache.spark.{Partition, TaskContext}
import org.bson.BSONObject
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.mutable.ArrayBuffer

  /**
   * Creates a `SequoiadbRDD` on the `SparkContext` that backs `sc`; all other arguments
   * are forwarded unchanged to the RDD constructor.
   *
   * @param sc              SQL context whose `sparkContext` hosts the RDD
   * @param config          connection/query configuration
   * @param partitioner     optional partitioner; defaults to `None`
   * @param requiredColumns columns to read; defaults to an empty array
   * @param filters         filters to forward; defaults to an empty array
   * @param queryReturnType defaults to `SequoiadbConfig.QUERYRETURNBSON`
   * @param queryLimit      defaults to -1 (NOTE(review): presumably "no limit" — confirm)
   */
  def apply (
    sc: SQLContext,
    config: SequoiadbConfig,
    partitioner: Option[SequoiadbPartitioner] = None,
    requiredColumns: Array[String] = Array(),
    filters: Array[Filter] = Array(),
    queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON,
    queryLimit: Long = -1): SequoiadbRDD = {
    new SequoiadbRDD ( sc.sparkContext, config, partitioner,
      requiredColumns, filters, queryReturnType, queryLimit)
  }
Example 15
Source File: SequoiadbRDDIterator.scala (from spark-sequoiadb, Apache License 2.0; 5 votes)
package com.sequoiadb.spark.rdd

import org.apache.spark._
import org.apache.spark.sql.sources.Filter
import org.bson.BSONObject
import org.slf4j.{Logger, LoggerFactory}

class SequoiadbRDDIterator(
  taskContext: TaskContext,
  partition: Partition,
  config: SequoiadbConfig,
  requiredColumns: Array[String],
  filters: Array[Filter],
  queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON,
  queryLimit: Long = -1)
  extends Iterator[BSONObject] {

  private var LOG: Logger = LoggerFactory.getLogger(this.getClass.getName())
  protected var finished = false
  private var closed = false
  private var initialized = false

  lazy val reader = {
    initialized = true

  // Register an on-task-completion callback to close the input stream.
  taskContext.addTaskCompletionListener((context: TaskContext) => closeIfNeeded())

  override def hasNext: Boolean = {
    !finished && reader.hasNext

  override def next(): BSONObject = {
    if (!hasNext) {
      throw new NoSuchElementException("End of stream")

  def closeIfNeeded(): Unit = {
    if (!closed) {
      closed = true

  protected def close(): Unit = {
    if (initialized) {

  def initReader() = {
    val reader = new SequoiadbReader(config,requiredColumns,filters, queryReturnType, queryLimit)
Example 16
Source File: ArrowFileFormat.scala (from OAP, Apache License 2.0; 5 votes)

import scala.collection.JavaConverters._

import{ArrowFilters, ArrowOptions}
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  /**
   * Reads the schema of `files` through `ArrowUtils.readSchema`, handing over `options`
   * wrapped in a case-insensitive map.
   */
  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  /** Infers the schema by delegating to `convert`; `sparkSession` is not used. */
  override def inferSchema(
                            sparkSession: SparkSession,
                            options: Map[String, String],
                            files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  /**
   * Writing is not supported by this format.
   *
   * @throws UnsupportedOperationException always
   */
  override def prepareWrite(
                             sparkSession: SparkSession,
                             job: Job,
                             options: Map[String, String],
                             dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {

      val sqlConf = sparkSession.sessionState.conf;
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(

      // todo predicate validation / pushdown
      val dataset = factory.finish();

      val filter = if (enableFilterPushDown) {
      } else {

      val scanOptions = new ScanOptions( =>,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)
      val itrList = scanner
        .map(task => task.scan())

      val itr = itrList
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]

  override def shortName(): String = "arrow"

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T =
Example 17
Source File: ArrowScanBuilder.scala (from OAP, Apache License 2.0; 5 votes)

import org.apache.spark.sql.SparkSession
import{Scan, SupportsPushDownFilters}
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

case class ArrowScanBuilder(
    sparkSession: SparkSession,
    fileIndex: PartitioningAwareFileIndex,
    schema: StructType,
    dataSchema: StructType,
    options: CaseInsensitiveStringMap)
    extends FileScanBuilder(sparkSession, fileIndex, dataSchema)
    with SupportsPushDownFilters {

  private var filters: Array[Filter] = Array.empty
  private lazy val pushedArrowFilters: Array[Filter] = {
    filters // todo filter validation & pushdown

  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
    this.filters = filters

  override def pushedFilters: Array[Filter] = pushedArrowFilters

  override def build(): Scan = {
Example 18
Source File: ArrowScan.scala (from OAP, Apache License 2.0; 5 votes)

import scala.collection.JavaConverters._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.SerializableConfiguration

case class ArrowScan(
    sparkSession: SparkSession,
    fileIndex: PartitioningAwareFileIndex,
    readDataSchema: StructType,
    readPartitionSchema: StructType,
    pushedFilters: Array[Filter],
    options: CaseInsensitiveStringMap,
    partitionFilters: Seq[Expression] = Seq.empty,
    dataFilters: Seq[Expression] = Seq.empty)
    extends FileScan {

  override def createReaderFactory(): PartitionReaderFactory = {
    val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap
    val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
    val broadcastedConf =
      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf))
      new ArrowOptions(options.asScala.toMap))

  /**
   * Returns a copy of this scan whose partition and data filters are replaced by the
   * given ones; all other fields are carried over.
   */
  override def withFilters(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): FileScan = {
    copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
  }
Example 19
Source File: TestDataFile.scala (from OAP, Apache License 2.0; 5 votes)

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

/**
 * Stub `DataFile`: both iterators are empty, `totalRows` is 0, and the metadata and
 * cache accessors throw `UnsupportedOperationException`.
 */
private[oap] case class TestDataFile(path: String, schema: StructType, configuration: Configuration)
  extends DataFile {

  /** Always returns an empty iterator; the arguments are ignored. */
  override def iterator(
      requiredIds: Array[Int],
      filters: Seq[Filter]): OapCompletionIterator[Any] =
    new OapCompletionIterator(Iterator.empty, {})

  /** Always returns an empty iterator; the arguments are ignored. */
  override def iteratorWithRowIds(
      requiredIds: Array[Int],
      rowIds: Array[Int],
      filters: Seq[Filter]):
  OapCompletionIterator[Any] = new OapCompletionIterator(Iterator.empty, {})

  override def totalRows(): Long = 0

  /** Not available on this stub. */
  override def getDataFileMeta(): DataFileMeta =
    throw new UnsupportedOperationException

  /** Not available on this stub. */
  override def cache(groupId: Int, fiberId: Int): FiberCache =
    throw new UnsupportedOperationException
}
Example 20
Source File: FilterHelper.scala (from OAP, Apache License 2.0; 5 votes)
package org.apache.spark.sql.execution.datasources.oap.utils

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.parquet.hadoop.ParquetInputFormat

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.ParquetFiltersWrapper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

/**
 * Helpers for turning Spark data source filters into a single Parquet filter predicate and
 * for registering that predicate on a Hadoop configuration.
 */
object FilterHelper {

  /**
   * Convenience overload that takes the SQL configuration from `sparkSession`.
   *
   * @param sparkSession session whose `sessionState.conf` is consulted
   * @param requiredSchema schema passed on to the filter conversion
   * @param filters data source filters to convert
   * @return the combined predicate, or `None` if pushdown is disabled or nothing converts
   */
  def tryToPushFilters(
      sparkSession: SparkSession,
      requiredSchema: StructType,
      filters: Seq[Filter]): Option[FilterPredicate] = {
    tryToPushFilters(sparkSession.sessionState.conf, requiredSchema, filters)
  }

  /**
   * Converts `filters` to Parquet predicates and AND-combines the ones that could be
   * converted.
   *
   * @param conf SQL configuration; `parquetFilterPushDown` gates the conversion
   * @param requiredSchema schema passed on to the filter conversion
   * @param filters data source filters to convert
   * @return `None` when pushdown is disabled or when no filter could be converted,
   *         otherwise the conjunction of all converted predicates
   */
  def tryToPushFilters(
      conf: SQLConf,
      requiredSchema: StructType,
      filters: Seq[Filter]): Option[FilterPredicate] = {
    if (conf.parquetFilterPushDown) {
      filters
        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
        // is used here.
        .flatMap(ParquetFiltersWrapper.createFilter(conf, requiredSchema, _))
        // `reduceOption` yields `None` for an empty result instead of failing like `reduce`.
        .reduceOption(FilterApi.and)
    } else {
      None
    }
  }

  /**
   * Registers `pushed` as the Parquet filter predicate on `configuration`; does nothing when
   * `pushed` is empty.
   *
   * @param configuration Hadoop configuration to update in place
   * @param pushed predicate to register, if any
   */
  def setFilterIfExist(configuration: Configuration, pushed: Option[FilterPredicate]): Unit = {
    pushed.foreach(ParquetInputFormat.setFilterPredicate(configuration, _))
  }
}
Example 21
Source File: SqlBuilderSuiteBase.scala — from HANAVora-Extensions, Apache License 2.0 (5 votes)
package org.apache.spark.sql.sources.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.DataType
import org.scalatest.FunSuite

import scala.util.matching.Regex

/**
 * Shared helpers for suites that check the SQL text produced by a `SqlBuilder`.
 *
 * Every `test...` helper registers a named test case through the mixed-in `FunSuite`
 * instead of asserting immediately.
 *
 * NOTE(review): in this excerpt several helper bodies and all closing braces appear to be
 * missing; the comments below describe only what is still visible.
 */
trait SqlBuilderSuiteBase {
  // Self-type: this trait can only be mixed into a FunSuite.
  self: FunSuite =>

  // Builder under test; abstract here, so the concrete suite has to provide it.
  val sqlBuilder: SqlBuilder // scalastyle:ignore

  /**
   * Registers a test whose name contains the whitespace-normalized `sql` and `expr`.
   *
   * NOTE(review): the test body is not visible here; presumably it compares `cleanSql`
   * with the SQL the builder generates for `expr` — confirm.
   */
  def testExpressionToSql(sql: String)(expr: Expression): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"expressionToSql: $cleanSql | with $expr") {

  /**
   * Registers a test asserting that `sqlBuilder.buildSelect(i1, i2, i3)` equals the
   * whitespace-normalized `sql`.
   */
  def testBuildSelect(sql: String)
                     (i1: SqlLikeRelation, i2: Seq[String], i3: Seq[Filter]): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"buildSelect: $cleanSql | with $i1 $i2 $i3") {
      assertResult(cleanSql)(sqlBuilder.buildSelect(i1, i2, i3))

  /**
   * Registers a test whose name contains the whitespace-normalized `sql` and `plan`.
   *
   * NOTE(review): the test body is not visible here; presumably it compares `cleanSql`
   * with the SQL generated for `plan` — confirm.
   */
  def testLogicalPlan(sql: String)(plan: LogicalPlan): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"logical plan: $cleanSql | with $plan") {

  /**
   * Registers a test asserting that
   * `sqlBuilder.internalLogicalPlanToSql(plan, noProject = true)` equals the
   * whitespace-normalized `sql`.
   */
  def testLogicalPlanInternal(sql: String)(plan: LogicalPlan): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"logical plan (internal): $cleanSql | with $plan") {
      assertResult(cleanSql)(sqlBuilder.internalLogicalPlanToSql(plan, noProject = true))

  /**
   * Registers a test that opens an `intercept[RuntimeException]` block for `plan`.
   *
   * NOTE(review): the intercepted expression is not visible in this excerpt.
   */
  def testUnsupportedLogicalPlan(plan: LogicalPlan): Unit = {
    test(s"invalid logical plan: $plan") {
      intercept[RuntimeException] {

  /** Collapses every run of whitespace in `q` into a single space and trims both ends. */
  private def cleanUpSql(q: String): String =
    q.replaceAll("\\s+", " ").trim

  /**
   * Registers a test that opens an `intercept[RuntimeException]` block for `plan`.
   *
   * NOTE(review): the intercepted expression is not visible in this excerpt.
   */
  def testUnsupportedLogicalPlanInternal(plan: LogicalPlan): Unit = {
    test(s"invalid logical plan (internal): $plan") {
      intercept[RuntimeException] {

  /**
   * Registers a test that computes `sqlBuilder.typeToSql(dataType)`.
   *
   * NOTE(review): the comparison against `expected` is not visible here; presumably the
   * generated string is asserted to equal `expected` — confirm.
   */
  def testGeneratedSqlDataType(expected: String)(dataType: DataType): Unit = {
    test(s"The generated sql type for ${dataType.simpleString} is $expected") {
      val generated = sqlBuilder.typeToSql(dataType)

Example 22
Source File: ScanAndFilterImplicits.scala — from HANAVora-Extensions, Apache License 2.0 (5 votes)
package org.apache.spark.sql.catalyst.analysis.systables

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.{And, Filter, FilterUtils}
import org.apache.spark.sql.types.StructType
import FilterUtils._

        // Folds over `values`: each value is passed through `scanFunction`, and the result is
        // kept only when `validation` accepts it.
        // NOTE(review): the enclosing definition and the closing brace of this fold are not
        // visible in this excerpt.
        values.foldLeft(Seq.empty[Row]) {
          case (acc, value) =>
            val scanned = scanFunction(value)
            if (validation(scanned)) {
              // Appending keeps the accepted rows in the order the values were visited.
              acc :+ scanned
            } else acc


// Singleton that extends the `ScanAndFilterImplicits` type of the same name (declared
// outside this excerpt) and adds no members of its own.
object ScanAndFilterImplicits extends ScanAndFilterImplicits 
Example 23
Source File: RiakTSStreamingRDD.scala — from spark-riak-connector, Apache License 2.0 (5 votes)
package com.basho.riak.spark.streaming

import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.StreamingContext

import scala.reflect.ClassTag

/**
 * A `RiakTSRDD` constructed from a `StreamingContext`.
 *
 * The constructor passes `ssc.sparkContext` as the parent's `sc` and forwards every other
 * argument of the first parameter list to `RiakTSRDD` unchanged, by name.
 *
 * NOTE(review): `ct` is declared in a separate, non-implicit parameter list and is not passed
 * in the visible super-constructor call. Confirm against the upstream file whether it was
 * meant to be `implicit`.
 *
 * @tparam R element type of the RDD
 */
class RiakTSStreamingRDD[R] private[spark](
    ssc: StreamingContext,
    connector: RiakConnector,
    bucketName: String,
    schema: Option[StructType] = None,
    columnNames: Option[Seq[String]] = None,
    whereConstraints: Option[(String, Seq[Any])] = None,
    filters: Array[Filter] = Array(),
    tsRangeFieldName: Option[String] = None,
    quantum: Option[Long] = None,
    query: Option[String] = None,
    readConf: ReadConf = ReadConf())(
    ct: ClassTag[R])
  extends RiakTSRDD[R](
    // The parent takes a SparkContext, so use the one owned by the streaming context.
    sc = ssc.sparkContext,
    connector = connector,
    bucketName = bucketName,
    schema = schema,
    columnNames = columnNames,
    whereConstraints = whereConstraints,
    filters = filters,
    tsRangeFieldName = tsRangeFieldName,
    quantum = quantum,
    query = query,
    readConf = readConf)