org.apache.spark.sql.sources.Filter Scala Examples
The following examples show how to use org.apache.spark.sql.sources.Filter.
Each example notes its source file, the project it comes from, and that project's license.
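Before diving into the examples, it helps to keep the shape of the API in mind: Filter is a small algebra of predicate case classes (EqualTo, GreaterThan, IsNotNull, And, Or, ...) that Spark hands to a data source through interfaces such as PrunedFilteredScan; the source translates the predicates it understands into its native query language and leaves the rest for Spark to re-evaluate after the scan. The sketch below is illustrative only; the toPredicate name and the SQL-ish output format are not taken from any of the projects listed here.

import org.apache.spark.sql.sources._

// Minimal sketch: map the standard filters a source understands to a textual
// predicate; anything unrecognized yields None and is evaluated by Spark
// after the scan.
def toPredicate(filter: Filter): Option[String] = filter match {
  case EqualTo(attr, value)     => Some(s"$attr = '$value'")
  case GreaterThan(attr, value) => Some(s"$attr > '$value'")
  case LessThan(attr, value)    => Some(s"$attr < '$value'")
  case IsNotNull(attr)          => Some(s"$attr IS NOT NULL")
  case And(left, right) =>
    for (l <- toPredicate(left); r <- toPredicate(right)) yield s"($l AND $r)"
  case _ => None
}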
Example 1
Source File: CarbonBoundReference.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter

case class CastExpr(expr: Expression) extends Filter {
  override def references: Array[String] = null
}

case class FalseExpr() extends Filter {
  override def references: Array[String] = null
}

case class CarbonEndsWith(expr: Expression) extends Filter {
  override def references: Array[String] = null
}

case class CarbonContainsWith(expr: Expression) extends Filter {
  override def references: Array[String] = null
}
Example 2
Source File: DataFrameReaderFunctions.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {

  // Note: `source` and `N1QLRelation` are defined elsewhere in this package
  // and are not shown in this excerpt.
  private def buildFrame(options: Map[String, String] = null,
                         schema: StructType = null,
                         schemaFilter: Option[Filter] = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    val filter = schemaFilter.map(N1QLRelation.filterToExpression)
    if (filter.isDefined) {
      builder.option("schemaFilter", filter.get)
    }

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }

}
Example 3
Source File: HiveAcidRelation.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
  extends BaseRelation
    with InsertableRelation
    with PrunedFilteredScan
    with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
    hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // sql insert into and overwrite
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause,
      notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should it be true / false. Recommendation seems to
  //  be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
}
Example 4
Source File: BEDRelation.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)

  override def schema: org.apache.spark.sql.types.StructType =
    Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  private def getValueFromColumn(colName: String, r: Array[String]): Any = {
    colName match {
      case Columns.CONTIG       => DataQualityFuncs.cleanContig(r(0))
      case Columns.START        => r(1).toInt + 1 // Convert interval to 1-based
      case Columns.END          => r(2).toInt
      case Columns.NAME         => if (r.length > 3) Some(r(3)) else None
      case Columns.SCORE        => if (r.length > 4) Some(r(4).toInt) else None
      case Columns.STRAND       => if (r.length > 5) Some(r(5)) else None
      case Columns.THICK_START  => if (r.length > 6) Some(r(6).toInt) else None
      case Columns.THICK_END    => if (r.length > 7) Some(r(7).toInt) else None
      case Columns.ITEM_RGB     => if (r.length > 8) Some(r(8).split(",").map(_.toInt)) else None
      case Columns.BLOCK_COUNT  => if (r.length > 9) Some(r(9).toInt) else None
      case Columns.BLOCK_SIZES  => if (r.length > 10) Some(r(10).split(",").map(_.toInt)) else None
      case Columns.BLOCK_STARTS => if (r.length > 11) Some(r(11).split(",").map(_.toInt)) else None
      case _ => throw new Exception(s"Unknown column found: ${colName}")
    }
  }

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    sqlContext
      .sparkContext
      .textFile(path)
      .filter(!_.toLowerCase.startsWith("track"))
      .filter(!_.toLowerCase.startsWith("browser"))
      .map(_.split("\t"))
      .map(r => {
        val record = new Array[Any](requiredColumns.length)
        for (i <- 0 to requiredColumns.length - 1) {
          record(i) = getValueFromColumn(requiredColumns(i), r)
        }
        Row.fromSeq(record)
      })
  }

}
Example 5
Source File: MetastoreIndexSuite.scala From parquet-index with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import com.github.lightcopy.testutil.UnitTestSuite
import com.github.lightcopy.testutil.implicits._

// Test catalog to check internal methods
private[datasources] class TestIndex extends MetastoreIndex {
  private var internalIndexFilters: Seq[Filter] = Nil
  override def tablePath(): Path = ???
  override def partitionSchema: StructType = ???
  override def indexSchema: StructType = ???
  override def dataSchema: StructType = ???
  override def setIndexFilters(filters: Seq[Filter]) = {
    internalIndexFilters = filters
  }
  override def indexFilters: Seq[Filter] = internalIndexFilters
  override def listFilesWithIndexSupport(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression],
      indexFilters: Seq[Filter]): Seq[PartitionDirectory] = ???
  override def inputFiles: Array[String] = ???
  override def sizeInBytes: Long = ???
}

class MetastoreIndexSuite extends UnitTestSuite {
  test("provide sequence of path based on table path") {
    val catalog = new TestIndex() {
      override def tablePath(): Path = new Path("test")
    }

    catalog.rootPaths should be (Seq(new Path("test")))
  }

  test("when using listFiles directly supply empty index filter") {
    var indexSeq: Seq[Filter] = null
    var filterSeq: Seq[Expression] = null
    val catalog = new TestIndex() {
      override def listFilesWithIndexSupport(
          partitionFilters: Seq[Expression],
          dataFilters: Seq[Expression],
          indexFilters: Seq[Filter]): Seq[PartitionDirectory] = {
        indexSeq = indexFilters
        filterSeq = partitionFilters
        Seq.empty
      }
    }

    catalog.listFiles(Seq.empty, Seq.empty)
    indexSeq should be (Nil)
    filterSeq should be (Nil)
  }

  test("refresh should be no-op by default") {
    val catalog = new TestIndex()
    catalog.refresh()
  }
}
Example 6
Source File: MongodbRDDIterator.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb.rdd

import com.mongodb.casbah.Imports._
import com.stratio.datasource.mongodb.query.FilterSection
import com.stratio.datasource.mongodb.reader.MongodbReader
import com.stratio.datasource.util.Config
import org.apache.spark._
import org.apache.spark.sql.sources.Filter

class MongodbRDDIterator(
    taskContext: TaskContext,
    partition: Partition,
    config: Config,
    requiredColumns: Array[String],
    filters: FilterSection)
  extends Iterator[DBObject] {

  private var closed = false
  private var initialized = false

  lazy val reader = {
    initialized = true
    initReader()
  }

  // Register an on-task-completion callback to close the input stream.
  taskContext.addTaskCompletionListener((context: TaskContext) => closeIfNeeded())

  override def hasNext: Boolean = {
    !closed && reader.hasNext
  }

  override def next(): DBObject = {
    if (!hasNext) {
      throw new NoSuchElementException("End of stream")
    }
    reader.next()
  }

  def closeIfNeeded(): Unit = {
    if (!closed) {
      closed = true
      close()
    }
  }

  protected def close(): Unit = {
    if (initialized) {
      reader.close()
      initialized = false
    }
  }

  def initReader() = {
    val reader = new MongodbReader(config, requiredColumns, filters)
    reader.init(partition)
    reader
  }
}
Example 7
Source File: customFilters.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb.sources

import org.apache.spark.sql.sources.Filter

trait GeoFilter extends Filter {
  val attribute: String
  val maxDistance: Option[Double]
}

case class Near(
    attribute: String,
    x: Double,
    y: Double,
    maxDistance: Option[Double] = None
) extends GeoFilter

case class NearSphere(
    attribute: String,
    longitude: Double,
    latitude: Double,
    maxDistance: Option[Double] = None
) extends GeoFilter
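Because these custom geo filters are plain case classes, a relation can pattern match on them when it builds its MongoDB query. A minimal, hypothetical sketch follows; the describe helper is not part of the connector and only shows which Mongo geo operator each filter corresponds to.

import com.stratio.datasource.mongodb.sources.{GeoFilter, Near, NearSphere}

// Hypothetical helper: report the Mongo geo operator each filter maps to.
def describe(f: GeoFilter): String = f match {
  case Near(attr, x, y, max)           => s"$attr -> $$near($x, $y), maxDistance=$max"
  case NearSphere(attr, lon, lat, max) => s"$attr -> $$nearSphere($lon, $lat), maxDistance=$max"
}

println(describe(Near("location", 40.0, -73.9)))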
Example 8
Source File: DeltaSourceUtils.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources

import java.util.Locale

import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources
import org.apache.spark.sql.sources.Filter

object DeltaSourceUtils {
  val NAME = "delta"
  val ALT_NAME = "delta"

  // Batch relations don't pass partitioning columns to `CreatableRelationProvider`s, therefore
  // as a hack, we pass in the partitioning columns among the options.
  val PARTITIONING_COLUMNS_KEY = "__partition_columns"

  def isDeltaDataSourceName(name: String): Boolean = {
    name.toLowerCase(Locale.ROOT) == NAME || name.toLowerCase(Locale.ROOT) == ALT_NAME
  }

  // Note: `createLiteral` is defined elsewhere in the original file and is not
  // shown in this excerpt.
  def translateFilters(filters: Array[Filter]): Expression = filters.map {
    case sources.EqualTo(attribute, value) =>
      expressions.EqualTo(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.EqualNullSafe(attribute, value) =>
      expressions.EqualNullSafe(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.GreaterThan(attribute, value) =>
      expressions.GreaterThan(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.GreaterThanOrEqual(attribute, value) =>
      expressions.GreaterThanOrEqual(
        UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.LessThan(attribute, value) =>
      expressions.LessThan(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.LessThanOrEqual(attribute, value) =>
      expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value))
    case sources.In(attribute, values) =>
      expressions.In(UnresolvedAttribute(attribute), values.map(createLiteral))
    case sources.IsNull(attribute) => expressions.IsNull(UnresolvedAttribute(attribute))
    case sources.IsNotNull(attribute) => expressions.IsNotNull(UnresolvedAttribute(attribute))
    case sources.Not(otherFilter) => expressions.Not(translateFilters(Array(otherFilter)))
    case sources.And(filter1, filter2) =>
      expressions.And(translateFilters(Array(filter1)), translateFilters(Array(filter2)))
    case sources.Or(filter1, filter2) =>
      expressions.Or(translateFilters(Array(filter1)), translateFilters(Array(filter2)))
    case sources.StringStartsWith(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"${value}%"))
    case sources.StringEndsWith(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}"))
    case sources.StringContains(attribute, value) =>
      new expressions.Like(
        UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}%"))
    case sources.AlwaysTrue() => expressions.Literal.TrueLiteral
    case sources.AlwaysFalse() => expressions.Literal.FalseLiteral
  }.reduce(expressions.And)
}
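A minimal usage sketch, assuming the object above is on the classpath: translateFilters folds an array of source filters into a single Catalyst expression by AND-ing the translated predicates over unresolved attributes.

import org.apache.spark.sql.sources.{EqualTo, GreaterThan}

// Yields the Catalyst expression ('id = 1) AND ('ts > 100).
val condition = DeltaSourceUtils.translateFilters(
  Array[org.apache.spark.sql.sources.Filter](EqualTo("id", 1), GreaterThan("ts", 100)))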
Example 9
Source File: TableIndexConnector.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.connector

import com.amazonaws.services.dynamodbv2.document.spec.ScanSpec
import com.amazonaws.services.dynamodbv2.document.{ItemCollection, ScanOutcome}
import com.amazonaws.services.dynamodbv2.model.ReturnConsumedCapacity
import com.amazonaws.services.dynamodbv2.xspec.ExpressionSpecBuilder
import org.apache.spark.sql.sources.Filter

import scala.collection.JavaConverters._

private[dynamodb] class TableIndexConnector(tableName: String,
                                            indexName: String,
                                            parallelism: Int,
                                            parameters: Map[String, String])
  extends DynamoConnector with Serializable {

  private val consistentRead = parameters.getOrElse("stronglyConsistentReads", "false").toBoolean
  private val filterPushdown = parameters.getOrElse("filterPushdown", "true").toBoolean
  private val region = parameters.get("region")
  private val roleArn = parameters.get("roleArn")

  override val filterPushdownEnabled: Boolean = filterPushdown

  override val (keySchema, readLimit, itemLimit, totalSegments) = {
    val table = getDynamoDB(region, roleArn).getTable(tableName)
    val indexDesc = table.describe().getGlobalSecondaryIndexes.asScala
      .find(_.getIndexName == indexName).get

    // Key schema.
    val keySchema = KeySchema.fromDescription(indexDesc.getKeySchema.asScala)

    // User parameters.
    val bytesPerRCU = parameters.getOrElse("bytesPerRCU", "4000").toInt
    val maxPartitionBytes = parameters.getOrElse("maxpartitionbytes", "128000000").toInt
    val targetCapacity = parameters.getOrElse("targetCapacity", "1").toDouble
    val readFactor = if (consistentRead) 1 else 2

    // Table parameters.
    val indexSize = indexDesc.getIndexSizeBytes
    val itemCount = indexDesc.getItemCount

    // Partitioning calculation.
    val numPartitions = parameters.get("readpartitions").map(_.toInt).getOrElse({
      val sizeBased = (indexSize / maxPartitionBytes).toInt max 1
      val remainder = sizeBased % parallelism
      if (remainder > 0) sizeBased + (parallelism - remainder)
      else sizeBased
    })

    // Provisioned or on-demand throughput.
    val readThroughput = parameters.getOrElse("throughput",
      Option(indexDesc.getProvisionedThroughput.getReadCapacityUnits)
        .filter(_ > 0).map(_.longValue().toString)
        .getOrElse("100")).toLong

    // Rate limit calculation.
    val avgItemSize = indexSize.toDouble / itemCount
    val readCapacity = readThroughput * targetCapacity
    val rateLimit = readCapacity / parallelism
    val itemLimit = ((bytesPerRCU / avgItemSize * rateLimit).toInt * readFactor) max 1

    (keySchema, rateLimit, itemLimit, numPartitions)
  }

  override def scan(segmentNum: Int,
                    columns: Seq[String],
                    filters: Seq[Filter]): ItemCollection[ScanOutcome] = {
    val scanSpec = new ScanSpec()
      .withSegment(segmentNum)
      .withTotalSegments(totalSegments)
      .withMaxPageSize(itemLimit)
      .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
      .withConsistentRead(consistentRead)

    if (columns.nonEmpty) {
      val xspec = new ExpressionSpecBuilder().addProjections(columns: _*)

      if (filters.nonEmpty && filterPushdown) {
        xspec.withCondition(FilterPushdown(filters))
      }

      scanSpec.withExpressionSpec(xspec.buildForScan())
    }

    getDynamoDB(region, roleArn).getTable(tableName).getIndex(indexName).scan(scanSpec)
  }
}
Example 10
Source File: ScanPartition.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource

import com.amazonaws.services.dynamodbv2.document.Item
import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter
import com.audienceproject.spark.dynamodb.connector.DynamoConnector
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader}
import org.apache.spark.sql.types.{StructField, StructType}

import scala.collection.JavaConverters._

class ScanPartition(schema: StructType,
                    partitionIndex: Int,
                    connector: DynamoConnector,
                    filters: Array[Filter])
  extends InputPartition[InternalRow] {

  private val requiredColumns = schema.map(_.name)

  @transient
  private lazy val typeConversions = schema.collect({
    case StructField(name, dataType, _, _) => name -> TypeConversion(name, dataType)
  }).toMap

  override def createPartitionReader(): InputPartitionReader[InternalRow] = {
    if (connector.isEmpty) new EmptyReader
    else new PartitionReader
  }

  private class EmptyReader extends InputPartitionReader[InternalRow] {
    override def next(): Boolean = false

    override def get(): InternalRow =
      throw new IllegalStateException("Unable to call get() on empty iterator")

    override def close(): Unit = {}
  }

  private class PartitionReader extends InputPartitionReader[InternalRow] {

    private val pageIterator =
      connector.scan(partitionIndex, requiredColumns, filters).pages().iterator().asScala
    private val rateLimiter = RateLimiter.create(connector.readLimit)

    private var innerIterator: Iterator[InternalRow] = Iterator.empty

    private var currentRow: InternalRow = _
    private var proceed = false

    override def next(): Boolean = {
      proceed = true
      innerIterator.hasNext || {
        if (pageIterator.hasNext) {
          nextPage()
          next()
        }
        else false
      }
    }

    override def get(): InternalRow = {
      if (proceed) {
        currentRow = innerIterator.next()
        proceed = false
      }
      currentRow
    }

    override def close(): Unit = {}

    private def nextPage(): Unit = {
      val page = pageIterator.next()
      val result = page.getLowLevelResult
      Option(result.getScanResult.getConsumedCapacity)
        .foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1))
      innerIterator = result.getItems.iterator().asScala.map(itemToRow(requiredColumns))
    }
  }

  private def itemToRow(requiredColumns: Seq[String])(item: Item): InternalRow =
    if (requiredColumns.nonEmpty) {
      InternalRow.fromSeq(requiredColumns.map(columnName => typeConversions(columnName)(item)))
    } else {
      InternalRow.fromSeq(item.asMap().asScala.values.toSeq.map(_.toString))
    }
}
Example 11
Source File: TextMatchUDF.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.index

import org.apache.spark.sql.sources.Filter

import org.apache.carbondata.common.annotations.InterfaceAudience

@InterfaceAudience.Internal
class TextMatchUDF extends ((String) => Boolean) with Serializable {
  override def apply(v1: String): Boolean = {
    v1.length > 0
  }
}

@InterfaceAudience.Internal
class TextMatchMaxDocUDF extends ((String, Int) => Boolean) with Serializable {
  override def apply(v1: String, v2: Int): Boolean = {
    v1.length > 0
  }
}

@InterfaceAudience.Internal
case class TextMatch(queryString: String) extends Filter {
  override def references: Array[String] = null
}

@InterfaceAudience.Internal
case class TextMatchLimit(queryString: String, maxDoc: String) extends Filter {
  override def references: Array[String] = null
}
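The UDF bodies are trivial placeholders because the real matching happens inside CarbonData's index layer; the TextMatch and TextMatchLimit filters are what the pushdown code ultimately constructs. A likely registration path is sketched below; the SQL function names and the spark session value are assumptions, not taken from this file.

// Assumed registration: expose the UDFs to SQL so queries can call them.
spark.udf.register("text_match", new TextMatchUDF)
spark.udf.register("text_match_with_limit", new TextMatchMaxDocUDF)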
Example 12
Source File: InPolygonUDF.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.geo

import org.apache.spark.sql.sources.Filter

import org.apache.carbondata.common.annotations.InterfaceAudience

@InterfaceAudience.Internal
class InPolygonUDF extends (String => Boolean) with Serializable {
  override def apply(v1: String): Boolean = {
    true // Carbon applies the filter, so Spark does not have to apply it again.
  }
}

@InterfaceAudience.Internal
case class InPolygon(queryString: String) extends Filter {
  override def references: Array[String] = null
}
Example 13
Source File: RiakRelation.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.riak

import com.basho.riak.spark._
import scala.reflect._
import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector}
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import com.basho.riak.spark.util.TSConversionUtil
import com.basho.riak.spark.writer.WriteConf
import com.basho.riak.spark.writer.mapper.SqlDataMapper
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.collection.convert.decorateAsScala._
import com.basho.riak.spark.query.QueryBucketDef

// Only the companion object is shown in this excerpt; the RiakRelation class
// it constructs is defined in the same file of the original project.
object RiakRelation {
  def apply(bucket: String,
            sqlContext: SQLContext,
            schema: Option[StructType] = None,
            connector: Option[RiakConnector] = None,
            readConf: ReadConf,
            writeConf: WriteConf): RiakRelation = {

    new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)),
      readConf, writeConf, sqlContext, schema)
  }

  def apply(sqlContext: SQLContext,
            parameters: Map[String, String],
            schema: Option[StructType]): RiakRelation = {
    val existingConf = sqlContext.sparkContext.getConf
    val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None)
    val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters))
    val readConf = ReadConf(existingConf, parameters)
    val writeConf = WriteConf(existingConf, parameters)
    RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf)
  }
}
Example 14
Source File: SequoiadbRDD.scala From spark-sequoiadb with Apache License 2.0 | 5 votes |
package com.sequoiadb.spark.rdd

import org.apache.spark.SparkContext
import _root_.com.sequoiadb.spark.SequoiadbConfig
import com.sequoiadb.spark.partitioner._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.Filter
import org.apache.spark.{Partition, TaskContext}
import org.bson.BSONObject
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.mutable.ArrayBuffer
//import java.io.FileOutputStream;

// The SequoiadbRDD class itself is omitted from this excerpt; the enclosing
// companion-object declaration below is reconstructed from the stray closing
// brace and the `new SequoiadbRDD(...)` call.
object SequoiadbRDD {
  def apply(
      sc: SQLContext,
      config: SequoiadbConfig,
      partitioner: Option[SequoiadbPartitioner] = None,
      requiredColumns: Array[String] = Array(),
      filters: Array[Filter] = Array(),
      queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON,
      queryLimit: Long = -1) = {
    new SequoiadbRDD(
      sc.sparkContext,
      config,
      partitioner,
      requiredColumns,
      filters,
      queryReturnType,
      queryLimit)
  }
}
Example 15
Source File: SequoiadbRDDIterator.scala From spark-sequoiadb with Apache License 2.0 | 5 votes |
package com.sequoiadb.spark.rdd

import _root_.com.sequoiadb.spark.SequoiadbConfig
import _root_.com.sequoiadb.spark.io.SequoiadbReader
import org.apache.spark._
import org.apache.spark.sql.sources.Filter
import org.bson.BSONObject
import org.slf4j.{Logger, LoggerFactory}
//import java.io.FileOutputStream;

class SequoiadbRDDIterator(
    taskContext: TaskContext,
    partition: Partition,
    config: SequoiadbConfig,
    requiredColumns: Array[String],
    filters: Array[Filter],
    queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON,
    queryLimit: Long = -1)
  extends Iterator[BSONObject] {

  private var LOG: Logger = LoggerFactory.getLogger(this.getClass.getName())

  protected var finished = false
  private var closed = false
  private var initialized = false

  lazy val reader = {
    initialized = true
    initReader()
  }

  // Register an on-task-completion callback to close the input stream.
  taskContext.addTaskCompletionListener((context: TaskContext) => closeIfNeeded())

  override def hasNext: Boolean = {
    !finished && reader.hasNext
  }

  override def next(): BSONObject = {
    if (!hasNext) {
      throw new NoSuchElementException("End of stream")
    }
    reader.next()
  }

  def closeIfNeeded(): Unit = {
    if (!closed) {
      close()
      closed = true
    }
  }

  protected def close(): Unit = {
    if (initialized) {
      reader.close()
    }
  }

  def initReader() = {
    val reader = new SequoiadbReader(config, requiredColumns, filters, queryReturnType, queryLimit)
    reader.init(partition)
    reader
  }
}
Example 16
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(
      sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {

      val sqlConf = sparkSession.sessionState.conf
      val enableFilterPushDown = sqlConf.arrowFilterPushDown

      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish()

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)

      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
}
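Because the format registers the short name "arrow" through DataSourceRegister, it can be addressed from the DataFrame reader API. A minimal sketch follows, assuming spark is an active SparkSession with the OAP jars on the classpath and that the dataset has an id column; both are assumptions for illustration.

// Filters on the resulting DataFrame reach buildReaderWithPartitionValues and
// are pushed into the Arrow dataset scan when the Arrow filter-pushdown SQL
// conf is enabled.
val df = spark.read.format("arrow").load("/path/to/arrow-dataset")
df.filter(df("id") > 100).show()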
Example 17
Source File: ArrowScanBuilder.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters}
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

case class ArrowScanBuilder(
    sparkSession: SparkSession,
    fileIndex: PartitioningAwareFileIndex,
    schema: StructType,
    dataSchema: StructType,
    options: CaseInsensitiveStringMap)
  extends FileScanBuilder(sparkSession, fileIndex, dataSchema)
    with SupportsPushDownFilters {

  private var filters: Array[Filter] = Array.empty

  private lazy val pushedArrowFilters: Array[Filter] = {
    filters // todo filter validation & pushdown
  }

  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
    this.filters = filters
    this.filters
  }

  override def pushedFilters: Array[Filter] = pushedArrowFilters

  override def build(): Scan = {
    ArrowScan(
      sparkSession,
      fileIndex,
      readDataSchema(),
      readPartitionSchema(),
      pushedFilters,
      options)
  }
}
Example 18
Source File: ArrowScan.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import scala.collection.JavaConverters._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.connector.read.PartitionReaderFactory
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.SerializableConfiguration

case class ArrowScan(
    sparkSession: SparkSession,
    fileIndex: PartitioningAwareFileIndex,
    readDataSchema: StructType,
    readPartitionSchema: StructType,
    pushedFilters: Array[Filter],
    options: CaseInsensitiveStringMap,
    partitionFilters: Seq[Expression] = Seq.empty,
    dataFilters: Seq[Expression] = Seq.empty)
  extends FileScan {

  override def createReaderFactory(): PartitionReaderFactory = {
    val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap
    val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
    val broadcastedConf =
      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf))
    ArrowPartitionReaderFactory(
      sparkSession.sessionState.conf,
      broadcastedConf,
      readDataSchema,
      readPartitionSchema,
      pushedFilters,
      new ArrowOptions(options.asScala.toMap))
  }

  override def withFilters(partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): FileScan =
    this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
}
Example 19
Source File: TestDataFile.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

private[oap] case class TestDataFile(path: String, schema: StructType, configuration: Configuration)
  extends DataFile {

  override def iterator(
      requiredIds: Array[Int],
      filters: Seq[Filter]): OapCompletionIterator[Any] =
    new OapCompletionIterator(Iterator.empty, {})

  override def iteratorWithRowIds(
      requiredIds: Array[Int],
      rowIds: Array[Int],
      filters: Seq[Filter]): OapCompletionIterator[Any] =
    new OapCompletionIterator(Iterator.empty, {})

  override def totalRows(): Long = 0

  override def getDataFileMeta(): DataFileMeta =
    throw new UnsupportedOperationException

  override def cache(groupId: Int, fiberId: Int): FiberCache =
    throw new UnsupportedOperationException
}
Example 20
Source File: FilterHelper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.parquet.hadoop.ParquetInputFormat

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.ParquetFiltersWrapper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

object FilterHelper {

  def tryToPushFilters(
      sparkSession: SparkSession,
      requiredSchema: StructType,
      filters: Seq[Filter]): Option[FilterPredicate] = {
    tryToPushFilters(sparkSession.sessionState.conf, requiredSchema, filters)
  }

  def tryToPushFilters(
      conf: SQLConf,
      requiredSchema: StructType,
      filters: Seq[Filter]): Option[FilterPredicate] = {
    if (conf.parquetFilterPushDown) {
      filters
        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
        // is used here.
        .flatMap(ParquetFiltersWrapper.createFilter(conf, requiredSchema, _))
        .reduceOption(FilterApi.and)
    } else {
      None
    }
  }

  def setFilterIfExist(configuration: Configuration, pushed: Option[FilterPredicate]): Unit = {
    pushed match {
      case Some(filters) => ParquetInputFormat.setFilterPredicate(configuration, filters)
      case _ => // do nothing
    }
  }
}
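A minimal call-site sketch, assuming spark is an active SparkSession; ParquetFiltersWrapper is OAP-internal, so this only illustrates how the two helpers compose.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.sources.{EqualTo, GreaterThan}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Convert whatever can be converted into a single Parquet FilterPredicate,
// then hand it to the Hadoop configuration used by the Parquet reader.
val requiredSchema = StructType(Seq(StructField("id", IntegerType)))
val pushed = FilterHelper.tryToPushFilters(
  spark, requiredSchema, Seq(EqualTo("id", 1), GreaterThan("id", 0)))
FilterHelper.setFilterIfExist(new Configuration(), pushed)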
Example 21
Source File: SqlBuilderSuiteBase.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.DataType
import org.scalatest.FunSuite

import scala.util.matching.Regex

trait SqlBuilderSuiteBase {
  self: FunSuite =>

  val sqlBuilder: SqlBuilder // scalastyle:ignore

  def testExpressionToSql(sql: String)(expr: Expression): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"expressionToSql: $cleanSql | with $expr") {
      assertResult(cleanSql)(sqlBuilder.expressionToSql(expr))
    }
  }

  def testBuildSelect(sql: String)(i1: SqlLikeRelation, i2: Seq[String], i3: Seq[Filter]): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"buildSelect: $cleanSql | with $i1 $i2 $i3") {
      assertResult(cleanSql)(sqlBuilder.buildSelect(i1, i2, i3))
    }
  }

  def testLogicalPlan(sql: String)(plan: LogicalPlan): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"logical plan: $cleanSql | with $plan") {
      assertResult(cleanSql)(sqlBuilder.logicalPlanToSql(plan))
    }
  }

  def testLogicalPlanInternal(sql: String)(plan: LogicalPlan): Unit = {
    val cleanSql = cleanUpSql(sql)
    test(s"logical plan (internal): $cleanSql | with $plan") {
      assertResult(cleanSql)(sqlBuilder.internalLogicalPlanToSql(plan, noProject = true))
    }
  }

  def testUnsupportedLogicalPlan(plan: LogicalPlan): Unit = {
    test(s"invalid logical plan: $plan") {
      intercept[RuntimeException] {
        sqlBuilder.logicalPlanToSql(plan)
      }
    }
  }

  private def cleanUpSql(q: String): String =
    q.replaceAll("\\s+", " ").trim

  def testUnsupportedLogicalPlanInternal(plan: LogicalPlan): Unit = {
    test(s"invalid logical plan (internal): $plan") {
      intercept[RuntimeException] {
        sqlBuilder.internalLogicalPlanToSql(plan)
      }
    }
  }

  def testGeneratedSqlDataType(expected: String)(dataType: DataType): Unit = {
    test(s"The generated sql type for ${dataType.simpleString} is $expected") {
      val generated = sqlBuilder.typeToSql(dataType)
      assertResult(expected)(generated)
    }
  }
}
Example 22
Source File: ScanAndFilterImplicits.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.{And, Filter, FilterUtils}
import org.apache.spark.sql.types.StructType

import FilterUtils._

// The ScanAndFilterImplicits trait and the method enclosing this fragment are
// omitted from this excerpt; `values`, `scanFunction`, and `validation` are
// defined in that omitted surrounding code.
      values.foldLeft(Seq.empty[Row]) { case (acc, value) =>
        val scanned = scanFunction(value)
        if (validation(scanned)) {
          acc :+ scanned
        } else acc
      }

object ScanAndFilterImplicits extends ScanAndFilterImplicits
Example 23
Source File: RiakTSStreamingRDD.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming

import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.StreamingContext

import scala.reflect.ClassTag

class RiakTSStreamingRDD[R] private[spark](
    ssc: StreamingContext,
    connector: RiakConnector,
    bucketName: String,
    schema: Option[StructType] = None,
    columnNames: Option[Seq[String]] = None,
    whereConstraints: Option[(String, Seq[Any])] = None,
    filters: Array[Filter] = Array(),
    tsRangeFieldName: Option[String] = None,
    quantum: Option[Long] = None,
    query: Option[String] = None,
    readConf: ReadConf = ReadConf())(
    implicit ct: ClassTag[R])
  extends RiakTSRDD[R](
    sc = ssc.sparkContext,
    connector = connector,
    bucketName = bucketName,
    schema = schema,
    columnNames = columnNames,
    whereConstraints = whereConstraints,
    filters = filters,
    tsRangeFieldName = tsRangeFieldName,
    quantum = quantum,
    query = query,
    readConf = readConf)