org.apache.spark.sql.sources.PrunedFilteredScan Scala Examples
The following examples show how to use org.apache.spark.sql.sources.PrunedFilteredScan.
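Before looking at the project examples, here is a minimal, self-contained sketch of a PrunedFilteredScan relation over an in-memory dataset. The relation, its columns, and its filter handling are illustrative assumptions rather than code from any of the projects below; it only shows where requiredColumns (column pruning) and filters (pushed-down predicates) arrive.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, EqualTo, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Hypothetical relation over an in-memory sequence of (id, name) pairs.
class InMemoryPairsRelation(data: Seq[(Int, String)])(@transient val sqlContext: SQLContext)
  extends BaseRelation with PrunedFilteredScan {

  override def schema: StructType = StructType(Seq(
    StructField("id", IntegerType, nullable = false),
    StructField("name", StringType, nullable = true)))

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Apply the pushed-down filters this source understands; Spark re-applies the rest.
    val kept = data.filter { case (id, name) =>
      filters.forall {
        case EqualTo("id", value)   => id == value
        case EqualTo("name", value) => name == value
        case _                      => true // unsupported filter: keep the row, let Spark filter it later
      }
    }
    // Project only the requested columns, in the requested order.
    val rows = kept.map { case (id, name) =>
      Row.fromSeq(requiredColumns.map {
        case "id"   => id
        case "name" => name
      })
    }
    sqlContext.sparkContext.parallelize(rows)
  }
}

Spark calls buildScan once per scan, passing only the columns referenced by the query and whatever predicates it could translate into Filter objects; returning extra rows is safe because Spark re-evaluates the filters on the result.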
Example 1
Source File: RiakRelation.scala From spark-riak-connector with Apache License 2.0
package org.apache.spark.sql.riak

import com.basho.riak.spark._
import scala.reflect._
import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector}
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import com.basho.riak.spark.util.TSConversionUtil
import com.basho.riak.spark.writer.WriteConf
import com.basho.riak.spark.writer.mapper.SqlDataMapper
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.collection.convert.decorateAsScala._
import com.basho.riak.spark.query.QueryBucketDef

object RiakRelation {
  def apply(bucket: String,
            sqlContext: SQLContext,
            schema: Option[StructType] = None,
            connector: Option[RiakConnector] = None,
            readConf: ReadConf,
            writeConf: WriteConf): RiakRelation = {
    new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)),
      readConf, writeConf, sqlContext, schema)
  }

  def apply(sqlContext: SQLContext, parameters: Map[String, String], schema: Option[StructType]): RiakRelation = {
    val existingConf = sqlContext.sparkContext.getConf
    val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None)
    val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters))
    val readConf = ReadConf(existingConf, parameters)
    val writeConf = WriteConf(existingConf, parameters)
    RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf)
  }
}
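The companion object above only constructs the relation; in practice it is reached through Spark's DataFrame reader, which routes options through the connector's DefaultSource into RiakRelation.apply. The sketch below shows that usage under stated assumptions: the format string is the connector's package name and the table name is a placeholder, neither taken verbatim from the project.

import org.apache.spark.sql.{DataFrame, SQLContext}

// Illustrative usage, assuming the connector's DefaultSource lives in
// org.apache.spark.sql.riak and the load() argument is the Riak TS table name.
def readRiakTable(sqlContext: SQLContext): DataFrame = {
  sqlContext.read
    .format("org.apache.spark.sql.riak") // resolved to the connector's DefaultSource
    .load("ts_weather")                  // hypothetical Riak TS table name
    .select("time", "temperature")       // column pruning -> requiredColumns in buildScan
    .filter("temperature > 10.0")        // predicate pushdown -> filters in buildScan
}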
Example 2
Source File: BEDRelation.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)

  override def schema: org.apache.spark.sql.types.StructType =
    Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  private def getValueFromColumn(colName: String, r: Array[String]): Any = {
    colName match {
      case Columns.CONTIG       => DataQualityFuncs.cleanContig(r(0))
      case Columns.START        => r(1).toInt + 1 // convert interval to 1-based
      case Columns.END          => r(2).toInt
      case Columns.NAME         => if (r.length > 3) Some(r(3)) else None
      case Columns.SCORE        => if (r.length > 4) Some(r(4).toInt) else None
      case Columns.STRAND       => if (r.length > 5) Some(r(5)) else None
      case Columns.THICK_START  => if (r.length > 6) Some(r(6).toInt) else None
      case Columns.THICK_END    => if (r.length > 7) Some(r(7).toInt) else None
      case Columns.ITEM_RGB     => if (r.length > 8) Some(r(8).split(",").map(_.toInt)) else None
      case Columns.BLOCK_COUNT  => if (r.length > 9) Some(r(9).toInt) else None
      case Columns.BLOCK_SIZES  => if (r.length > 10) Some(r(10).split(",").map(_.toInt)) else None
      case Columns.BLOCK_STARTS => if (r.length > 11) Some(r(11).split(",").map(_.toInt)) else None
      case _                    => throw new Exception(s"Unknown column found: ${colName}")
    }
  }

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    sqlContext
      .sparkContext
      .textFile(path)
      .filter(!_.toLowerCase.startsWith("track"))
      .filter(!_.toLowerCase.startsWith("browser"))
      .map(_.split("\t"))
      .map(r => {
        val record = new Array[Any](requiredColumns.length)
        for (i <- 0 to requiredColumns.length - 1) {
          record(i) = getValueFromColumn(requiredColumns(i), r)
        }
        Row.fromSeq(record)
      })
  }
}
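For illustration, this relation can also be driven by hand, which makes the pruning parameters explicit (the relation accepts pushed-down filters but does not apply them, so Spark evaluates them afterwards). The file path below is a placeholder, and in the project the relation would normally be created through its data source rather than instantiated directly.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.Filter
import org.biodatageeks.sequila.datasources.BED.BEDRelation
import org.biodatageeks.sequila.utils.Columns

object BEDScanExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("bed-scan").getOrCreate()
    // Hypothetical BED file path; only three columns are requested, so the
    // relation materializes just CONTIG, START, and END for each record.
    val relation = new BEDRelation("/data/targets.bed")(spark.sqlContext)
    val rows = relation.buildScan(
      Array(Columns.CONTIG, Columns.START, Columns.END),
      Array.empty[Filter])
    rows.take(5).foreach(println)
    spark.stop()
  }
}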
Example 3
Source File: HiveAcidRelation.scala From spark-acid with Apache License 2.0
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
  extends BaseRelation
    with InsertableRelation
    with PrunedFilteredScan
    with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession, hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // sql insert into and overwrite
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause, notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should it be true / false. Recommendation seems to
  // be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
}
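As a usage sketch, this relation is normally reached through the data source's short format name rather than constructed directly; spark-acid documents the "HiveAcid" format with a "table" option, though the table name below is only a placeholder.

import org.apache.spark.sql.SparkSession

object HiveAcidReadExample {
  def main(args: Array[String]): Unit = {
    // Assumes the spark-acid jar is on the classpath and Hive support is enabled.
    val spark = SparkSession.builder()
      .appName("hive-acid-read")
      .enableHiveSupport()
      .getOrCreate()

    val df = spark.read
      .format("HiveAcid")                     // resolved to the spark-acid data source, which builds a HiveAcidRelation
      .option("table", "default.acid_events") // hypothetical fully qualified table name
      .load()

    // select() and filter() arrive in buildScan as requiredColumns and filters.
    df.select("id").filter("id > 100").show()

    spark.stop()
  }
}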