org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils Scala Examples
The following examples show how to use org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.
Each example is taken from an open-source project; the source file, the project it comes from, and the project's license are noted above each listing.
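Before the individual listings, here is a minimal sketch of the pattern most of these examples share: a custom JdbcDialect answers getJDBCType for the handful of types it cares about and falls back to JdbcUtils.getCommonJDBCType for everything else. The dialect object and the "jdbc:mydb" URL prefix below are made up for illustration; they are not part of any of the projects listed here.

import java.sql.Types

import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, JdbcType}
import org.apache.spark.sql.types._

// Hypothetical dialect, shown only to illustrate the fallback pattern.
object MyDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    // Database-specific mapping for the types we care about ...
    case StringType => Some(JdbcType("TEXT", Types.VARCHAR))
    // ... and Spark's shared defaults for everything else.
    case other => JdbcUtils.getCommonJDBCType(other)
  }
}

// Register the dialect so Spark uses it for matching JDBC URLs.
JdbcDialects.registerDialect(MyDialect)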
Example 1
Source File: PostgresDialect.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.REAL) {
      Some(FloatType)
    } else if (sqlType == Types.SMALLINT) {
      Some(ShortType)
    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      Some(StringType)
    } else if (sqlType == Types.ARRAY) {
      val scale = md.build.getLong("scale").toInt
      // postgres array type names start with underscore
      toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
    } else None
  }

  private def toCatalystType(
      typeName: String,
      precision: Int,
      scale: Int): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" | "decimal" => Some(DecimalType.bounded(precision, scale))
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
    case t: DecimalType =>
      Some(JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(true)
}
Example 2
Source File: PostgresDialect.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.REAL) {
      Some(FloatType)
    } else if (sqlType == Types.SMALLINT) {
      Some(ShortType)
    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      Some(StringType)
    } else if (sqlType == Types.ARRAY) {
      val scale = md.build.getLong("scale").toInt
      // postgres array type names start with underscore
      toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
    } else None
  }

  private def toCatalystType(
      typeName: String,
      precision: Int,
      scale: Int): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" | "decimal" if precision > 0 => Some(DecimalType.bounded(precision, scale))
    case "numeric" | "decimal" =>
      // SPARK-26538: handle numeric without explicit precision and scale.
      Some(DecimalType.SYSTEM_DEFAULT)
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
    case t: DecimalType =>
      Some(JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)

  override def getTruncateQuery(
      table: String,
      cascade: Option[Boolean] = isCascadingTruncateTable): String = {
    cascade match {
      case Some(true) => s"TRUNCATE TABLE ONLY $table CASCADE"
      case _ => s"TRUNCATE TABLE ONLY $table"
    }
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }
}
Example 3
Source File: PostgresDialect.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.REAL) {
      Some(FloatType)
    } else if (sqlType == Types.SMALLINT) {
      Some(ShortType)
    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      Some(StringType)
    } else if (sqlType == Types.ARRAY) {
      val scale = md.build.getLong("scale").toInt
      // postgres array type names start with underscore
      toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
    } else None
  }

  private def toCatalystType(
      typeName: String,
      precision: Int,
      scale: Int): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" | "decimal" => Some(DecimalType.bounded(precision, scale))
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
    case t: DecimalType =>
      Some(JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(true)
}
Example 4
Source File: MemsqlDialect.scala From memsql-spark-connector with Apache License 2.0
package com.memsql.spark

import java.sql.Types

import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcType}
import org.apache.spark.sql.types._

case object MemsqlDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:memsql")

  val MEMSQL_DECIMAL_MAX_SCALE = 30

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case BooleanType => Option(JdbcType("BOOL", Types.BOOLEAN))
    case ByteType => Option(JdbcType("TINYINT", Types.TINYINT))
    case ShortType => Option(JdbcType("SMALLINT", Types.SMALLINT))
    case FloatType => Option(JdbcType("FLOAT", Types.FLOAT))
    case TimestampType => Option(JdbcType("TIMESTAMP(6)", Types.TIMESTAMP))
    case dt: DecimalType if (dt.scale <= MEMSQL_DECIMAL_MAX_SCALE) =>
      Option(JdbcType(s"DECIMAL(${dt.precision}, ${dt.scale})", Types.DECIMAL))
    case dt: DecimalType =>
      throw new IllegalArgumentException(
        s"Too big scale specified(${dt.scale}). MemSQL DECIMAL maximum scale is ${MEMSQL_DECIMAL_MAX_SCALE}")
    case NullType =>
      throw new IllegalArgumentException(
        "No corresponding MemSQL type found for NullType. If you want to use NullType, please write to an already existing MemSQL table.")
    case t => JdbcUtils.getCommonJDBCType(t)
  }

  override def getCatalystType(sqlType: Int,
                               typeName: String,
                               size: Int,
                               md: MetadataBuilder): Option[DataType] = {
    (sqlType, typeName) match {
      case (Types.REAL, "FLOAT") => Option(FloatType)
      case (Types.BIT, "BIT") => Option(BinaryType)
      case (Types.TINYINT, "TINYINT") => Option(ShortType)
      case (Types.SMALLINT, "SMALLINT") => Option(ShortType)
      case (Types.DECIMAL, "DECIMAL") => {
        if (size > DecimalType.MAX_PRECISION) {
          throw new IllegalArgumentException(
            s"DECIMAL precision ${size} exceeds max precision ${DecimalType.MAX_PRECISION}")
        } else {
          Option(DecimalType(size, md.build().getLong("scale").toInt))
        }
      }
      case _ => None
    }
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
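As a quick, assumed illustration (not part of the connector) of how the dialect above resolves types: the MemSQL-specific cases win, and anything not listed falls through to JdbcUtils.getCommonJDBCType.

import org.apache.spark.sql.types._

MemsqlDialect.getJDBCType(ShortType).map(_.databaseTypeDefinition)      // Some("SMALLINT")
MemsqlDialect.getJDBCType(TimestampType).map(_.databaseTypeDefinition)  // Some("TIMESTAMP(6)")
// StringType is not handled explicitly, so it falls back to the common JDBC mapping
// ("TEXT" in recent Spark versions).
MemsqlDialect.getJDBCType(StringType).map(_.databaseTypeDefinition)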
Example 5
Source File: MemsqlRDD.scala From memsql-spark-connector with Apache License 2.0
package com.memsql.spark

import java.sql.{Connection, PreparedStatement, ResultSet}

import com.memsql.spark.SQLGen.VariableList
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

case class MemsqlRDD(query: String,
                     variables: VariableList,
                     options: MemsqlOptions,
                     schema: StructType,
                     expectedOutput: Seq[Attribute],
                     @transient val sc: SparkContext)
    extends RDD[Row](sc, Nil) {

  override protected def getPartitions: Array[Partition] =
    MemsqlQueryHelpers.GetPartitions(options, query, variables)

  override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = {
    var closed = false
    var rs: ResultSet = null
    var stmt: PreparedStatement = null
    var conn: Connection = null
    var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition]

    def tryClose(name: String, what: AutoCloseable): Unit = {
      try {
        if (what != null) { what.close() }
      } catch {
        case e: Exception => logWarning(s"Exception closing $name", e)
      }
    }

    def close(): Unit = {
      if (closed) { return }
      tryClose("resultset", rs)
      tryClose("statement", stmt)
      tryClose("connection", conn)
      closed = true
    }

    context.addTaskCompletionListener { context =>
      close()
    }

    conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)()
    stmt = conn.prepareStatement(partition.query)
    JdbcHelpers.fillStatement(stmt, partition.variables)
    rs = stmt.executeQuery()

    var rowsIter = JdbcUtils.resultSetToRows(rs, schema)

    if (expectedOutput.nonEmpty) {
      val schemaDatatypes = schema.map(_.dataType)
      val expectedDatatypes = expectedOutput.map(_.dataType)

      if (schemaDatatypes != expectedDatatypes) {
        val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map {
          case ((_: StringType, _: NullType), _) => ((_: Row) => null)
          case ((_: ShortType, _: BooleanType), i) => ((r: Row) => r.getShort(i) != 0)
          case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0)
          case ((_: LongType, _: BooleanType), i) => ((r: Row) => r.getLong(i) != 0)
          case ((l, r), i) => {
            options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}")
            ((r: Row) => r.get(i))
          }
        }

        rowsIter = rowsIter.map(row => Row.fromSeq(columnEncoders.map(_(row))))
      }
    }

    CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close)
  }
}
Example 6
Source File: JdbcUtil.scala From bahir with Apache License 2.0
package org.apache.bahir.sql.streaming.jdbc

import java.sql.{Connection, PreparedStatement}
import java.util.Locale

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcType}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

object JdbcUtil {

  def getJdbcType(dt: DataType, dialect: JdbcDialect): JdbcType = {
    dialect.getJDBCType(dt).orElse(JdbcUtils.getCommonJDBCType(dt)).getOrElse(
      throw new IllegalArgumentException(s"Can't get JDBC type for ${dt.simpleString}"))
  }

  // A `JDBCValueSetter` is responsible for setting a value from `Row` into a field for
  // `PreparedStatement`. The last argument `Int` means the index for the value to be set
  // in the SQL statement and also used for the value in `Row`.
  type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit

  def makeSetter(
      conn: Connection,
      dialect: JdbcDialect,
      dataType: DataType): JDBCValueSetter = dataType match {
    case IntegerType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setInt(pos + 1, row.getInt(pos))
    case LongType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setLong(pos + 1, row.getLong(pos))
    case DoubleType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setDouble(pos + 1, row.getDouble(pos))
    case FloatType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setFloat(pos + 1, row.getFloat(pos))
    case ShortType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setInt(pos + 1, row.getShort(pos))
    case ByteType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setInt(pos + 1, row.getByte(pos))
    case BooleanType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setBoolean(pos + 1, row.getBoolean(pos))
    case StringType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        val strValue = row.get(pos) match {
          case str: UTF8String => str.toString
          case str: String => str
        }
        stmt.setString(pos + 1, strValue)
    case BinaryType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setBytes(pos + 1, row.getAs[Array[Byte]](pos))
    case TimestampType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setTimestamp(pos + 1, row.getAs[java.sql.Timestamp](pos))
    case DateType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setDate(pos + 1, row.getAs[java.sql.Date](pos))
    case t: DecimalType =>
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        stmt.setBigDecimal(pos + 1, row.getDecimal(pos))
    case ArrayType(et, _) =>
      // remove type length parameters from end of type name
      val typeName = getJdbcType(et, dialect).databaseTypeDefinition
        .toLowerCase(Locale.ROOT).split("\\(")(0)
      (stmt: PreparedStatement, row: Row, pos: Int) =>
        val array = conn.createArrayOf(typeName, row.getSeq[AnyRef](pos).toArray)
        stmt.setArray(pos + 1, array)
    case _ =>
      (_: PreparedStatement, _: Row, pos: Int) =>
        throw new IllegalArgumentException(s"Can't translate non-null value for field $pos")
  }
}
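A hedged usage sketch, not part of the Bahir source: given a connection and a row schema, one setter is built per column and used to bind a Row into an INSERT statement. The insertRow helper, table name and JDBC URL below are assumptions for illustration, and null values are not handled (the fallback setter above only covers non-null values).

import java.sql.Connection

import org.apache.spark.sql.Row
import org.apache.spark.sql.jdbc.JdbcDialects
import org.apache.spark.sql.types.StructType

// Hypothetical helper: bind one Row into "INSERT INTO <table> VALUES (?, ?, ...)".
def insertRow(conn: Connection, table: String, schema: StructType, row: Row): Unit = {
  val dialect = JdbcDialects.get("jdbc:postgresql://localhost/db")  // assumed URL
  val placeholders = schema.fields.map(_ => "?").mkString(", ")
  val stmt = conn.prepareStatement(s"INSERT INTO $table VALUES ($placeholders)")
  val setters = schema.fields.map(f => JdbcUtil.makeSetter(conn, dialect, f.dataType))
  try {
    // Setter positions are 0-based; each setter adds 1 for the JDBC parameter index.
    setters.zipWithIndex.foreach { case (set, i) => set(stmt, row, i) }
    stmt.executeUpdate()
  } finally {
    stmt.close()
  }
}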
Example 7
Source File: PostgresDialect.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.REAL) {
      Some(FloatType)
    } else if (sqlType == Types.SMALLINT) {
      Some(ShortType)
    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      Some(StringType)
    } else if (sqlType == Types.ARRAY) {
      val scale = md.build.getLong("scale").toInt
      // postgres array type names start with underscore
      toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
    } else None
  }

  private def toCatalystType(
      typeName: String,
      precision: Int,
      scale: Int): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" | "decimal" => Some(DecimalType.bounded(precision, scale))
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
    case t: DecimalType =>
      Some(JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(true)
}
Example 8
Source File: PostgresDialect.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.REAL) {
      Some(FloatType)
    } else if (sqlType == Types.SMALLINT) {
      Some(ShortType)
    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      Some(StringType)
    } else if (sqlType == Types.ARRAY) {
      val scale = md.build.getLong("scale").toInt
      // postgres array type names start with underscore
      toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
    } else None
  }

  private def toCatalystType(
      typeName: String,
      precision: Int,
      scale: Int): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" | "decimal" => Some(DecimalType.bounded(precision, scale))
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
    case t: DecimalType =>
      Some(JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def getTruncateQuery(table: String): String = {
    s"TRUNCATE TABLE ONLY $table"
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
Example 9
Source File: PostgresDialect.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.{Connection, Types}

import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
import org.apache.spark.sql.types._

private object PostgresDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
      Some(BinaryType)
    } else if (sqlType == Types.OTHER) {
      toCatalystType(typeName).filter(_ == StringType)
    } else if (sqlType == Types.ARRAY && typeName.length > 1 && typeName(0) == '_') {
      toCatalystType(typeName.drop(1)).map(ArrayType(_))
    } else None
  }

  // TODO: support more type names.
  private def toCatalystType(typeName: String): Option[DataType] = typeName match {
    case "bool" => Some(BooleanType)
    case "bit" => Some(BinaryType)
    case "int2" => Some(ShortType)
    case "int4" => Some(IntegerType)
    case "int8" | "oid" => Some(LongType)
    case "float4" => Some(FloatType)
    case "money" | "float8" => Some(DoubleType)
    case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" =>
      Some(StringType)
    case "bytea" => Some(BinaryType)
    case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType)
    case "date" => Some(DateType)
    case "numeric" => Some(DecimalType.SYSTEM_DEFAULT)
    case _ => None
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case StringType => Some(JdbcType("TEXT", Types.CHAR))
    case BinaryType => Some(JdbcType("BYTEA", Types.BINARY))
    case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
    case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
    case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
    case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
      getJDBCType(et).map(_.databaseTypeDefinition)
        .orElse(JdbcUtils.getCommonJDBCType(et).map(_.databaseTypeDefinition))
        .map(typeName => JdbcType(s"$typeName[]", java.sql.Types.ARRAY))
    case ByteType => throw new IllegalArgumentException(s"Unsupported type in postgresql: $dt")
    case _ => None
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = {
    super.beforeFetch(connection, properties)

    // According to the postgres jdbc documentation we need to be in autocommit=false if we
    // actually want to have fetchsize be non 0 (all the rows). This allows us to not have to
    // cache all the rows inside the driver when fetching.
    //
    // See: https://jdbc.postgresql.org/documentation/head/query.html#query-with-cursor
    //
    if (properties.getOrElse("fetchsize", "0").toInt > 0) {
      connection.setAutoCommit(false)
    }
  }
}